Files
openclaw/src/gateway/openai-http.ts
Vincent Koc 9521e61a22 Gateway: follow-up HEIC input image handling (#38146)
* Media: scope HEIC MIME sniffing

* Media: hermeticize HEIC input tests

* Gateway: fix HEIC image budget accounting

* Gateway: add HEIC image budget regression test

* Changelog: note HEIC follow-up fix
2026-03-06 11:53:59 -05:00

620 lines
18 KiB
TypeScript

import { randomUUID } from "node:crypto";
import type { IncomingMessage, ServerResponse } from "node:http";
import { createDefaultDeps } from "../cli/deps.js";
import { agentCommandFromIngress } from "../commands/agent.js";
import type { ImageContent } from "../commands/agent/types.js";
import type { GatewayHttpChatCompletionsConfig } from "../config/types.gateway.js";
import { emitAgentEvent, onAgentEvent } from "../infra/agent-events.js";
import { logWarn } from "../logger.js";
import { estimateBase64DecodedBytes } from "../media/base64.js";
import {
DEFAULT_INPUT_IMAGE_MAX_BYTES,
DEFAULT_INPUT_IMAGE_MIMES,
DEFAULT_INPUT_MAX_REDIRECTS,
DEFAULT_INPUT_TIMEOUT_MS,
extractImageContentFromSource,
normalizeMimeList,
type InputImageLimits,
type InputImageSource,
} from "../media/input-files.js";
import { defaultRuntime } from "../runtime.js";
import { resolveAssistantStreamDeltaText } from "./agent-event-assistant-text.js";
import {
buildAgentMessageFromConversationEntries,
type ConversationEntry,
} from "./agent-prompt.js";
import type { AuthRateLimiter } from "./auth-rate-limit.js";
import type { ResolvedGatewayAuth } from "./auth.js";
import { sendJson, setSseHeaders, writeDone } from "./http-common.js";
import { handleGatewayPostJsonEndpoint } from "./http-endpoint-helpers.js";
import { resolveGatewayRequestContext } from "./http-utils.js";
// Options for the OpenAI-compatible HTTP endpoint handler.
type OpenAiHttpOptions = {
  // Resolved gateway auth used to gate access to /v1/chat/completions.
  auth: ResolvedGatewayAuth;
  // Optional per-endpoint configuration (body size, image budgets, ...).
  config?: GatewayHttpChatCompletionsConfig;
  // Explicit request-body byte cap; falls back to config/default when unset.
  maxBodyBytes?: number;
  // Forwarded to handleGatewayPostJsonEndpoint for client-address resolution.
  trustedProxies?: string[];
  // Forwarded to handleGatewayPostJsonEndpoint; presumably controls whether a
  // real-IP header may be used as a fallback — confirm in http-endpoint-helpers.
  allowRealIpFallback?: boolean;
  // Optional limiter applied by the endpoint helper.
  rateLimiter?: AuthRateLimiter;
};
// Loosely-typed OpenAI chat message; every field is validated at its use site,
// so all properties are `unknown` rather than trusting the wire format.
type OpenAiChatMessage = {
  role?: unknown;
  content?: unknown;
  name?: unknown;
};
// Loosely-typed /v1/chat/completions request body; fields are coerced and
// validated individually in handleOpenAiHttpRequest.
type OpenAiChatCompletionRequest = {
  model?: unknown;
  stream?: unknown;
  messages?: unknown;
  user?: unknown;
};
// Default request-body cap for /v1/chat/completions (20 MiB).
const DEFAULT_OPENAI_CHAT_COMPLETIONS_BODY_BYTES = 20 * 1024 * 1024;
// Placeholder prompt body used when the active user turn contains only images.
const IMAGE_ONLY_USER_MESSAGE = "User sent image(s) with no text.";
// Default cap on the number of image_url parts in the active turn.
const DEFAULT_OPENAI_MAX_IMAGE_PARTS = 8;
// Default cap on the combined estimated decoded size of all images (20 MiB).
const DEFAULT_OPENAI_MAX_TOTAL_IMAGE_BYTES = 20 * 1024 * 1024;
// Baseline per-image limits; remote URLs are disabled unless config opts in.
const DEFAULT_OPENAI_IMAGE_LIMITS: InputImageLimits = {
  allowUrl: false,
  allowedMimes: new Set(DEFAULT_INPUT_IMAGE_MIMES),
  maxBytes: DEFAULT_INPUT_IMAGE_MAX_BYTES,
  maxRedirects: DEFAULT_INPUT_MAX_REDIRECTS,
  timeoutMs: DEFAULT_INPUT_TIMEOUT_MS,
};
// Fully-resolved limit set after merging endpoint config with defaults.
type ResolvedOpenAiChatCompletionsLimits = {
  maxBodyBytes: number;
  maxImageParts: number;
  maxTotalImageBytes: number;
  // Per-image limits forwarded to extractImageContentFromSource.
  images: InputImageLimits;
};
/**
 * Trims each allowlist entry and drops blanks. Returns undefined when the
 * input is missing, empty, or reduces to nothing, so callers can treat
 * "no allowlist configured" uniformly.
 */
function normalizeHostnameAllowlist(values: string[] | undefined): string[] | undefined {
  if (!values?.length) {
    return undefined;
  }
  const cleaned: string[] = [];
  for (const raw of values) {
    const trimmed = raw.trim();
    if (trimmed) {
      cleaned.push(trimmed);
    }
  }
  return cleaned.length ? cleaned : undefined;
}
function resolveOpenAiChatCompletionsLimits(
config: GatewayHttpChatCompletionsConfig | undefined,
): ResolvedOpenAiChatCompletionsLimits {
const imageConfig = config?.images;
return {
maxBodyBytes: config?.maxBodyBytes ?? DEFAULT_OPENAI_CHAT_COMPLETIONS_BODY_BYTES,
maxImageParts:
typeof config?.maxImageParts === "number"
? Math.max(0, Math.floor(config.maxImageParts))
: DEFAULT_OPENAI_MAX_IMAGE_PARTS,
maxTotalImageBytes:
typeof config?.maxTotalImageBytes === "number"
? Math.max(1, Math.floor(config.maxTotalImageBytes))
: DEFAULT_OPENAI_MAX_TOTAL_IMAGE_BYTES,
images: {
allowUrl: imageConfig?.allowUrl ?? DEFAULT_OPENAI_IMAGE_LIMITS.allowUrl,
urlAllowlist: normalizeHostnameAllowlist(imageConfig?.urlAllowlist),
allowedMimes: normalizeMimeList(imageConfig?.allowedMimes, DEFAULT_INPUT_IMAGE_MIMES),
maxBytes: imageConfig?.maxBytes ?? DEFAULT_INPUT_IMAGE_MAX_BYTES,
maxRedirects: imageConfig?.maxRedirects ?? DEFAULT_INPUT_MAX_REDIRECTS,
timeoutMs: imageConfig?.timeoutMs ?? DEFAULT_INPUT_TIMEOUT_MS,
},
};
}
/** Serializes `data` as JSON and writes it as a single SSE `data:` event. */
function writeSse(res: ServerResponse, data: unknown) {
  const serialized = JSON.stringify(data);
  res.write(`data: ${serialized}\n\n`);
}
/**
 * Assembles the ingress payload for agentCommandFromIngress from a resolved
 * prompt plus routing identifiers. Delivery flags are pinned to false because
 * the HTTP gateway returns the response in-band instead of delivering it.
 */
function buildAgentCommandInput(params: {
  prompt: { message: string; extraSystemPrompt?: string; images?: ImageContent[] };
  sessionKey: string;
  runId: string;
  messageChannel: string;
}) {
  const { prompt, sessionKey, runId, messageChannel } = params;
  return {
    message: prompt.message,
    extraSystemPrompt: prompt.extraSystemPrompt,
    images: prompt.images,
    sessionKey,
    runId,
    deliver: false as const,
    messageChannel,
    bestEffortDeliver: false as const,
    // HTTP API callers are authenticated operator clients for this gateway context.
    senderIsOwner: true as const,
  };
}
/** Emits the initial OpenAI streaming chunk that declares the assistant role. */
function writeAssistantRoleChunk(res: ServerResponse, params: { runId: string; model: string }) {
  const chunk = {
    id: params.runId,
    object: "chat.completion.chunk",
    created: Math.floor(Date.now() / 1000),
    model: params.model,
    choices: [{ index: 0, delta: { role: "assistant" } }],
  };
  writeSse(res, chunk);
}
/**
 * Emits one OpenAI streaming content-delta chunk. `finishReason` is "stop"
 * only on the terminal chunk; intermediate deltas pass null.
 */
function writeAssistantContentChunk(
  res: ServerResponse,
  params: { runId: string; model: string; content: string; finishReason: "stop" | null },
) {
  const chunk = {
    id: params.runId,
    object: "chat.completion.chunk",
    created: Math.floor(Date.now() / 1000),
    model: params.model,
    choices: [
      {
        index: 0,
        delta: { content: params.content },
        finish_reason: params.finishReason,
      },
    ],
  };
  writeSse(res, chunk);
}
/** Returns the value as a message array, or an empty array for non-arrays. */
function asMessages(val: unknown): OpenAiChatMessage[] {
  if (!Array.isArray(val)) {
    return [];
  }
  return val as OpenAiChatMessage[];
}
/**
 * Extracts the textual portion of an OpenAI message `content` value.
 * Plain strings pass through; arrays are scanned for `text`/`input_text`
 * parts (plus a raw `input_text` property fallback) and joined with "\n".
 * Anything else yields "".
 */
function extractTextContent(content: unknown): string {
  if (typeof content === "string") {
    return content;
  }
  if (!Array.isArray(content)) {
    return "";
  }
  const pieces: string[] = [];
  for (const part of content) {
    if (!part || typeof part !== "object") {
      continue;
    }
    const { type, text, input_text: inputText } = part as {
      type?: unknown;
      text?: unknown;
      input_text?: unknown;
    };
    if ((type === "text" || type === "input_text") && typeof text === "string") {
      // Empty strings are dropped, mirroring the filter(Boolean) behavior.
      if (text) {
        pieces.push(text);
      }
      continue;
    }
    if (typeof inputText === "string" && inputText) {
      pieces.push(inputText);
    }
  }
  return pieces.join("\n");
}
/**
 * Pulls the URL string out of an OpenAI `image_url` content part. Accepts both
 * the shorthand form (`image_url` is a string) and the object form
 * (`image_url.url`). Blank or malformed values resolve to undefined.
 */
function resolveImageUrlPart(part: unknown): string | undefined {
  if (!part || typeof part !== "object") {
    return undefined;
  }
  const imageUrl = (part as { image_url?: unknown }).image_url;
  const candidate =
    typeof imageUrl === "string"
      ? imageUrl
      : imageUrl && typeof imageUrl === "object"
        ? (imageUrl as { url?: unknown }).url
        : undefined;
  if (typeof candidate !== "string") {
    return undefined;
  }
  const trimmed = candidate.trim();
  return trimmed || undefined;
}
/**
 * Collects every non-empty image URL from a message content array, keeping
 * only parts explicitly typed `image_url`. Non-array content yields no URLs.
 */
function extractImageUrls(content: unknown): string[] {
  if (!Array.isArray(content)) {
    return [];
  }
  return content
    .filter((part): part is object => Boolean(part) && typeof part === "object")
    .filter((part) => (part as { type?: unknown }).type === "image_url")
    .map((part) => resolveImageUrlPart(part))
    .filter((url): url is string => typeof url === "string");
}
// Location of the most recent user/tool turn plus any image URLs it carries.
type ActiveTurnContext = {
  // Index of the newest user or tool message; -1 when none exists.
  activeTurnIndex: number;
  // Index of that turn when it is a user message; -1 otherwise.
  activeUserMessageIndex: number;
  // image_url values from the active user turn (empty for tool turns).
  urls: string[];
};
/**
 * Converts an image_url value into an InputImageSource. `data:` URIs must be
 * base64-encoded and carry a payload (otherwise this throws); anything else is
 * returned as a plain URL source for later fetching under the URL limits.
 */
function parseImageUrlToSource(url: string): InputImageSource {
  const match = /^data:([^,]*?),(.*)$/is.exec(url);
  if (!match) {
    return { type: "url", url };
  }
  const metadata = match[1]?.trim() ?? "";
  const data = match[2] ?? "";
  const metadataParts: string[] = [];
  for (const piece of metadata.split(";")) {
    const trimmed = piece.trim();
    if (trimmed) {
      metadataParts.push(trimmed);
    }
  }
  const hasBase64Flag = metadataParts.some((part) => part.toLowerCase() === "base64");
  if (!hasBase64Flag) {
    throw new Error("image_url data URI must be base64 encoded");
  }
  if (data.trim().length === 0) {
    throw new Error("image_url data URI is missing payload data");
  }
  // The media type is whichever metadata segment looks like "type/subtype".
  const mediaType = metadataParts.find((part) => part.includes("/"));
  return { type: "base64", mediaType, data };
}
/**
 * Walks messages from newest to oldest and locates the "active" turn: the most
 * recent user or tool message (`function` role is normalized to `tool`).
 * Image URLs are only collected when the active turn is a user message; a
 * tool turn yields an empty URL list and no active user index.
 */
function resolveActiveTurnContext(messagesUnknown: unknown): ActiveTurnContext {
  const messages = asMessages(messagesUnknown);
  for (let index = messages.length - 1; index >= 0; index -= 1) {
    const message = messages[index];
    if (!message || typeof message !== "object") {
      continue;
    }
    const rawRole = typeof message.role === "string" ? message.role.trim() : "";
    const role = rawRole === "function" ? "tool" : rawRole;
    if (role === "user") {
      return {
        activeTurnIndex: index,
        activeUserMessageIndex: index,
        urls: extractImageUrls(message.content),
      };
    }
    if (role === "tool") {
      return { activeTurnIndex: index, activeUserMessageIndex: -1, urls: [] };
    }
  }
  return { activeTurnIndex: -1, activeUserMessageIndex: -1, urls: [] };
}
/**
 * Resolves the active turn's image URLs into inline image content while
 * enforcing the request-wide image budgets.
 *
 * Budgeting happens in two phases per image:
 *  1. For base64 data URIs, a pre-flight check using the estimated decoded
 *     size rejects oversized payloads before any extraction work is done.
 *  2. After extraction, the actual payload (whose size may differ from the
 *     raw input) is charged against the running total and the cap re-checked.
 *
 * Throws on any limit violation or malformed URL; the caller maps the error
 * to an HTTP 400.
 */
async function resolveImagesForRequest(
  activeTurnContext: Pick<ActiveTurnContext, "urls">,
  limits: ResolvedOpenAiChatCompletionsLimits,
): Promise<ImageContent[]> {
  const urls = activeTurnContext.urls;
  if (urls.length === 0) {
    return [];
  }
  if (urls.length > limits.maxImageParts) {
    throw new Error(`Too many image_url parts (${urls.length}; limit ${limits.maxImageParts})`);
  }
  const images: ImageContent[] = [];
  let totalBytes = 0;
  for (const url of urls) {
    const source = parseImageUrlToSource(url);
    if (source.type === "base64") {
      // Phase 1: cheap size estimate before decoding/fetching anything.
      const sourceBytes = estimateBase64DecodedBytes(source.data);
      if (totalBytes + sourceBytes > limits.maxTotalImageBytes) {
        throw new Error(
          `Total image payload too large (${totalBytes + sourceBytes}; limit ${limits.maxTotalImageBytes})`,
        );
      }
    }
    const image = await extractImageContentFromSource(source, limits.images);
    // Phase 2: charge the budget with the extracted payload's size, which is
    // what actually gets forwarded to the agent, then re-check the cap.
    totalBytes += estimateBase64DecodedBytes(image.data);
    if (totalBytes > limits.maxTotalImageBytes) {
      throw new Error(
        `Total image payload too large (${totalBytes}; limit ${limits.maxTotalImageBytes})`,
      );
    }
    images.push(image);
  }
  return images;
}
// Exposed for unit tests only; not part of the gateway's public surface.
export const __testOnlyOpenAiHttp = {
  resolveImagesForRequest,
  resolveOpenAiChatCompletionsLimits,
};
/**
 * Flattens an OpenAI message history into a single agent prompt.
 *
 * system/developer messages are collected into `extraSystemPrompt` (joined by
 * blank lines); user/assistant/tool messages become conversation entries.
 * The `function` role is normalized to `tool`, and only the active user turn
 * may substitute the image-only placeholder for an empty body.
 */
function buildAgentPrompt(
  messagesUnknown: unknown,
  activeUserMessageIndex: number,
): {
  message: string;
  extraSystemPrompt?: string;
} {
  const messages = asMessages(messagesUnknown);
  const systemParts: string[] = [];
  const entries: ConversationEntry[] = [];
  messages.forEach((msg, index) => {
    if (!msg || typeof msg !== "object") {
      return;
    }
    const role = typeof msg.role === "string" ? msg.role.trim() : "";
    if (!role) {
      return;
    }
    const text = extractTextContent(msg.content).trim();
    if (role === "system" || role === "developer") {
      if (text) {
        systemParts.push(text);
      }
      return;
    }
    const normalizedRole = role === "function" ? "tool" : role;
    if (normalizedRole !== "user" && normalizedRole !== "assistant" && normalizedRole !== "tool") {
      return;
    }
    // Keep the image-only placeholder scoped to the active user turn so we don't
    // mention historical image-only turns whose bytes are intentionally not replayed.
    const isActiveImageOnlyUserTurn =
      normalizedRole === "user" &&
      !text &&
      index === activeUserMessageIndex &&
      extractImageUrls(msg.content).length > 0;
    const body = isActiveImageOnlyUserTurn ? IMAGE_ONLY_USER_MESSAGE : text;
    if (!body) {
      return;
    }
    let sender: string;
    if (normalizedRole === "assistant") {
      sender = "Assistant";
    } else if (normalizedRole === "user") {
      sender = "User";
    } else {
      const name = typeof msg.name === "string" ? msg.name.trim() : "";
      sender = name ? `Tool:${name}` : "Tool";
    }
    entries.push({ role: normalizedRole, entry: { sender, body } });
  });
  return {
    message: buildAgentMessageFromConversationEntries(entries),
    extraSystemPrompt: systemParts.length > 0 ? systemParts.join("\n\n") : undefined,
  };
}
/** Returns the parsed body as a request object, or {} for non-object values. */
function coerceRequest(val: unknown): OpenAiChatCompletionRequest {
  const isObject = typeof val === "object" && val !== null;
  return isObject ? (val as OpenAiChatCompletionRequest) : {};
}
/**
 * Joins the text of every agent payload with blank lines. Falls back to a
 * fixed message when there are no payloads or all payload texts are empty.
 */
function resolveAgentResponseText(result: unknown): string {
  const fallback = "No response from OpenClaw.";
  const payloads = (result as { payloads?: Array<{ text?: string }> } | null)?.payloads;
  if (!Array.isArray(payloads)) {
    return fallback;
  }
  const parts: string[] = [];
  for (const payload of payloads) {
    if (typeof payload.text === "string" && payload.text) {
      parts.push(payload.text);
    }
  }
  return parts.length > 0 ? parts.join("\n\n") : fallback;
}
/**
 * Handles POST /v1/chat/completions with an OpenAI-compatible contract.
 *
 * Flow: authenticate and parse the JSON body, resolve the active turn's
 * images under the configured budgets, flatten the message history into an
 * agent prompt, then either return one JSON completion (non-streaming) or
 * stream OpenAI-style SSE chunks driven by agent events.
 *
 * Returns false only when the request is not for this endpoint; true in every
 * other case (all error responses are written to `res` here).
 */
export async function handleOpenAiHttpRequest(
  req: IncomingMessage,
  res: ServerResponse,
  opts: OpenAiHttpOptions,
): Promise<boolean> {
  const limits = resolveOpenAiChatCompletionsLimits(opts.config);
  const handled = await handleGatewayPostJsonEndpoint(req, res, {
    pathname: "/v1/chat/completions",
    auth: opts.auth,
    trustedProxies: opts.trustedProxies,
    allowRealIpFallback: opts.allowRealIpFallback,
    rateLimiter: opts.rateLimiter,
    maxBodyBytes: opts.maxBodyBytes ?? limits.maxBodyBytes,
  });
  // `false` means "not this endpoint"; any other falsy value means the helper
  // already wrote a response (auth/rate-limit/body errors).
  if (handled === false) {
    return false;
  }
  if (!handled) {
    return true;
  }
  const payload = coerceRequest(handled.body);
  const stream = Boolean(payload.stream);
  const model = typeof payload.model === "string" ? payload.model : "openclaw";
  const user = typeof payload.user === "string" ? payload.user : undefined;
  const { sessionKey, messageChannel } = resolveGatewayRequestContext({
    req,
    model,
    user,
    sessionPrefix: "openai",
    defaultMessageChannel: "webchat",
    useMessageChannelHeader: true,
  });
  const activeTurnContext = resolveActiveTurnContext(payload.messages);
  const prompt = buildAgentPrompt(payload.messages, activeTurnContext.activeUserMessageIndex);
  let images: ImageContent[] = [];
  try {
    images = await resolveImagesForRequest(activeTurnContext, limits);
  } catch (err) {
    // Image limit/format violations map to a client error, not a 500.
    logWarn(`openai-compat: invalid image_url content: ${String(err)}`);
    sendJson(res, 400, {
      error: {
        message: "Invalid image_url content in `messages`.",
        type: "invalid_request_error",
      },
    });
    return true;
  }
  // A request must carry at least some text or at least one image.
  if (!prompt.message && images.length === 0) {
    sendJson(res, 400, {
      error: {
        message: "Missing user message in `messages`.",
        type: "invalid_request_error",
      },
    });
    return true;
  }
  const runId = `chatcmpl_${randomUUID()}`;
  const deps = createDefaultDeps();
  const commandInput = buildAgentCommandInput({
    prompt: {
      message: prompt.message,
      extraSystemPrompt: prompt.extraSystemPrompt,
      images: images.length > 0 ? images : undefined,
    },
    sessionKey,
    runId,
    messageChannel,
  });
  if (!stream) {
    // Non-streaming: run the agent to completion and return a single
    // chat.completion object. Token usage is not tracked; zeros are reported.
    try {
      const result = await agentCommandFromIngress(commandInput, defaultRuntime, deps);
      const content = resolveAgentResponseText(result);
      sendJson(res, 200, {
        id: runId,
        object: "chat.completion",
        created: Math.floor(Date.now() / 1000),
        model,
        choices: [
          {
            index: 0,
            message: { role: "assistant", content },
            finish_reason: "stop",
          },
        ],
        usage: { prompt_tokens: 0, completion_tokens: 0, total_tokens: 0 },
      });
    } catch (err) {
      logWarn(`openai-compat: chat completion failed: ${String(err)}`);
      sendJson(res, 500, {
        error: { message: "internal error", type: "api_error" },
      });
    }
    return true;
  }
  // Streaming: subscribe to agent events for this runId and translate
  // assistant deltas into OpenAI chunk frames.
  setSseHeaders(res);
  let wroteRole = false;       // role chunk must precede the first content delta
  let sawAssistantDelta = false; // whether any streamed delta was emitted
  let closed = false;          // guards against writes after end/disconnect
  const unsubscribe = onAgentEvent((evt) => {
    if (evt.runId !== runId) {
      return;
    }
    if (closed) {
      return;
    }
    if (evt.stream === "assistant") {
      const content = resolveAssistantStreamDeltaText(evt) ?? "";
      if (!content) {
        return;
      }
      if (!wroteRole) {
        wroteRole = true;
        writeAssistantRoleChunk(res, { runId, model });
      }
      sawAssistantDelta = true;
      writeAssistantContentChunk(res, {
        runId,
        model,
        content,
        finishReason: null,
      });
      return;
    }
    if (evt.stream === "lifecycle") {
      // Agent lifecycle end/error terminates the SSE stream.
      const phase = evt.data?.phase;
      if (phase === "end" || phase === "error") {
        closed = true;
        unsubscribe();
        writeDone(res);
        res.end();
      }
    }
  });
  // Client disconnect: stop listening and suppress any further writes.
  req.on("close", () => {
    closed = true;
    unsubscribe();
  });
  void (async () => {
    try {
      const result = await agentCommandFromIngress(commandInput, defaultRuntime, deps);
      if (closed) {
        return;
      }
      // Fallback: if no deltas streamed, emit the full response as one chunk.
      if (!sawAssistantDelta) {
        if (!wroteRole) {
          wroteRole = true;
          writeAssistantRoleChunk(res, { runId, model });
        }
        const content = resolveAgentResponseText(result);
        sawAssistantDelta = true;
        writeAssistantContentChunk(res, {
          runId,
          model,
          content,
          finishReason: null,
        });
      }
    } catch (err) {
      logWarn(`openai-compat: streaming chat completion failed: ${String(err)}`);
      if (closed) {
        return;
      }
      // NOTE(review): this error chunk may be emitted without a preceding
      // role chunk when the failure happens before any delta — confirm strict
      // OpenAI clients tolerate that.
      writeAssistantContentChunk(res, {
        runId,
        model,
        content: "Error: internal error",
        finishReason: "stop",
      });
      // Broadcast an error lifecycle event so other listeners observe the end.
      emitAgentEvent({
        runId,
        stream: "lifecycle",
        data: { phase: "error" },
      });
    } finally {
      // Close the stream here unless a lifecycle event already did.
      if (!closed) {
        closed = true;
        unsubscribe();
        writeDone(res);
        res.end();
      }
    }
  })();
  return true;
}