fix(voice-call): add speed and instructions to OpenAI TTS config schema (#39226)

Merged via squash.

Prepared head SHA: 775e3063b58d4629f59021798ab1c7222ff069d9
Co-authored-by: ademczuk <5212682+ademczuk@users.noreply.github.com>
Co-authored-by: obviyus <22031114+obviyus@users.noreply.github.com>
Reviewed-by: @obviyus
This commit is contained in:
ademczuk
2026-03-11 18:45:48 +01:00
committed by GitHub
parent daf8afc954
commit 8618a711ff
11 changed files with 209 additions and 4 deletions

View File

@@ -535,6 +535,7 @@ Docs: https://docs.openclaw.ai
- Browser/config schema: accept `browser.profiles.*.driver: "openclaw"` while preserving legacy `"clawd"` compatibility in validated config. (#39374; based on #35621) Thanks @gambletan and @ingyukoh.
- Memory flush/bootstrap file protection: restrict memory-flush runs to append-only `read`/`write` tools and route host-side memory appends through root-enforced safe file handles so flush turns cannot overwrite bootstrap files via `exec` or unsafe raw rewrites. (#38574) Thanks @frankekn.
- Mattermost/DM media uploads: resolve bare 26-character Mattermost IDs user-first for direct messages so media sends no longer fail with `403 Forbidden` when targets are configured as unprefixed user IDs. (#29925) Thanks @teconomix.
- Voice-call/OpenAI TTS config parity: add missing `speed`, `instructions`, and `baseUrl` fields to the OpenAI TTS config schema and gate `instructions` to supported models so voice-call overrides validate and route cleanly through core TTS. (#39226) Thanks @ademczuk.
## 2026.3.2

View File

@@ -522,11 +522,22 @@
"apiKey": {
"type": "string"
},
"baseUrl": {
"type": "string"
},
"model": {
"type": "string"
},
"voice": {
"type": "string"
},
"speed": {
"type": "number",
"minimum": 0.25,
"maximum": 4.0
},
"instructions": {
"type": "string"
}
}
},

View File

@@ -1,3 +1,4 @@
import { resolveOpenAITtsInstructions } from "openclaw/plugin-sdk/voice-call";
import { pcmToMulaw } from "../telephony-audio.js";
/**
@@ -110,9 +111,11 @@ export class OpenAITTSProvider {
speed: this.speed,
};
// Attach instructions only when the model supports them (gated by resolveOpenAITtsInstructions)
const effectiveInstructions = trimToUndefined(instructions) ?? this.instructions;
if (effectiveInstructions && this.model.includes("gpt-4o-mini-tts")) {
const effectiveInstructions = resolveOpenAITtsInstructions(
this.model,
trimToUndefined(instructions) ?? this.instructions,
);
if (effectiveInstructions) {
body.instructions = effectiveInstructions;
}

View File

@@ -279,6 +279,31 @@ describe("config plugin validation", () => {
expect(res.ok).toBe(true);
});
it("accepts voice-call OpenAI TTS speed, instructions, and baseUrl config fields", async () => {
const res = validateInSuite({
agents: { list: [{ id: "pi" }] },
plugins: {
enabled: true,
load: { paths: [voiceCallSchemaPluginDir] },
entries: {
"voice-call-schema-fixture": {
config: {
tts: {
openai: {
baseUrl: "http://localhost:8880/v1",
voice: "alloy",
speed: 1.5,
instructions: "Speak in a cheerful tone",
},
},
},
},
},
},
});
expect(res.ok).toBe(true);
});
it("accepts known plugin ids and valid channel/heartbeat enums", async () => {
const res = validateInSuite({
agents: {

View File

@@ -61,6 +61,10 @@ export type TtsConfig = {
baseUrl?: string;
model?: string;
voice?: string;
/** Playback speed (0.25–4.0, default 1.0). */
speed?: number;
/** System-level instructions for the TTS model (gpt-4o-mini-tts only). */
instructions?: string;
};
/** Microsoft Edge (node-edge-tts) configuration. */
edge?: {

View File

@@ -404,6 +404,8 @@ export const TtsConfigSchema = z
baseUrl: z.string().optional(),
model: z.string().optional(),
voice: z.string().optional(),
speed: z.number().min(0.25).max(4).optional(),
instructions: z.string().optional(),
})
.strict()
.optional(),

View File

@@ -0,0 +1,36 @@
import { describe, expect, it } from "vitest";
import { TtsConfigSchema } from "./zod-schema.core.js";
describe("TtsConfigSchema openai speed and instructions", () => {
  // Helper: builds a thunk that parses the given `openai` TTS section,
  // so each assertion can pass it straight to expect(...).toThrow().
  const parseOpenAi = (openai: Record<string, unknown>) => () =>
    TtsConfigSchema.parse({ openai });

  it("accepts speed and instructions in openai section", () => {
    expect(
      parseOpenAi({
        voice: "alloy",
        speed: 1.5,
        instructions: "Speak in a cheerful tone",
      }),
    ).not.toThrow();
  });

  it("rejects out-of-range openai speed", () => {
    // Above the 4.0 maximum enforced by the schema.
    expect(parseOpenAi({ speed: 5.0 })).toThrow();
  });

  it("rejects openai speed below minimum", () => {
    // Below the 0.25 minimum enforced by the schema.
    expect(parseOpenAi({ speed: 0.1 })).toThrow();
  });
});

View File

@@ -7,6 +7,7 @@ export {
TtsModeSchema,
TtsProviderSchema,
} from "../config/zod-schema.core.js";
export { resolveOpenAITtsInstructions } from "../tts/tts-core.js";
export type { GatewayRequestHandlerOptions } from "../gateway/server-methods/types.js";
export {
isRequestBodyLimitError,

View File

@@ -43,6 +43,11 @@ function normalizeOpenAITtsBaseUrl(baseUrl?: string): string {
return trimmed.replace(/\/+$/, "");
}
/**
 * Normalizes an optional string: strips surrounding whitespace and maps
 * empty/whitespace-only/missing input to `undefined`.
 *
 * @param value - The raw string, possibly undefined.
 * @returns The trimmed string when non-empty; otherwise `undefined`.
 */
function trimToUndefined(value?: string): string | undefined {
  if (value === undefined) {
    return undefined;
  }
  const normalized = value.trim();
  return normalized.length > 0 ? normalized : undefined;
}
function requireInRange(value: number, min: number, max: number, label: string): void {
if (!Number.isFinite(value) || value < min || value > max) {
throw new Error(`${label} must be between ${min} and ${max}`);
@@ -383,6 +388,14 @@ export function isValidOpenAIModel(model: string, baseUrl?: string): boolean {
return OPENAI_TTS_MODELS.includes(model as (typeof OPENAI_TTS_MODELS)[number]);
}
/**
 * Resolves the effective TTS `instructions` value for an OpenAI model.
 *
 * Instructions are only honored by gpt-4o-mini-tts model variants (the
 * model id is matched by substring, so dated variants qualify too); for
 * any other model, or when the trimmed instructions are empty, this
 * returns `undefined` so callers omit the field from the request body.
 *
 * @param model - The OpenAI TTS model identifier.
 * @param instructions - Optional raw instructions text.
 * @returns Trimmed instructions for supported models, else `undefined`.
 */
export function resolveOpenAITtsInstructions(
  model: string,
  instructions?: string,
): string | undefined {
  if (!model.includes("gpt-4o-mini-tts")) {
    return undefined;
  }
  const trimmed = instructions?.trim();
  return trimmed ? trimmed : undefined;
}
export function isValidOpenAIVoice(voice: string, baseUrl?: string): voice is OpenAiTtsVoice {
// Allow any voice when using custom endpoint (e.g., Kokoro Chinese voices)
if (isCustomOpenAIEndpoint(baseUrl)) {
@@ -619,10 +632,14 @@ export async function openaiTTS(params: {
baseUrl: string;
model: string;
voice: string;
speed?: number;
instructions?: string;
responseFormat: "mp3" | "opus" | "pcm";
timeoutMs: number;
}): Promise<Buffer> {
const { text, apiKey, baseUrl, model, voice, responseFormat, timeoutMs } = params;
const { text, apiKey, baseUrl, model, voice, speed, instructions, responseFormat, timeoutMs } =
params;
const effectiveInstructions = resolveOpenAITtsInstructions(model, instructions);
if (!isValidOpenAIModel(model, baseUrl)) {
throw new Error(`Invalid model: ${model}`);
@@ -646,6 +663,8 @@ export async function openaiTTS(params: {
input: text,
voice,
response_format: responseFormat,
...(speed != null && { speed }),
...(effectiveInstructions != null && { instructions: effectiveInstructions }),
}),
signal: controller.signal,
});

View File

@@ -57,6 +57,7 @@ const {
OPENAI_TTS_MODELS,
OPENAI_TTS_VOICES,
parseTtsDirectives,
resolveOpenAITtsInstructions,
resolveModelOverridePolicy,
summarizeText,
resolveOutputFormat,
@@ -169,6 +170,20 @@ describe("tts", () => {
});
});
describe("resolveOpenAITtsInstructions", () => {
it("keeps instructions only for gpt-4o-mini-tts variants", () => {
expect(resolveOpenAITtsInstructions("gpt-4o-mini-tts", " Speak warmly ")).toBe(
"Speak warmly",
);
expect(resolveOpenAITtsInstructions("gpt-4o-mini-tts-2025-12-15", "Speak warmly")).toBe(
"Speak warmly",
);
expect(resolveOpenAITtsInstructions("tts-1", "Speak warmly")).toBeUndefined();
expect(resolveOpenAITtsInstructions("tts-1-hd", "Speak warmly")).toBeUndefined();
expect(resolveOpenAITtsInstructions("gpt-4o-mini-tts", " ")).toBeUndefined();
});
});
describe("resolveOutputFormat", () => {
it("selects opus for voice-bubble channels (telegram/feishu/whatsapp) and mp3 for others", () => {
const cases = [
@@ -557,6 +572,84 @@ describe("tts", () => {
});
});
describe("textToSpeechTelephony openai instructions", () => {
const withMockedTelephonyFetch = async (
run: (fetchMock: ReturnType<typeof vi.fn>) => Promise<void>,
) => {
const originalFetch = globalThis.fetch;
const fetchMock = vi.fn(async () => ({
ok: true,
arrayBuffer: async () => new ArrayBuffer(2),
}));
globalThis.fetch = fetchMock as unknown as typeof fetch;
try {
await run(fetchMock);
} finally {
globalThis.fetch = originalFetch;
}
};
it("omits instructions for unsupported speech models", async () => {
const cfg: OpenClawConfig = {
messages: {
tts: {
provider: "openai",
openai: {
apiKey: "test-key",
model: "tts-1",
voice: "alloy",
instructions: "Speak warmly",
},
},
},
};
await withMockedTelephonyFetch(async (fetchMock) => {
const result = await tts.textToSpeechTelephony({
text: "Hello there, friendly caller.",
cfg,
});
expect(result.success).toBe(true);
expect(fetchMock).toHaveBeenCalledTimes(1);
const [, init] = fetchMock.mock.calls[0] as [string, RequestInit];
expect(typeof init.body).toBe("string");
const body = JSON.parse(init.body as string) as Record<string, unknown>;
expect(body.instructions).toBeUndefined();
});
});
it("includes instructions for gpt-4o-mini-tts", async () => {
const cfg: OpenClawConfig = {
messages: {
tts: {
provider: "openai",
openai: {
apiKey: "test-key",
model: "gpt-4o-mini-tts",
voice: "alloy",
instructions: "Speak warmly",
},
},
},
};
await withMockedTelephonyFetch(async (fetchMock) => {
const result = await tts.textToSpeechTelephony({
text: "Hello there, friendly caller.",
cfg,
});
expect(result.success).toBe(true);
expect(fetchMock).toHaveBeenCalledTimes(1);
const [, init] = fetchMock.mock.calls[0] as [string, RequestInit];
expect(typeof init.body).toBe("string");
const body = JSON.parse(init.body as string) as Record<string, unknown>;
expect(body.instructions).toBe("Speak warmly");
});
});
});
describe("maybeApplyTtsToPayload", () => {
const baseCfg: OpenClawConfig = {
agents: { defaults: { model: { primary: "openai/gpt-4o-mini" } } },

View File

@@ -37,6 +37,7 @@ import {
isValidVoiceId,
OPENAI_TTS_MODELS,
OPENAI_TTS_VOICES,
resolveOpenAITtsInstructions,
openaiTTS,
parseTtsDirectives,
scheduleCleanup,
@@ -117,6 +118,8 @@ export type ResolvedTtsConfig = {
baseUrl: string;
model: string;
voice: string;
speed?: number;
instructions?: string;
};
edge: {
enabled: boolean;
@@ -304,6 +307,8 @@ export function resolveTtsConfig(cfg: OpenClawConfig): ResolvedTtsConfig {
).replace(/\/+$/, ""),
model: raw.openai?.model ?? DEFAULT_OPENAI_MODEL,
voice: raw.openai?.voice ?? DEFAULT_OPENAI_VOICE,
speed: raw.openai?.speed,
instructions: raw.openai?.instructions?.trim() || undefined,
},
edge: {
enabled: raw.edge?.enabled ?? true,
@@ -692,6 +697,8 @@ export async function textToSpeech(params: {
baseUrl: config.openai.baseUrl,
model: openaiModelOverride ?? config.openai.model,
voice: openaiVoiceOverride ?? config.openai.voice,
speed: config.openai.speed,
instructions: config.openai.instructions,
responseFormat: output.openai,
timeoutMs: config.timeoutMs,
});
@@ -789,6 +796,8 @@ export async function textToSpeechTelephony(params: {
baseUrl: config.openai.baseUrl,
model: config.openai.model,
voice: config.openai.voice,
speed: config.openai.speed,
instructions: config.openai.instructions,
responseFormat: output.format,
timeoutMs: config.timeoutMs,
});
@@ -961,6 +970,7 @@ export const _test = {
isValidOpenAIModel,
OPENAI_TTS_MODELS,
OPENAI_TTS_VOICES,
resolveOpenAITtsInstructions,
parseTtsDirectives,
resolveModelOverridePolicy,
summarizeText,