fix(voice-call): add speed and instructions to OpenAI TTS config schema (#39226)
Merged via squash. Prepared head SHA: 775e3063b58d4629f59021798ab1c7222ff069d9 Co-authored-by: ademczuk <5212682+ademczuk@users.noreply.github.com> Co-authored-by: obviyus <22031114+obviyus@users.noreply.github.com> Reviewed-by: @obviyus
This commit is contained in:
@@ -535,6 +535,7 @@ Docs: https://docs.openclaw.ai
|
||||
- Browser/config schema: accept `browser.profiles.*.driver: "openclaw"` while preserving legacy `"clawd"` compatibility in validated config. (#39374; based on #35621) Thanks @gambletan and @ingyukoh.
|
||||
- Memory flush/bootstrap file protection: restrict memory-flush runs to append-only `read`/`write` tools and route host-side memory appends through root-enforced safe file handles so flush turns cannot overwrite bootstrap files via `exec` or unsafe raw rewrites. (#38574) Thanks @frankekn.
|
||||
- Mattermost/DM media uploads: resolve bare 26-character Mattermost IDs user-first for direct messages so media sends no longer fail with `403 Forbidden` when targets are configured as unprefixed user IDs. (#29925) Thanks @teconomix.
|
||||
- Voice-call/OpenAI TTS config parity: add missing `speed`, `instructions`, and `baseUrl` fields to the OpenAI TTS config schema and gate `instructions` to supported models so voice-call overrides validate and route cleanly through core TTS. (#39226) Thanks @ademczuk.
|
||||
|
||||
## 2026.3.2
|
||||
|
||||
|
||||
@@ -522,11 +522,22 @@
|
||||
"apiKey": {
|
||||
"type": "string"
|
||||
},
|
||||
"baseUrl": {
|
||||
"type": "string"
|
||||
},
|
||||
"model": {
|
||||
"type": "string"
|
||||
},
|
||||
"voice": {
|
||||
"type": "string"
|
||||
},
|
||||
"speed": {
|
||||
"type": "number",
|
||||
"minimum": 0.25,
|
||||
"maximum": 4.0
|
||||
},
|
||||
"instructions": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
},
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import { resolveOpenAITtsInstructions } from "openclaw/plugin-sdk/voice-call";
|
||||
import { pcmToMulaw } from "../telephony-audio.js";
|
||||
|
||||
/**
|
||||
@@ -110,9 +111,11 @@ export class OpenAITTSProvider {
|
||||
speed: this.speed,
|
||||
};
|
||||
|
||||
// Add instructions if using gpt-4o-mini-tts model
|
||||
const effectiveInstructions = trimToUndefined(instructions) ?? this.instructions;
|
||||
if (effectiveInstructions && this.model.includes("gpt-4o-mini-tts")) {
|
||||
const effectiveInstructions = resolveOpenAITtsInstructions(
|
||||
this.model,
|
||||
trimToUndefined(instructions) ?? this.instructions,
|
||||
);
|
||||
if (effectiveInstructions) {
|
||||
body.instructions = effectiveInstructions;
|
||||
}
|
||||
|
||||
|
||||
@@ -279,6 +279,31 @@ describe("config plugin validation", () => {
|
||||
expect(res.ok).toBe(true);
|
||||
});
|
||||
|
||||
// Regression test for the voice-call plugin schema: the OpenAI TTS section
// must accept `baseUrl`, `speed`, and `instructions` alongside `voice`.
it("accepts voice-call OpenAI TTS speed, instructions, and baseUrl config fields", async () => {
  // OpenAI TTS overrides exercised by this case.
  const openai = {
    baseUrl: "http://localhost:8880/v1",
    voice: "alloy",
    speed: 1.5,
    instructions: "Speak in a cheerful tone",
  };

  const res = validateInSuite({
    agents: { list: [{ id: "pi" }] },
    plugins: {
      enabled: true,
      load: { paths: [voiceCallSchemaPluginDir] },
      entries: {
        "voice-call-schema-fixture": {
          config: {
            tts: { openai },
          },
        },
      },
    },
  });

  expect(res.ok).toBe(true);
});
|
||||
|
||||
it("accepts known plugin ids and valid channel/heartbeat enums", async () => {
|
||||
const res = validateInSuite({
|
||||
agents: {
|
||||
|
||||
@@ -61,6 +61,10 @@ export type TtsConfig = {
|
||||
baseUrl?: string;
|
||||
model?: string;
|
||||
voice?: string;
|
||||
/** Playback speed (0.25–4.0, default 1.0). */
|
||||
speed?: number;
|
||||
/** System-level instructions for the TTS model (gpt-4o-mini-tts only). */
|
||||
instructions?: string;
|
||||
};
|
||||
/** Microsoft Edge (node-edge-tts) configuration. */
|
||||
edge?: {
|
||||
|
||||
@@ -404,6 +404,8 @@ export const TtsConfigSchema = z
|
||||
baseUrl: z.string().optional(),
|
||||
model: z.string().optional(),
|
||||
voice: z.string().optional(),
|
||||
speed: z.number().min(0.25).max(4).optional(),
|
||||
instructions: z.string().optional(),
|
||||
})
|
||||
.strict()
|
||||
.optional(),
|
||||
|
||||
36
src/config/zod-schema.tts.test.ts
Normal file
36
src/config/zod-schema.tts.test.ts
Normal file
@@ -0,0 +1,36 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { TtsConfigSchema } from "./zod-schema.core.js";
|
||||
|
||||
// Covers the `openai.speed` and `openai.instructions` fields of TtsConfigSchema.
describe("TtsConfigSchema openai speed and instructions", () => {
  /** Parses a TTS config containing only the given `openai` section; throws on rejection. */
  const parseOpenAi = (openai: Record<string, unknown>) => TtsConfigSchema.parse({ openai });

  it("accepts speed and instructions in openai section", () => {
    expect(() =>
      parseOpenAi({
        voice: "alloy",
        speed: 1.5,
        instructions: "Speak in a cheerful tone",
      }),
    ).not.toThrow();
  });

  it("accepts openai speed at the inclusive bounds", () => {
    // The schema declares min(0.25) / max(4); both limits are themselves valid.
    expect(() => parseOpenAi({ speed: 0.25 })).not.toThrow();
    expect(() => parseOpenAi({ speed: 4 })).not.toThrow();
  });

  it("rejects out-of-range openai speed", () => {
    expect(() => parseOpenAi({ speed: 5.0 })).toThrow();
  });

  it("rejects openai speed below minimum", () => {
    expect(() => parseOpenAi({ speed: 0.1 })).toThrow();
  });
});
|
||||
@@ -7,6 +7,7 @@ export {
|
||||
TtsModeSchema,
|
||||
TtsProviderSchema,
|
||||
} from "../config/zod-schema.core.js";
|
||||
export { resolveOpenAITtsInstructions } from "../tts/tts-core.js";
|
||||
export type { GatewayRequestHandlerOptions } from "../gateway/server-methods/types.js";
|
||||
export {
|
||||
isRequestBodyLimitError,
|
||||
|
||||
@@ -43,6 +43,11 @@ function normalizeOpenAITtsBaseUrl(baseUrl?: string): string {
|
||||
return trimmed.replace(/\/+$/, "");
|
||||
}
|
||||
|
||||
/**
 * Normalizes an optional string by trimming surrounding whitespace.
 *
 * @param value - Raw string; may be omitted.
 * @returns The trimmed string, or `undefined` when the input is missing,
 *   empty, or whitespace-only.
 */
function trimToUndefined(value?: string): string | undefined {
  const trimmed = value?.trim();
  // An empty string is falsy, so blank input collapses to undefined.
  return trimmed ? trimmed : undefined;
}
|
||||
|
||||
function requireInRange(value: number, min: number, max: number, label: string): void {
|
||||
if (!Number.isFinite(value) || value < min || value > max) {
|
||||
throw new Error(`${label} must be between ${min} and ${max}`);
|
||||
@@ -383,6 +388,14 @@ export function isValidOpenAIModel(model: string, baseUrl?: string): boolean {
|
||||
return OPENAI_TTS_MODELS.includes(model as (typeof OPENAI_TTS_MODELS)[number]);
|
||||
}
|
||||
|
||||
/**
 * Resolves the `instructions` value to send with an OpenAI TTS request.
 *
 * The input is trimmed first. It is then kept only when the model id contains
 * "gpt-4o-mini-tts", so suffixed variants such as "gpt-4o-mini-tts-2025-12-15"
 * match as well. For any other model the instructions are dropped.
 *
 * @param model - Model id the request will use.
 * @param instructions - Raw instructions text; may be omitted or blank.
 * @returns The trimmed instructions, or `undefined` when they are blank or the
 *   model id does not contain "gpt-4o-mini-tts".
 */
export function resolveOpenAITtsInstructions(
  model: string,
  instructions?: string,
): string | undefined {
  const next = trimToUndefined(instructions);
  return next && model.includes("gpt-4o-mini-tts") ? next : undefined;
}
|
||||
|
||||
export function isValidOpenAIVoice(voice: string, baseUrl?: string): voice is OpenAiTtsVoice {
|
||||
// Allow any voice when using custom endpoint (e.g., Kokoro Chinese voices)
|
||||
if (isCustomOpenAIEndpoint(baseUrl)) {
|
||||
@@ -619,10 +632,14 @@ export async function openaiTTS(params: {
|
||||
baseUrl: string;
|
||||
model: string;
|
||||
voice: string;
|
||||
speed?: number;
|
||||
instructions?: string;
|
||||
responseFormat: "mp3" | "opus" | "pcm";
|
||||
timeoutMs: number;
|
||||
}): Promise<Buffer> {
|
||||
const { text, apiKey, baseUrl, model, voice, responseFormat, timeoutMs } = params;
|
||||
const { text, apiKey, baseUrl, model, voice, speed, instructions, responseFormat, timeoutMs } =
|
||||
params;
|
||||
const effectiveInstructions = resolveOpenAITtsInstructions(model, instructions);
|
||||
|
||||
if (!isValidOpenAIModel(model, baseUrl)) {
|
||||
throw new Error(`Invalid model: ${model}`);
|
||||
@@ -646,6 +663,8 @@ export async function openaiTTS(params: {
|
||||
input: text,
|
||||
voice,
|
||||
response_format: responseFormat,
|
||||
...(speed != null && { speed }),
|
||||
...(effectiveInstructions != null && { instructions: effectiveInstructions }),
|
||||
}),
|
||||
signal: controller.signal,
|
||||
});
|
||||
|
||||
@@ -57,6 +57,7 @@ const {
|
||||
OPENAI_TTS_MODELS,
|
||||
OPENAI_TTS_VOICES,
|
||||
parseTtsDirectives,
|
||||
resolveOpenAITtsInstructions,
|
||||
resolveModelOverridePolicy,
|
||||
summarizeText,
|
||||
resolveOutputFormat,
|
||||
@@ -169,6 +170,20 @@ describe("tts", () => {
|
||||
});
|
||||
});
|
||||
|
||||
// Unit tests for the model gate applied to OpenAI TTS `instructions`.
describe("resolveOpenAITtsInstructions", () => {
  it("keeps instructions only for gpt-4o-mini-tts variants", () => {
    // Supported model ids keep the (trimmed) instructions.
    expect(resolveOpenAITtsInstructions("gpt-4o-mini-tts", " Speak warmly ")).toBe(
      "Speak warmly",
    );
    expect(resolveOpenAITtsInstructions("gpt-4o-mini-tts-2025-12-15", "Speak warmly")).toBe(
      "Speak warmly",
    );

    // Other model ids drop them.
    expect(resolveOpenAITtsInstructions("tts-1", "Speak warmly")).toBeUndefined();
    expect(resolveOpenAITtsInstructions("tts-1-hd", "Speak warmly")).toBeUndefined();
  });

  it("returns undefined for blank or missing instructions", () => {
    expect(resolveOpenAITtsInstructions("gpt-4o-mini-tts", " ")).toBeUndefined();
    expect(resolveOpenAITtsInstructions("gpt-4o-mini-tts", undefined)).toBeUndefined();
  });
});
|
||||
|
||||
describe("resolveOutputFormat", () => {
|
||||
it("selects opus for voice-bubble channels (telegram/feishu/whatsapp) and mp3 for others", () => {
|
||||
const cases = [
|
||||
@@ -557,6 +572,84 @@ describe("tts", () => {
|
||||
});
|
||||
});
|
||||
|
||||
// Verifies that the telephony TTS path only forwards `instructions` to the
// OpenAI speech endpoint for models that support them.
describe("textToSpeechTelephony – openai instructions", () => {
  /**
   * Swaps `globalThis.fetch` for a mock that reports success with a 2-byte
   * audio payload, runs the callback, and always restores the original fetch.
   */
  const withMockedTelephonyFetch = async (
    run: (fetchMock: ReturnType<typeof vi.fn>) => Promise<void>,
  ) => {
    const originalFetch = globalThis.fetch;
    const fetchMock = vi.fn(async () => ({
      ok: true,
      arrayBuffer: async () => new ArrayBuffer(2),
    }));
    globalThis.fetch = fetchMock as unknown as typeof fetch;
    try {
      await run(fetchMock);
    } finally {
      globalThis.fetch = originalFetch;
    }
  };

  /**
   * Runs one telephony synthesis with `instructions: "Speak warmly"` configured
   * for the given model and asserts what `instructions` value reached the
   * request body.
   */
  const expectRequestInstructions = async (model: string, expected: string | undefined) => {
    const cfg: OpenClawConfig = {
      messages: {
        tts: {
          provider: "openai",
          openai: {
            apiKey: "test-key",
            model,
            voice: "alloy",
            instructions: "Speak warmly",
          },
        },
      },
    };

    await withMockedTelephonyFetch(async (fetchMock) => {
      const result = await tts.textToSpeechTelephony({
        text: "Hello there, friendly caller.",
        cfg,
      });

      expect(result.success).toBe(true);
      expect(fetchMock).toHaveBeenCalledTimes(1);

      // The second fetch argument carries the JSON request body.
      const [, init] = fetchMock.mock.calls[0] as [string, RequestInit];
      expect(typeof init.body).toBe("string");
      const body = JSON.parse(init.body as string) as Record<string, unknown>;
      expect(body.instructions).toBe(expected);
    });
  };

  it("omits instructions for unsupported speech models", async () => {
    await expectRequestInstructions("tts-1", undefined);
  });

  it("includes instructions for gpt-4o-mini-tts", async () => {
    await expectRequestInstructions("gpt-4o-mini-tts", "Speak warmly");
  });
});
|
||||
|
||||
describe("maybeApplyTtsToPayload", () => {
|
||||
const baseCfg: OpenClawConfig = {
|
||||
agents: { defaults: { model: { primary: "openai/gpt-4o-mini" } } },
|
||||
|
||||
@@ -37,6 +37,7 @@ import {
|
||||
isValidVoiceId,
|
||||
OPENAI_TTS_MODELS,
|
||||
OPENAI_TTS_VOICES,
|
||||
resolveOpenAITtsInstructions,
|
||||
openaiTTS,
|
||||
parseTtsDirectives,
|
||||
scheduleCleanup,
|
||||
@@ -117,6 +118,8 @@ export type ResolvedTtsConfig = {
|
||||
baseUrl: string;
|
||||
model: string;
|
||||
voice: string;
|
||||
speed?: number;
|
||||
instructions?: string;
|
||||
};
|
||||
edge: {
|
||||
enabled: boolean;
|
||||
@@ -304,6 +307,8 @@ export function resolveTtsConfig(cfg: OpenClawConfig): ResolvedTtsConfig {
|
||||
).replace(/\/+$/, ""),
|
||||
model: raw.openai?.model ?? DEFAULT_OPENAI_MODEL,
|
||||
voice: raw.openai?.voice ?? DEFAULT_OPENAI_VOICE,
|
||||
speed: raw.openai?.speed,
|
||||
instructions: raw.openai?.instructions?.trim() || undefined,
|
||||
},
|
||||
edge: {
|
||||
enabled: raw.edge?.enabled ?? true,
|
||||
@@ -692,6 +697,8 @@ export async function textToSpeech(params: {
|
||||
baseUrl: config.openai.baseUrl,
|
||||
model: openaiModelOverride ?? config.openai.model,
|
||||
voice: openaiVoiceOverride ?? config.openai.voice,
|
||||
speed: config.openai.speed,
|
||||
instructions: config.openai.instructions,
|
||||
responseFormat: output.openai,
|
||||
timeoutMs: config.timeoutMs,
|
||||
});
|
||||
@@ -789,6 +796,8 @@ export async function textToSpeechTelephony(params: {
|
||||
baseUrl: config.openai.baseUrl,
|
||||
model: config.openai.model,
|
||||
voice: config.openai.voice,
|
||||
speed: config.openai.speed,
|
||||
instructions: config.openai.instructions,
|
||||
responseFormat: output.format,
|
||||
timeoutMs: config.timeoutMs,
|
||||
});
|
||||
@@ -961,6 +970,7 @@ export const _test = {
|
||||
isValidOpenAIModel,
|
||||
OPENAI_TTS_MODELS,
|
||||
OPENAI_TTS_VOICES,
|
||||
resolveOpenAITtsInstructions,
|
||||
parseTtsDirectives,
|
||||
resolveModelOverridePolicy,
|
||||
summarizeText,
|
||||
|
||||
Reference in New Issue
Block a user