fix(voice-call): add speed and instructions to OpenAI TTS config schema (#39226)

Merged via squash.

Prepared head SHA: 775e3063b58d4629f59021798ab1c7222ff069d9
Co-authored-by: ademczuk <5212682+ademczuk@users.noreply.github.com>
Co-authored-by: obviyus <22031114+obviyus@users.noreply.github.com>
Reviewed-by: @obviyus
This commit is contained in:
ademczuk
2026-03-11 18:45:48 +01:00
committed by GitHub
parent daf8afc954
commit 8618a711ff
11 changed files with 209 additions and 4 deletions

View File

@@ -535,6 +535,7 @@ Docs: https://docs.openclaw.ai
- Browser/config schema: accept `browser.profiles.*.driver: "openclaw"` while preserving legacy `"clawd"` compatibility in validated config. (#39374; based on #35621) Thanks @gambletan and @ingyukoh.
- Memory flush/bootstrap file protection: restrict memory-flush runs to append-only `read`/`write` tools and route host-side memory appends through root-enforced safe file handles so flush turns cannot overwrite bootstrap files via `exec` or unsafe raw rewrites. (#38574) Thanks @frankekn.
- Mattermost/DM media uploads: resolve bare 26-character Mattermost IDs user-first for direct messages so media sends no longer fail with `403 Forbidden` when targets are configured as unprefixed user IDs. (#29925) Thanks @teconomix.
- Voice-call/OpenAI TTS config parity: add missing `speed`, `instructions`, and `baseUrl` fields to the OpenAI TTS config schema and gate `instructions` to supported models so voice-call overrides validate and route cleanly through core TTS. (#39226) Thanks @ademczuk.
## 2026.3.2

View File

@@ -522,11 +522,22 @@
"apiKey": {
"type": "string"
},
"baseUrl": {
"type": "string"
},
"model": {
"type": "string"
},
"voice": {
"type": "string"
},
"speed": {
"type": "number",
"minimum": 0.25,
"maximum": 4.0
},
"instructions": {
"type": "string"
}
}
},

View File

@@ -1,3 +1,4 @@
import { resolveOpenAITtsInstructions } from "openclaw/plugin-sdk/voice-call";
import { pcmToMulaw } from "../telephony-audio.js";
/**
@@ -110,9 +111,11 @@ export class OpenAITTSProvider {
speed: this.speed,
};
// Attach instructions only when the model supports them (gated by resolveOpenAITtsInstructions)
const effectiveInstructions = trimToUndefined(instructions) ?? this.instructions;
if (effectiveInstructions && this.model.includes("gpt-4o-mini-tts")) {
const effectiveInstructions = resolveOpenAITtsInstructions(
this.model,
trimToUndefined(instructions) ?? this.instructions,
);
if (effectiveInstructions) {
body.instructions = effectiveInstructions;
}

View File

@@ -279,6 +279,31 @@ describe("config plugin validation", () => {
expect(res.ok).toBe(true);
});
it("accepts voice-call OpenAI TTS speed, instructions, and baseUrl config fields", async () => {
const res = validateInSuite({
agents: { list: [{ id: "pi" }] },
plugins: {
enabled: true,
load: { paths: [voiceCallSchemaPluginDir] },
entries: {
"voice-call-schema-fixture": {
config: {
tts: {
openai: {
baseUrl: "http://localhost:8880/v1",
voice: "alloy",
speed: 1.5,
instructions: "Speak in a cheerful tone",
},
},
},
},
},
},
});
expect(res.ok).toBe(true);
});
it("accepts known plugin ids and valid channel/heartbeat enums", async () => {
const res = validateInSuite({
agents: {

View File

@@ -61,6 +61,10 @@ export type TtsConfig = {
baseUrl?: string;
model?: string;
voice?: string;
/** Playback speed (0.25–4.0, default 1.0). */
speed?: number;
/** System-level instructions for the TTS model (gpt-4o-mini-tts only). */
instructions?: string;
};
/** Microsoft Edge (node-edge-tts) configuration. */
edge?: {

View File

@@ -404,6 +404,8 @@ export const TtsConfigSchema = z
baseUrl: z.string().optional(),
model: z.string().optional(),
voice: z.string().optional(),
speed: z.number().min(0.25).max(4).optional(),
instructions: z.string().optional(),
})
.strict()
.optional(),

View File

@@ -0,0 +1,36 @@
import { describe, expect, it } from "vitest";
import { TtsConfigSchema } from "./zod-schema.core.js";
describe("TtsConfigSchema openai speed and instructions", () => {
  // Helper: builds a thunk that parses the given `openai` TTS section,
  // so each assertion can pass it straight to expect(...).toThrow().
  const parseOpenAi = (openai: Record<string, unknown>) => () =>
    TtsConfigSchema.parse({ openai });

  it("accepts speed and instructions in openai section", () => {
    expect(
      parseOpenAi({
        voice: "alloy",
        speed: 1.5,
        instructions: "Speak in a cheerful tone",
      }),
    ).not.toThrow();
  });

  it("rejects out-of-range openai speed", () => {
    // Above the 4.0 maximum enforced by the schema.
    expect(parseOpenAi({ speed: 5.0 })).toThrow();
  });

  it("rejects openai speed below minimum", () => {
    // Below the 0.25 minimum enforced by the schema.
    expect(parseOpenAi({ speed: 0.1 })).toThrow();
  });
});

View File

@@ -7,6 +7,7 @@ export {
TtsModeSchema,
TtsProviderSchema,
} from "../config/zod-schema.core.js";
export { resolveOpenAITtsInstructions } from "../tts/tts-core.js";
export type { GatewayRequestHandlerOptions } from "../gateway/server-methods/types.js";
export {
isRequestBodyLimitError,

View File

@@ -43,6 +43,11 @@ function normalizeOpenAITtsBaseUrl(baseUrl?: string): string {
return trimmed.replace(/\/+$/, "");
}
/**
 * Normalizes an optional string: strips surrounding whitespace and maps
 * empty/whitespace-only/missing input to `undefined`.
 *
 * @param value - The raw string, possibly undefined.
 * @returns The trimmed string when non-empty; otherwise `undefined`.
 */
function trimToUndefined(value?: string): string | undefined {
  if (value === undefined) {
    return undefined;
  }
  const normalized = value.trim();
  return normalized.length > 0 ? normalized : undefined;
}
function requireInRange(value: number, min: number, max: number, label: string): void {
if (!Number.isFinite(value) || value < min || value > max) {
throw new Error(`${label} must be between ${min} and ${max}`);
@@ -383,6 +388,14 @@ export function isValidOpenAIModel(model: string, baseUrl?: string): boolean {
return OPENAI_TTS_MODELS.includes(model as (typeof OPENAI_TTS_MODELS)[number]);
}
/**
 * Resolves the effective TTS `instructions` value for an OpenAI model.
 *
 * Instructions are only honored by gpt-4o-mini-tts model variants (the
 * model id is matched by substring, so dated variants qualify too); for
 * any other model, or when the trimmed instructions are empty, this
 * returns `undefined` so callers omit the field from the request body.
 *
 * @param model - The OpenAI TTS model identifier.
 * @param instructions - Optional raw instructions text.
 * @returns Trimmed instructions for supported models, else `undefined`.
 */
export function resolveOpenAITtsInstructions(
  model: string,
  instructions?: string,
): string | undefined {
  if (!model.includes("gpt-4o-mini-tts")) {
    return undefined;
  }
  const trimmed = instructions?.trim();
  return trimmed ? trimmed : undefined;
}
export function isValidOpenAIVoice(voice: string, baseUrl?: string): voice is OpenAiTtsVoice {
// Allow any voice when using custom endpoint (e.g., Kokoro Chinese voices)
if (isCustomOpenAIEndpoint(baseUrl)) {
@@ -619,10 +632,14 @@ export async function openaiTTS(params: {
baseUrl: string;
model: string;
voice: string;
speed?: number;
instructions?: string;
responseFormat: "mp3" | "opus" | "pcm";
timeoutMs: number;
}): Promise<Buffer> {
const { text, apiKey, baseUrl, model, voice, responseFormat, timeoutMs } = params;
const { text, apiKey, baseUrl, model, voice, speed, instructions, responseFormat, timeoutMs } =
params;
const effectiveInstructions = resolveOpenAITtsInstructions(model, instructions);
if (!isValidOpenAIModel(model, baseUrl)) {
throw new Error(`Invalid model: ${model}`);
@@ -646,6 +663,8 @@ export async function openaiTTS(params: {
input: text,
voice,
response_format: responseFormat,
...(speed != null && { speed }),
...(effectiveInstructions != null && { instructions: effectiveInstructions }),
}),
signal: controller.signal,
});

View File

@@ -57,6 +57,7 @@ const {
OPENAI_TTS_MODELS,
OPENAI_TTS_VOICES,
parseTtsDirectives,
resolveOpenAITtsInstructions,
resolveModelOverridePolicy,
summarizeText,
resolveOutputFormat,
@@ -169,6 +170,20 @@ describe("tts", () => {
});
});
describe("resolveOpenAITtsInstructions", () => {
it("keeps instructions only for gpt-4o-mini-tts variants", () => {
expect(resolveOpenAITtsInstructions("gpt-4o-mini-tts", " Speak warmly ")).toBe(
"Speak warmly",
);
expect(resolveOpenAITtsInstructions("gpt-4o-mini-tts-2025-12-15", "Speak warmly")).toBe(
"Speak warmly",
);
expect(resolveOpenAITtsInstructions("tts-1", "Speak warmly")).toBeUndefined();
expect(resolveOpenAITtsInstructions("tts-1-hd", "Speak warmly")).toBeUndefined();
expect(resolveOpenAITtsInstructions("gpt-4o-mini-tts", " ")).toBeUndefined();
});
});
describe("resolveOutputFormat", () => {
it("selects opus for voice-bubble channels (telegram/feishu/whatsapp) and mp3 for others", () => {
const cases = [
@@ -557,6 +572,84 @@ describe("tts", () => {
});
});
describe("textToSpeechTelephony openai instructions", () => {
const withMockedTelephonyFetch = async (
run: (fetchMock: ReturnType<typeof vi.fn>) => Promise<void>,
) => {
const originalFetch = globalThis.fetch;
const fetchMock = vi.fn(async () => ({
ok: true,
arrayBuffer: async () => new ArrayBuffer(2),
}));
globalThis.fetch = fetchMock as unknown as typeof fetch;
try {
await run(fetchMock);
} finally {
globalThis.fetch = originalFetch;
}
};
it("omits instructions for unsupported speech models", async () => {
const cfg: OpenClawConfig = {
messages: {
tts: {
provider: "openai",
openai: {
apiKey: "test-key",
model: "tts-1",
voice: "alloy",
instructions: "Speak warmly",
},
},
},
};
await withMockedTelephonyFetch(async (fetchMock) => {
const result = await tts.textToSpeechTelephony({
text: "Hello there, friendly caller.",
cfg,
});
expect(result.success).toBe(true);
expect(fetchMock).toHaveBeenCalledTimes(1);
const [, init] = fetchMock.mock.calls[0] as [string, RequestInit];
expect(typeof init.body).toBe("string");
const body = JSON.parse(init.body as string) as Record<string, unknown>;
expect(body.instructions).toBeUndefined();
});
});
it("includes instructions for gpt-4o-mini-tts", async () => {
const cfg: OpenClawConfig = {
messages: {
tts: {
provider: "openai",
openai: {
apiKey: "test-key",
model: "gpt-4o-mini-tts",
voice: "alloy",
instructions: "Speak warmly",
},
},
},
};
await withMockedTelephonyFetch(async (fetchMock) => {
const result = await tts.textToSpeechTelephony({
text: "Hello there, friendly caller.",
cfg,
});
expect(result.success).toBe(true);
expect(fetchMock).toHaveBeenCalledTimes(1);
const [, init] = fetchMock.mock.calls[0] as [string, RequestInit];
expect(typeof init.body).toBe("string");
const body = JSON.parse(init.body as string) as Record<string, unknown>;
expect(body.instructions).toBe("Speak warmly");
});
});
});
describe("maybeApplyTtsToPayload", () => {
const baseCfg: OpenClawConfig = {
agents: { defaults: { model: { primary: "openai/gpt-4o-mini" } } },

View File

@@ -37,6 +37,7 @@ import {
isValidVoiceId,
OPENAI_TTS_MODELS,
OPENAI_TTS_VOICES,
resolveOpenAITtsInstructions,
openaiTTS,
parseTtsDirectives,
scheduleCleanup,
@@ -117,6 +118,8 @@ export type ResolvedTtsConfig = {
baseUrl: string;
model: string;
voice: string;
speed?: number;
instructions?: string;
};
edge: {
enabled: boolean;
@@ -304,6 +307,8 @@ export function resolveTtsConfig(cfg: OpenClawConfig): ResolvedTtsConfig {
).replace(/\/+$/, ""),
model: raw.openai?.model ?? DEFAULT_OPENAI_MODEL,
voice: raw.openai?.voice ?? DEFAULT_OPENAI_VOICE,
speed: raw.openai?.speed,
instructions: raw.openai?.instructions?.trim() || undefined,
},
edge: {
enabled: raw.edge?.enabled ?? true,
@@ -692,6 +697,8 @@ export async function textToSpeech(params: {
baseUrl: config.openai.baseUrl,
model: openaiModelOverride ?? config.openai.model,
voice: openaiVoiceOverride ?? config.openai.voice,
speed: config.openai.speed,
instructions: config.openai.instructions,
responseFormat: output.openai,
timeoutMs: config.timeoutMs,
});
@@ -789,6 +796,8 @@ export async function textToSpeechTelephony(params: {
baseUrl: config.openai.baseUrl,
model: config.openai.model,
voice: config.openai.voice,
speed: config.openai.speed,
instructions: config.openai.instructions,
responseFormat: output.format,
timeoutMs: config.timeoutMs,
});
@@ -961,6 +970,7 @@ export const _test = {
isValidOpenAIModel,
OPENAI_TTS_MODELS,
OPENAI_TTS_VOICES,
resolveOpenAITtsInstructions,
parseTtsDirectives,
resolveModelOverridePolicy,
summarizeText,