diff --git a/CHANGELOG.md b/CHANGELOG.md index bc3e1369b..4fa5806ae 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ Docs: https://docs.openclaw.ai - Telegram/topic agent routing: support per-topic `agentId` overrides in forum groups and DM topics so topics can route to dedicated agents with isolated sessions. (#33647; based on #31513) Thanks @kesor and @Sid-Qin. - Slack/DM typing feedback: add `channels.slack.typingReaction` so Socket Mode DMs can show reaction-based processing status even when Slack native assistant typing is unavailable. (#19816) Thanks @dalefrieswthat. - Cron/job snapshot persistence: skip backup during normalization persistence in `ensureLoaded` so `jobs.json.bak` keeps the pre-edit snapshot for recovery, while preserving backup creation on explicit user-driven writes. (#35234) Thanks @0xsline. +- TTS/OpenAI-compatible endpoints: add `messages.tts.openai.baseUrl` config support with config-over-env precedence, endpoint-aware directive validation, and OpenAI TTS request routing to the resolved base URL. (#34321) Thanks @RealKai42. ### Fixes diff --git a/src/config/types.tts.ts b/src/config/types.tts.ts index a9bb0ac07..3d898ff9c 100644 --- a/src/config/types.tts.ts +++ b/src/config/types.tts.ts @@ -58,6 +58,7 @@ export type TtsConfig = { /** OpenAI configuration. 
*/ openai?: { apiKey?: SecretInput; + baseUrl?: string; model?: string; voice?: string; }; diff --git a/src/config/zod-schema.core.ts b/src/config/zod-schema.core.ts index a3ced77d9..48c442994 100644 --- a/src/config/zod-schema.core.ts +++ b/src/config/zod-schema.core.ts @@ -401,6 +401,7 @@ export const TtsConfigSchema = z openai: z .object({ apiKey: SecretInputSchema.optional().register(sensitive), + baseUrl: z.string().optional(), model: z.string().optional(), voice: z.string().optional(), }) diff --git a/src/discord/voice/manager.ts b/src/discord/voice/manager.ts index 31b964ccb..abec26d90 100644 --- a/src/discord/voice/manager.ts +++ b/src/discord/voice/manager.ts @@ -673,7 +673,11 @@ export class DiscordVoiceManager { cfg: this.params.cfg, override: this.params.discordConfig.voice?.tts, }); - const directive = parseTtsDirectives(replyText, ttsConfig.modelOverrides); + const directive = parseTtsDirectives( + replyText, + ttsConfig.modelOverrides, + ttsConfig.openai.baseUrl, + ); const speakText = directive.overrides.ttsText ?? 
directive.cleanedText.trim(); if (!speakText) { logVoiceVerbose( diff --git a/src/tts/tts-core.ts b/src/tts/tts-core.ts index c460793c3..a39eff698 100644 --- a/src/tts/tts-core.ts +++ b/src/tts/tts-core.ts @@ -18,6 +18,7 @@ import type { } from "./tts.js"; const DEFAULT_ELEVENLABS_BASE_URL = "https://api.elevenlabs.io"; +export const DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1"; const TEMP_FILE_CLEANUP_DELAY_MS = 5 * 60 * 1000; // 5 minutes export function isValidVoiceId(voiceId: string): boolean { @@ -32,6 +33,14 @@ function normalizeElevenLabsBaseUrl(baseUrl: string): string { return trimmed.replace(/\/+$/, ""); } +function normalizeOpenAITtsBaseUrl(baseUrl?: string): string { + const trimmed = baseUrl?.trim(); + if (!trimmed) { + return DEFAULT_OPENAI_BASE_URL; + } + return trimmed.replace(/\/+$/, ""); +} + function requireInRange(value: number, min: number, max: number, label: string): void { if (!Number.isFinite(value) || value < min || value > max) { throw new Error(`${label} must be between ${min} and ${max}`); @@ -99,6 +108,7 @@ function parseNumberValue(value: string): number | undefined { export function parseTtsDirectives( text: string, policy: ResolvedTtsModelOverrides, + openaiBaseUrl?: string, ): TtsDirectiveParseResult { if (!policy.enabled) { return { cleanedText: text, overrides: {}, warnings: [], hasDirective: false }; @@ -151,7 +161,7 @@ export function parseTtsDirectives( if (!policy.allowVoice) { break; } - if (isValidOpenAIVoice(rawValue)) { + if (isValidOpenAIVoice(rawValue, openaiBaseUrl)) { overrides.openai = { ...overrides.openai, voice: rawValue }; } else { warnings.push(`invalid OpenAI voice "${rawValue}"`); @@ -180,7 +190,7 @@ export function parseTtsDirectives( if (!policy.allowModelId) { break; } - if (isValidOpenAIModel(rawValue)) { + if (isValidOpenAIModel(rawValue, openaiBaseUrl)) { overrides.openai = { ...overrides.openai, model: rawValue }; } else { overrides.elevenlabs = { ...overrides.elevenlabs, modelId: rawValue }; 
@@ -335,14 +345,14 @@ export const OPENAI_TTS_MODELS = ["gpt-4o-mini-tts", "tts-1", "tts-1-hd"] as con * Note: Read at runtime (not module load) to support config.env loading. */ function getOpenAITtsBaseUrl(): string { - return (process.env.OPENAI_TTS_BASE_URL?.trim() || "https://api.openai.com/v1").replace( - /\/+$/, - "", - ); + return normalizeOpenAITtsBaseUrl(process.env.OPENAI_TTS_BASE_URL); } -function isCustomOpenAIEndpoint(): boolean { - return getOpenAITtsBaseUrl() !== "https://api.openai.com/v1"; +function isCustomOpenAIEndpoint(baseUrl?: string): boolean { + if (baseUrl != null) { + return normalizeOpenAITtsBaseUrl(baseUrl) !== DEFAULT_OPENAI_BASE_URL; + } + return getOpenAITtsBaseUrl() !== DEFAULT_OPENAI_BASE_URL; } export const OPENAI_TTS_VOICES = [ "alloy", @@ -363,17 +373,17 @@ export const OPENAI_TTS_VOICES = [ type OpenAiTtsVoice = (typeof OPENAI_TTS_VOICES)[number]; -export function isValidOpenAIModel(model: string): boolean { +export function isValidOpenAIModel(model: string, baseUrl?: string): boolean { // Allow any model when using custom endpoint (e.g., Kokoro, LocalAI) - if (isCustomOpenAIEndpoint()) { + if (isCustomOpenAIEndpoint(baseUrl)) { return true; } return OPENAI_TTS_MODELS.includes(model as (typeof OPENAI_TTS_MODELS)[number]); } -export function isValidOpenAIVoice(voice: string): voice is OpenAiTtsVoice { +export function isValidOpenAIVoice(voice: string, baseUrl?: string): voice is OpenAiTtsVoice { // Allow any voice when using custom endpoint (e.g., Kokoro Chinese voices) - if (isCustomOpenAIEndpoint()) { + if (isCustomOpenAIEndpoint(baseUrl)) { return true; } return OPENAI_TTS_VOICES.includes(voice as OpenAiTtsVoice); @@ -591,17 +601,18 @@ export async function elevenLabsTTS(params: { export async function openaiTTS(params: { text: string; apiKey: string; + baseUrl: string; model: string; voice: string; responseFormat: "mp3" | "opus" | "pcm"; timeoutMs: number; }): Promise<Buffer> { - const { text, apiKey, model, voice, responseFormat, 
timeoutMs } = params; + const { text, apiKey, baseUrl, model, voice, responseFormat, timeoutMs } = params; - if (!isValidOpenAIModel(model)) { + if (!isValidOpenAIModel(model, baseUrl)) { throw new Error(`Invalid model: ${model}`); } - if (!isValidOpenAIVoice(voice)) { + if (!isValidOpenAIVoice(voice, baseUrl)) { throw new Error(`Invalid voice: ${voice}`); } @@ -609,7 +620,7 @@ export async function openaiTTS(params: { const timeout = setTimeout(() => controller.abort(), timeoutMs); try { - const response = await fetch(`${getOpenAITtsBaseUrl()}/audio/speech`, { + const response = await fetch(`${baseUrl}/audio/speech`, { method: "POST", headers: { Authorization: `Bearer ${apiKey}`, diff --git a/src/tts/tts.test.ts b/src/tts/tts.test.ts index d6bc88db4..0b4d7c56d 100644 --- a/src/tts/tts.test.ts +++ b/src/tts/tts.test.ts @@ -129,6 +129,10 @@ describe("tts", () => { expect(isValidOpenAIVoice("alloy ")).toBe(false); expect(isValidOpenAIVoice(" alloy")).toBe(false); }); + + it("treats the default endpoint with trailing slash as the default endpoint", () => { + expect(isValidOpenAIVoice("kokoro-custom-voice", "https://api.openai.com/v1/")).toBe(false); + }); }); describe("isValidOpenAIModel", () => { @@ -151,6 +155,10 @@ describe("tts", () => { expect(isValidOpenAIModel(testCase.model), testCase.model).toBe(testCase.expected); } }); + + it("treats the default endpoint with trailing slash as the default endpoint", () => { + expect(isValidOpenAIModel("kokoro-custom-model", "https://api.openai.com/v1/")).toBe(false); + }); }); describe("resolveOutputFormat", () => { @@ -277,6 +285,29 @@ describe("tts", () => { expect(result.cleanedText).toBe(input); expect(result.overrides.provider).toBeUndefined(); }); + + it("accepts custom voices and models when openaiBaseUrl is a non-default endpoint", () => { + const policy = resolveModelOverridePolicy({ enabled: true }); + const input = "Hello [[tts:voice=kokoro-chinese model=kokoro-v1]] world"; + const customBaseUrl = 
"http://localhost:8880/v1"; + + const result = parseTtsDirectives(input, policy, customBaseUrl); + + expect(result.overrides.openai?.voice).toBe("kokoro-chinese"); + expect(result.overrides.openai?.model).toBe("kokoro-v1"); + expect(result.warnings).toHaveLength(0); + }); + + it("rejects unknown voices and models when openaiBaseUrl is the default OpenAI endpoint", () => { + const policy = resolveModelOverridePolicy({ enabled: true }); + const input = "Hello [[tts:voice=kokoro-chinese model=kokoro-v1]] world"; + const defaultBaseUrl = "https://api.openai.com/v1"; + + const result = parseTtsDirectives(input, policy, defaultBaseUrl); + + expect(result.overrides.openai?.voice).toBeUndefined(); + expect(result.warnings).toContain('invalid OpenAI voice "kokoro-chinese"'); + }); }); describe("summarizeText", () => { @@ -437,6 +468,58 @@ describe("tts", () => { }); }); + describe("resolveTtsConfig – openai.baseUrl", () => { + const baseCfg: OpenClawConfig = { + agents: { defaults: { model: { primary: "openai/gpt-4o-mini" } } }, + messages: { tts: {} }, + }; + + it("defaults to the official OpenAI endpoint", () => { + withEnv({ OPENAI_TTS_BASE_URL: undefined }, () => { + const config = resolveTtsConfig(baseCfg); + expect(config.openai.baseUrl).toBe("https://api.openai.com/v1"); + }); + }); + + it("picks up OPENAI_TTS_BASE_URL env var when no config baseUrl is set", () => { + withEnv({ OPENAI_TTS_BASE_URL: "http://localhost:8880/v1" }, () => { + const config = resolveTtsConfig(baseCfg); + expect(config.openai.baseUrl).toBe("http://localhost:8880/v1"); + }); + }); + + it("config baseUrl takes precedence over env var", () => { + const cfg: OpenClawConfig = { + ...baseCfg, + messages: { + tts: { openai: { baseUrl: "http://my-server:9000/v1" } }, + }, + }; + withEnv({ OPENAI_TTS_BASE_URL: "http://localhost:8880/v1" }, () => { + const config = resolveTtsConfig(cfg); + expect(config.openai.baseUrl).toBe("http://my-server:9000/v1"); + }); + }); + + it("strips trailing slashes from 
the resolved baseUrl", () => { + const cfg: OpenClawConfig = { + ...baseCfg, + messages: { + tts: { openai: { baseUrl: "http://my-server:9000/v1///" } }, + }, + }; + const config = resolveTtsConfig(cfg); + expect(config.openai.baseUrl).toBe("http://my-server:9000/v1"); + }); + + it("strips trailing slashes from env var baseUrl", () => { + withEnv({ OPENAI_TTS_BASE_URL: "http://localhost:8880/v1/" }, () => { + const config = resolveTtsConfig(baseCfg); + expect(config.openai.baseUrl).toBe("http://localhost:8880/v1"); + }); + }); + }); + describe("maybeApplyTtsToPayload", () => { const baseCfg: OpenClawConfig = { agents: { defaults: { model: { primary: "openai/gpt-4o-mini" } } }, diff --git a/src/tts/tts.ts b/src/tts/tts.ts index eb0517f55..f76000029 100644 --- a/src/tts/tts.ts +++ b/src/tts/tts.ts @@ -28,6 +28,7 @@ import { stripMarkdown } from "../line/markdown-to-line.js"; import { isVoiceCompatibleAudio } from "../media/audio.js"; import { CONFIG_DIR, resolveUserPath } from "../utils.js"; import { + DEFAULT_OPENAI_BASE_URL, edgeTTS, elevenLabsTTS, inferEdgeExtension, @@ -113,6 +114,7 @@ export type ResolvedTtsConfig = { }; openai: { apiKey?: string; + baseUrl: string; model: string; voice: string; }; @@ -294,6 +296,12 @@ export function resolveTtsConfig(cfg: OpenClawConfig): ResolvedTtsConfig { value: raw.openai?.apiKey, path: "messages.tts.openai.apiKey", }), + // Config > env var > default; strip trailing slashes for consistency. + baseUrl: ( + raw.openai?.baseUrl?.trim() || + process.env.OPENAI_TTS_BASE_URL?.trim() || + DEFAULT_OPENAI_BASE_URL + ).replace(/\/+$/, ""), model: raw.openai?.model ?? DEFAULT_OPENAI_MODEL, voice: raw.openai?.voice ?? DEFAULT_OPENAI_VOICE, }, @@ -681,6 +689,7 @@ export async function textToSpeech(params: { audioBuffer = await openaiTTS({ text: params.text, apiKey, + baseUrl: config.openai.baseUrl, model: openaiModelOverride ?? config.openai.model, voice: openaiVoiceOverride ?? 
config.openai.voice, responseFormat: output.openai, @@ -777,6 +786,7 @@ export async function textToSpeechTelephony(params: { const audioBuffer = await openaiTTS({ text: params.text, apiKey, + baseUrl: config.openai.baseUrl, model: config.openai.model, voice: config.openai.voice, responseFormat: output.format, @@ -819,7 +829,7 @@ export async function maybeApplyTtsToPayload(params: { } const text = params.payload.text ?? ""; - const directives = parseTtsDirectives(text, config.modelOverrides); + const directives = parseTtsDirectives(text, config.modelOverrides, config.openai.baseUrl); if (directives.warnings.length > 0) { logVerbose(`TTS: ignored directive overrides (${directives.warnings.join("; ")})`); }