fix: support parakeet-mlx output-dir transcript parsing (#9177) (thanks @mac-110)
This commit is contained in:
@@ -29,6 +29,7 @@ Docs: https://docs.openclaw.ai
|
||||
- Plugin SDK/runtime hardening: add package export verification in CI/release checks to catch missing runtime exports before publish-time regressions. (#28575) Thanks @Glucksberg.
|
||||
- Media understanding/provider HTTP proxy routing: pass a proxy-aware fetch function from `HTTPS_PROXY`/`HTTP_PROXY` env vars into audio/video provider calls (with graceful malformed-proxy fallback) so transcription/video requests honor configured outbound proxies. (#27093) Thanks @mcaxtr.
|
||||
- Media understanding/malformed attachment guards: harden attachment selection and decision summary formatting against non-array or malformed attachment payloads to prevent runtime crashes on invalid inbound metadata shapes. (#28024) Thanks @claw9267.
|
||||
- Media understanding/parakeet CLI output parsing: read `parakeet-mlx` transcripts from `--output-dir/<media-basename>.txt` when txt output is requested (or default), with stdout fallback for non-txt formats. (#9177) Thanks @mac-110.
|
||||
- Media understanding/audio transcription guard: skip tiny/empty audio files (<1024 bytes) before provider/CLI transcription to avoid noisy invalid-audio failures and preserve clean fallback behavior. (#8388) Thanks @Glucksberg.
|
||||
- OpenAI media capabilities: include `audio` in the OpenAI provider capability list so audio transcription models are eligible in media-understanding provider selection. (#12717) Thanks @openjay.
|
||||
- Security/Node exec approvals: preserve shell/dispatch-wrapper argv semantics during approval hardening so approved wrapper commands (for example `env sh -c ...`) cannot drift into a different runtime command shape, and add regression coverage for both approval-plan generation and approved runtime execution paths. Thanks @tdjackey for reporting.
|
||||
|
||||
@@ -176,5 +176,6 @@ When `requireMention: true` is set for a group chat, OpenClaw now transcribes au
|
||||
|
||||
- Scope rules use first-match wins. `chatType` is normalized to `direct`, `group`, or `room`.
|
||||
- Ensure your CLI exits with status 0 and prints plain text; if it emits JSON instead, extract the transcript text with `jq -r .text`.
|
||||
- For `parakeet-mlx`, if you pass `--output-dir`, OpenClaw reads `<output-dir>/<media-basename>.txt` when `--output-format` is `txt` (or omitted); non-`txt` output formats fall back to stdout parsing.
|
||||
- Keep timeouts reasonable (`timeoutSeconds`, default 60s) to avoid blocking the reply queue.
|
||||
- Preflight transcription only processes the **first** audio attachment for mention detection. Additional audio is processed during the main media understanding phase.
|
||||
|
||||
@@ -210,6 +210,7 @@ If you omit `capabilities`, the entry is eligible for the list it appears in.
|
||||
|
||||
- `openai/gpt-4o-mini-transcribe`, `groq/whisper-large-v3-turbo`, `deepgram/nova-3`, or `mistral/voxtral-mini-latest`.
|
||||
- CLI fallback: `whisper-cli` (whisper-cpp) or `whisper`.
|
||||
- `parakeet-mlx` note: with `--output-dir`, OpenClaw reads `<output-dir>/<media-basename>.txt` when output format is `txt` (or unspecified); non-`txt` formats fall back to stdout.
|
||||
- Deepgram setup: [Deepgram (audio transcription)](/providers/deepgram).
|
||||
|
||||
**Video**
|
||||
|
||||
@@ -477,6 +477,82 @@ describe("applyMediaUnderstanding", () => {
|
||||
expect(ctx.Body).toBe("[Audio]\nTranscript:\ncli transcript");
|
||||
});
|
||||
|
||||
it("reads parakeet-mlx transcript from output-dir txt file", async () => {
|
||||
const ctx = await createAudioCtx({ fileName: "sample.wav", mediaType: "audio/wav" });
|
||||
const cfg: OpenClawConfig = {
|
||||
tools: {
|
||||
media: {
|
||||
audio: {
|
||||
enabled: true,
|
||||
models: [
|
||||
{
|
||||
type: "cli",
|
||||
command: "parakeet-mlx",
|
||||
args: ["{{MediaPath}}", "--output-format", "txt", "--output-dir", "{{OutputDir}}"],
|
||||
},
|
||||
],
|
||||
},
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
mockedRunExec.mockImplementationOnce(async (_cmd, args) => {
|
||||
const mediaPath = args[0];
|
||||
const outputDirArgIndex = args.indexOf("--output-dir");
|
||||
const outputDir = outputDirArgIndex >= 0 ? args[outputDirArgIndex + 1] : undefined;
|
||||
const transcriptPath =
|
||||
mediaPath && outputDir ? path.join(outputDir, `${path.parse(mediaPath).name}.txt`) : "";
|
||||
if (transcriptPath) {
|
||||
await fs.writeFile(transcriptPath, "parakeet transcript\n");
|
||||
}
|
||||
return { stdout: "", stderr: "" };
|
||||
});
|
||||
|
||||
const result = await applyMediaUnderstanding({ ctx, cfg });
|
||||
|
||||
expect(result.appliedAudio).toBe(true);
|
||||
expect(ctx.Transcript).toBe("parakeet transcript");
|
||||
expect(ctx.Body).toBe("[Audio]\nTranscript:\nparakeet transcript");
|
||||
});
|
||||
|
||||
it("falls back to stdout for parakeet-mlx when output format is not txt", async () => {
|
||||
const ctx = await createAudioCtx({ fileName: "sample.wav", mediaType: "audio/wav" });
|
||||
const cfg: OpenClawConfig = {
|
||||
tools: {
|
||||
media: {
|
||||
audio: {
|
||||
enabled: true,
|
||||
models: [
|
||||
{
|
||||
type: "cli",
|
||||
command: "parakeet-mlx",
|
||||
args: ["{{MediaPath}}", "--output-format", "json", "--output-dir", "{{OutputDir}}"],
|
||||
},
|
||||
],
|
||||
},
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
mockedRunExec.mockImplementationOnce(async (_cmd, args) => {
|
||||
const mediaPath = args[0];
|
||||
const outputDirArgIndex = args.indexOf("--output-dir");
|
||||
const outputDir = outputDirArgIndex >= 0 ? args[outputDirArgIndex + 1] : undefined;
|
||||
const transcriptPath =
|
||||
mediaPath && outputDir ? path.join(outputDir, `${path.parse(mediaPath).name}.txt`) : "";
|
||||
if (transcriptPath) {
|
||||
await fs.writeFile(transcriptPath, "should-not-be-used\n");
|
||||
}
|
||||
return { stdout: "stdout transcript\n", stderr: "" };
|
||||
});
|
||||
|
||||
const result = await applyMediaUnderstanding({ ctx, cfg });
|
||||
|
||||
expect(result.appliedAudio).toBe(true);
|
||||
expect(ctx.Transcript).toBe("stdout transcript");
|
||||
expect(ctx.Body).toBe("[Audio]\nTranscript:\nstdout transcript");
|
||||
});
|
||||
|
||||
it("auto-detects sherpa for audio when binary and model files are available", async () => {
|
||||
const binDir = await createTempMediaDir();
|
||||
const modelDir = await createTempMediaDir();
|
||||
|
||||
@@ -136,6 +136,19 @@ function resolveWhisperCppOutputPath(args: string[]): string | null {
|
||||
return `${outputBase}.txt`;
|
||||
}
|
||||
|
||||
function resolveParakeetOutputPath(args: string[], mediaPath: string): string | null {
|
||||
const outputDir = findArgValue(args, ["--output-dir"]);
|
||||
const outputFormat = findArgValue(args, ["--output-format"]);
|
||||
if (!outputDir) {
|
||||
return null;
|
||||
}
|
||||
if (outputFormat && outputFormat !== "txt") {
|
||||
return null;
|
||||
}
|
||||
const base = path.parse(mediaPath).name;
|
||||
return path.join(outputDir, `${base}.txt`);
|
||||
}
|
||||
|
||||
async function resolveCliOutput(params: {
|
||||
command: string;
|
||||
args: string[];
|
||||
@@ -148,7 +161,9 @@ async function resolveCliOutput(params: {
|
||||
? resolveWhisperCppOutputPath(params.args)
|
||||
: commandId === "whisper"
|
||||
? resolveWhisperOutputPath(params.args, params.mediaPath)
|
||||
: null;
|
||||
: commandId === "parakeet-mlx"
|
||||
? resolveParakeetOutputPath(params.args, params.mediaPath)
|
||||
: null;
|
||||
if (fileOutput && (await fileExists(fileOutput))) {
|
||||
try {
|
||||
const content = await fs.readFile(fileOutput, "utf8");
|
||||
|
||||
Reference in New Issue
Block a user