perf: harden chunking against quadratic scans
This commit is contained in:
90
extensions/feishu/src/docx-batch-insert.test.ts
Normal file
90
extensions/feishu/src/docx-batch-insert.test.ts
Normal file
@@ -0,0 +1,90 @@
|
||||
import { describe, expect, it, vi } from "vitest";
|
||||
import { BATCH_SIZE, insertBlocksInBatches } from "./docx-batch-insert.js";
|
||||
|
||||
/**
 * Wraps `values` in an iterable that records how many times it has been walked.
 *
 * @param values - Items yielded, in order, on every pass.
 * @returns An object with:
 *   - `values`: an iterable whose generator bumps the counter and then yields
 *     every item of the wrapped array;
 *   - `getIterations`: returns the number of passes started so far.
 */
function createCountingIterable<T>(values: T[]) {
  let iterations = 0;
  return {
    values: {
      [Symbol.iterator]: function* () {
        // Generator bodies run lazily, so the counter moves on the first
        // `next()` of each pass rather than when the iterator is requested.
        iterations += 1;
        yield* values;
      },
    },
    getIterations: () => iterations,
  };
}
|
||||
|
||||
describe("insertBlocksInBatches", () => {
  // Guards against rebuilding the block lookup per first-level id: the source
  // blocks are handed over as a counting iterable and must be walked only once.
  it("builds the source block map once for large flat trees", async () => {
    const blockCount = BATCH_SIZE + 200;
    const blocks = Array.from({ length: blockCount }, (_, index) => ({
      block_id: `block_${index}`,
      block_type: 2,
    }));
    const counting = createCountingIterable(blocks);
    // Fake API call: echoes every requested child id back as a created child.
    const createMock = vi.fn(async ({ data }: { data: { children_id: string[] } }) => ({
      code: 0,
      data: {
        children: data.children_id.map((id) => ({ block_id: id })),
      },
    }));
    const client = {
      docx: {
        documentBlockDescendant: {
          create: createMock,
        },
      },
    } as any;

    const result = await insertBlocksInBatches(
      client,
      "doc_1",
      counting.values as any[],
      blocks.map((block) => block.block_id),
    );

    expect(counting.getIterations()).toBe(1);
    // One full batch plus the 200-block remainder.
    expect(createMock).toHaveBeenCalledTimes(2);
    expect(createMock.mock.calls[0]?.[0]?.data.children_id).toHaveLength(BATCH_SIZE);
    expect(createMock.mock.calls[1]?.[0]?.data.children_id).toHaveLength(200);
    expect(result.children).toHaveLength(blockCount);
  });

  // Each root must be sent together with its own subtree, in root order.
  it("keeps nested descendants grouped with their root blocks", async () => {
    const createMock = vi.fn(
      async ({
        data,
      }: {
        data: { children_id: string[]; descendants: Array<{ block_id: string }> };
      }) => ({
        code: 0,
        data: {
          children: data.children_id.map((id) => ({ block_id: id })),
        },
      }),
    );
    const client = {
      docx: {
        documentBlockDescendant: {
          create: createMock,
        },
      },
    } as any;
    const blocks = [
      { block_id: "root_a", block_type: 1, children: ["child_a"] },
      { block_id: "child_a", block_type: 2 },
      { block_id: "root_b", block_type: 1, children: ["child_b"] },
      { block_id: "child_b", block_type: 2 },
    ];

    await insertBlocksInBatches(client, "doc_1", blocks as any[], ["root_a", "root_b"]);

    expect(createMock).toHaveBeenCalledTimes(1);
    expect(createMock.mock.calls[0]?.[0]?.data.children_id).toEqual(["root_a", "root_b"]);
    expect(
      createMock.mock.calls[0]?.[0]?.data.descendants.map(
        (block: { block_id: string }) => block.block_id,
      ),
    ).toEqual(["root_a", "child_a", "root_b", "child_b"]);
  });
});
|
||||
@@ -14,16 +14,11 @@ export const BATCH_SIZE = 1000; // Feishu API limit per request
|
||||
type Logger = { info?: (msg: string) => void };
|
||||
|
||||
/**
|
||||
* Collect all descendant blocks for a given set of first-level block IDs.
|
||||
* Collect all descendant blocks for a given first-level block ID.
|
||||
* Recursively traverses the block tree to gather all children.
|
||||
*/
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- SDK block types
|
||||
function collectDescendants(blocks: any[], firstLevelIds: string[]): any[] {
|
||||
const blockMap = new Map<string, any>();
|
||||
for (const block of blocks) {
|
||||
blockMap.set(block.block_id, block);
|
||||
}
|
||||
|
||||
function collectDescendants(blockMap: Map<string, any>, rootId: string): any[] {
|
||||
const result: any[] = [];
|
||||
const visited = new Set<string>();
|
||||
|
||||
@@ -47,9 +42,7 @@ function collectDescendants(blocks: any[], firstLevelIds: string[]): any[] {
|
||||
}
|
||||
}
|
||||
|
||||
for (const id of firstLevelIds) {
|
||||
collect(id);
|
||||
}
|
||||
collect(rootId);
|
||||
|
||||
return result;
|
||||
}
|
||||
@@ -123,9 +116,13 @@ export async function insertBlocksInBatches(
|
||||
const batches: { firstLevelIds: string[]; blocks: any[] }[] = [];
|
||||
let currentBatch: { firstLevelIds: string[]; blocks: any[] } = { firstLevelIds: [], blocks: [] };
|
||||
const usedBlockIds = new Set<string>();
|
||||
const blockMap = new Map<string, any>();
|
||||
for (const block of blocks) {
|
||||
blockMap.set(block.block_id, block);
|
||||
}
|
||||
|
||||
for (const firstLevelId of firstLevelBlockIds) {
|
||||
const descendants = collectDescendants(blocks, [firstLevelId]);
|
||||
const descendants = collectDescendants(blockMap, firstLevelId);
|
||||
const newBlocks = descendants.filter((b) => !usedBlockIds.has(b.block_id));
|
||||
|
||||
// A single block whose subtree exceeds the API limit cannot be split
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { describe, expect, it, vi } from "vitest";
|
||||
import * as fences from "../markdown/fences.js";
|
||||
import { EmbeddedBlockChunker } from "./pi-embedded-block-chunker.js";
|
||||
|
||||
function createFlushOnParagraphChunker(params: { minChars: number; maxChars: number }) {
|
||||
@@ -120,4 +121,20 @@ describe("EmbeddedBlockChunker", () => {
|
||||
expect(chunks).toEqual(["Intro\n```js\nconst a = 1;\n\nconst b = 2;\n```"]);
|
||||
expect(chunker.bufferedText).toBe("After fence");
|
||||
});
|
||||
|
||||
// Perf regression guard: draining one long fenced buffer into many chunks must
// parse the fence spans a single time rather than once per emitted chunk.
it("parses fence spans once per drain call for long fenced buffers", () => {
  const parseSpy = vi.spyOn(fences, "parseFenceSpans");
  try {
    const chunker = new EmbeddedBlockChunker({
      minChars: 20,
      maxChars: 80,
      breakPreference: "paragraph",
    });

    chunker.append(`\`\`\`txt\n${"line\n".repeat(600)}\`\`\``);
    const chunks = drainChunks(chunker);

    expect(chunks.length).toBeGreaterThan(2);
    expect(parseSpy).toHaveBeenCalledTimes(1);
  } finally {
    // Restore even when an expectation throws so the spy cannot leak into
    // later tests in this file.
    parseSpy.mockRestore();
  }
});
|
||||
});
|
||||
|
||||
@@ -12,6 +12,7 @@ export type BlockReplyChunking = {
|
||||
type FenceSplit = {
|
||||
closeFenceLine: string;
|
||||
reopenFenceLine: string;
|
||||
fence: FenceSpan;
|
||||
};
|
||||
|
||||
type BreakResult = {
|
||||
@@ -28,6 +29,7 @@ function findSafeSentenceBreakIndex(
|
||||
text: string,
|
||||
fenceSpans: FenceSpan[],
|
||||
minChars: number,
|
||||
offset = 0,
|
||||
): number {
|
||||
const matches = text.matchAll(/[.!?](?=\s|$)/g);
|
||||
let sentenceIdx = -1;
|
||||
@@ -37,7 +39,7 @@ function findSafeSentenceBreakIndex(
|
||||
continue;
|
||||
}
|
||||
const candidate = at + 1;
|
||||
if (isSafeFenceBreak(fenceSpans, candidate)) {
|
||||
if (isSafeFenceBreak(fenceSpans, offset + candidate)) {
|
||||
sentenceIdx = candidate;
|
||||
}
|
||||
}
|
||||
@@ -49,8 +51,9 @@ function findSafeParagraphBreakIndex(params: {
|
||||
fenceSpans: FenceSpan[];
|
||||
minChars: number;
|
||||
reverse: boolean;
|
||||
offset?: number;
|
||||
}): number {
|
||||
const { text, fenceSpans, minChars, reverse } = params;
|
||||
const { text, fenceSpans, minChars, reverse, offset = 0 } = params;
|
||||
let paragraphIdx = reverse ? text.lastIndexOf("\n\n") : text.indexOf("\n\n");
|
||||
while (reverse ? paragraphIdx >= minChars : paragraphIdx !== -1) {
|
||||
const candidates = [paragraphIdx, paragraphIdx + 1];
|
||||
@@ -61,7 +64,7 @@ function findSafeParagraphBreakIndex(params: {
|
||||
if (candidate < 0 || candidate >= text.length) {
|
||||
continue;
|
||||
}
|
||||
if (isSafeFenceBreak(fenceSpans, candidate)) {
|
||||
if (isSafeFenceBreak(fenceSpans, offset + candidate)) {
|
||||
return candidate;
|
||||
}
|
||||
}
|
||||
@@ -77,11 +80,12 @@ function findSafeNewlineBreakIndex(params: {
|
||||
fenceSpans: FenceSpan[];
|
||||
minChars: number;
|
||||
reverse: boolean;
|
||||
offset?: number;
|
||||
}): number {
|
||||
const { text, fenceSpans, minChars, reverse } = params;
|
||||
const { text, fenceSpans, minChars, reverse, offset = 0 } = params;
|
||||
let newlineIdx = reverse ? text.lastIndexOf("\n") : text.indexOf("\n");
|
||||
while (reverse ? newlineIdx >= minChars : newlineIdx !== -1) {
|
||||
if (newlineIdx >= minChars && isSafeFenceBreak(fenceSpans, newlineIdx)) {
|
||||
if (newlineIdx >= minChars && isSafeFenceBreak(fenceSpans, offset + newlineIdx)) {
|
||||
return newlineIdx;
|
||||
}
|
||||
newlineIdx = reverse
|
||||
@@ -125,14 +129,7 @@ export class EmbeddedBlockChunker {
|
||||
const minChars = Math.max(1, Math.floor(this.#chunking.minChars));
|
||||
const maxChars = Math.max(minChars, Math.floor(this.#chunking.maxChars));
|
||||
|
||||
// When flushOnParagraph is set (chunkMode="newline"), eagerly split on \n\n
|
||||
// boundaries regardless of minChars so each paragraph is sent immediately.
|
||||
if (this.#chunking.flushOnParagraph && !force) {
|
||||
this.#drainParagraphs(emit, maxChars);
|
||||
return;
|
||||
}
|
||||
|
||||
if (this.#buffer.length < minChars && !force) {
|
||||
if (this.#buffer.length < minChars && !force && !this.#chunking.flushOnParagraph) {
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -144,108 +141,132 @@ export class EmbeddedBlockChunker {
|
||||
return;
|
||||
}
|
||||
|
||||
while (this.#buffer.length >= minChars || (force && this.#buffer.length > 0)) {
|
||||
const source = this.#buffer;
|
||||
const fenceSpans = parseFenceSpans(source);
|
||||
let start = 0;
|
||||
let reopenFence: FenceSpan | undefined;
|
||||
|
||||
while (start < source.length) {
|
||||
const reopenPrefix = reopenFence ? `${reopenFence.openLine}\n` : "";
|
||||
const remainingLength = reopenPrefix.length + (source.length - start);
|
||||
|
||||
if (!force && !this.#chunking.flushOnParagraph && remainingLength < minChars) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (this.#chunking.flushOnParagraph && !force) {
|
||||
const paragraphBreak = findNextParagraphBreak(source, fenceSpans, start);
|
||||
const paragraphLimit = Math.max(1, maxChars - reopenPrefix.length);
|
||||
if (paragraphBreak && paragraphBreak.index - start <= paragraphLimit) {
|
||||
const chunk = `${reopenPrefix}${source.slice(start, paragraphBreak.index)}`;
|
||||
if (chunk.trim().length > 0) {
|
||||
emit(chunk);
|
||||
}
|
||||
start = skipLeadingNewlines(source, paragraphBreak.index + paragraphBreak.length);
|
||||
reopenFence = undefined;
|
||||
continue;
|
||||
}
|
||||
if (remainingLength < maxChars) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
const view = source.slice(start);
|
||||
const breakResult =
|
||||
force && this.#buffer.length <= maxChars
|
||||
? this.#pickSoftBreakIndex(this.#buffer, 1)
|
||||
: this.#pickBreakIndex(this.#buffer, force ? 1 : undefined);
|
||||
force && remainingLength <= maxChars
|
||||
? this.#pickSoftBreakIndex(view, fenceSpans, 1, start)
|
||||
: this.#pickBreakIndex(
|
||||
view,
|
||||
fenceSpans,
|
||||
force || this.#chunking.flushOnParagraph ? 1 : undefined,
|
||||
start,
|
||||
);
|
||||
if (breakResult.index <= 0) {
|
||||
if (force) {
|
||||
emit(this.#buffer);
|
||||
this.#buffer = "";
|
||||
emit(`${reopenPrefix}${source.slice(start)}`);
|
||||
start = source.length;
|
||||
reopenFence = undefined;
|
||||
}
|
||||
return;
|
||||
break;
|
||||
}
|
||||
|
||||
if (!this.#emitBreakResult(breakResult, emit)) {
|
||||
const consumed = this.#emitBreakResult({
|
||||
breakResult,
|
||||
emit,
|
||||
reopenPrefix,
|
||||
source,
|
||||
start,
|
||||
});
|
||||
if (consumed === null) {
|
||||
continue;
|
||||
}
|
||||
start = consumed.start;
|
||||
reopenFence = consumed.reopenFence;
|
||||
|
||||
if (this.#buffer.length < minChars && !force) {
|
||||
return;
|
||||
const nextLength =
|
||||
(reopenFence ? `${reopenFence.openLine}\n`.length : 0) + (source.length - start);
|
||||
if (nextLength < minChars && !force && !this.#chunking.flushOnParagraph) {
|
||||
break;
|
||||
}
|
||||
if (this.#buffer.length < maxChars && !force) {
|
||||
return;
|
||||
if (nextLength < maxChars && !force && !this.#chunking.flushOnParagraph) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
this.#buffer = reopenFence
|
||||
? `${reopenFence.openLine}\n${source.slice(start)}`
|
||||
: stripLeadingNewlines(source.slice(start));
|
||||
}
|
||||
|
||||
/** Eagerly emit complete paragraphs (text before \n\n) regardless of minChars. */
|
||||
#drainParagraphs(emit: (chunk: string) => void, maxChars: number) {
|
||||
while (this.#buffer.length > 0) {
|
||||
const fenceSpans = parseFenceSpans(this.#buffer);
|
||||
const paragraphBreak = findNextParagraphBreak(this.#buffer, fenceSpans);
|
||||
if (!paragraphBreak || paragraphBreak.index > maxChars) {
|
||||
// No paragraph boundary yet (or the next boundary is too far). If the
|
||||
// buffer exceeds maxChars, fall back to normal break logic to avoid
|
||||
// oversized chunks or unbounded accumulation.
|
||||
if (this.#buffer.length >= maxChars) {
|
||||
const breakResult = this.#pickBreakIndex(this.#buffer, 1);
|
||||
if (breakResult.index > 0) {
|
||||
this.#emitBreakResult(breakResult, emit);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
const chunk = this.#buffer.slice(0, paragraphBreak.index);
|
||||
if (chunk.trim().length > 0) {
|
||||
emit(chunk);
|
||||
}
|
||||
this.#buffer = stripLeadingNewlines(
|
||||
this.#buffer.slice(paragraphBreak.index + paragraphBreak.length),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Emits the chunk selected by `breakResult` and reports where the caller's
 * scan over `source` should resume. Does not touch `this.#buffer`; the caller
 * owns the cursor.
 *
 * @param params.breakResult - Break position relative to `start`, plus an
 *   optional fence split when the break falls inside a fenced block.
 * @param params.emit - Sink that receives the finished chunk.
 * @param params.reopenPrefix - Text prepended to the chunk (the reopened fence
 *   line from a previous split, or an empty string).
 * @param params.source - The full text being drained.
 * @param params.start - Offset in `source` where the current chunk begins.
 * @returns `null` when `breakResult.index <= 0`; otherwise the next `start`
 *   offset and, when a fence was split, the fence to reopen in the next chunk.
 */
#emitBreakResult(params: {
  breakResult: BreakResult;
  emit: (chunk: string) => void;
  reopenPrefix: string;
  source: string;
  start: number;
}): { start: number; reopenFence?: FenceSpan } | null {
  const { breakResult, emit, reopenPrefix, source, start } = params;
  const breakIdx = breakResult.index;
  if (breakIdx <= 0) {
    return null;
  }

  // `breakIdx` is relative to `start`; everything below uses source offsets.
  const absoluteBreakIdx = start + breakIdx;
  let rawChunk = `${reopenPrefix}${source.slice(start, absoluteBreakIdx)}`;
  if (rawChunk.trim().length === 0) {
    // Whitespace-only chunk: emit nothing, just advance past it.
    return { start: skipLeadingNewlines(source, absoluteBreakIdx), reopenFence: undefined };
  }

  const fenceSplit = breakResult.fenceSplit;
  if (fenceSplit) {
    // Close the fence inside this chunk so the emitted text stays balanced.
    const closeFence = rawChunk.endsWith("\n")
      ? `${fenceSplit.closeFenceLine}\n`
      : `\n${fenceSplit.closeFenceLine}\n`;
    rawChunk = `${rawChunk}${closeFence}`;
  }

  emit(rawChunk);

  if (fenceSplit) {
    // Resume exactly at the break; the caller reopens the fence via its prefix.
    return { start: absoluteBreakIdx, reopenFence: fenceSplit.fence };
  }

  // Drop the single whitespace character the break landed on, then any
  // newlines that would otherwise lead the next chunk.
  const nextStart =
    absoluteBreakIdx < source.length && /\s/.test(source[absoluteBreakIdx])
      ? absoluteBreakIdx + 1
      : absoluteBreakIdx;
  return { start: skipLeadingNewlines(source, nextStart), reopenFence: undefined };
}
|
||||
|
||||
#pickSoftBreakIndex(buffer: string, minCharsOverride?: number): BreakResult {
|
||||
#pickSoftBreakIndex(
|
||||
buffer: string,
|
||||
fenceSpans: FenceSpan[],
|
||||
minCharsOverride?: number,
|
||||
offset = 0,
|
||||
): BreakResult {
|
||||
const minChars = Math.max(1, Math.floor(minCharsOverride ?? this.#chunking.minChars));
|
||||
if (buffer.length < minChars) {
|
||||
return { index: -1 };
|
||||
}
|
||||
const fenceSpans = parseFenceSpans(buffer);
|
||||
const preference = this.#chunking.breakPreference ?? "paragraph";
|
||||
|
||||
if (preference === "paragraph") {
|
||||
@@ -254,6 +275,7 @@ export class EmbeddedBlockChunker {
|
||||
fenceSpans,
|
||||
minChars,
|
||||
reverse: false,
|
||||
offset,
|
||||
});
|
||||
if (paragraphIdx !== -1) {
|
||||
return { index: paragraphIdx };
|
||||
@@ -266,6 +288,7 @@ export class EmbeddedBlockChunker {
|
||||
fenceSpans,
|
||||
minChars,
|
||||
reverse: false,
|
||||
offset,
|
||||
});
|
||||
if (newlineIdx !== -1) {
|
||||
return { index: newlineIdx };
|
||||
@@ -273,7 +296,7 @@ export class EmbeddedBlockChunker {
|
||||
}
|
||||
|
||||
if (preference !== "newline") {
|
||||
const sentenceIdx = findSafeSentenceBreakIndex(buffer, fenceSpans, minChars);
|
||||
const sentenceIdx = findSafeSentenceBreakIndex(buffer, fenceSpans, minChars, offset);
|
||||
if (sentenceIdx !== -1) {
|
||||
return { index: sentenceIdx };
|
||||
}
|
||||
@@ -282,14 +305,18 @@ export class EmbeddedBlockChunker {
|
||||
return { index: -1 };
|
||||
}
|
||||
|
||||
#pickBreakIndex(buffer: string, minCharsOverride?: number): BreakResult {
|
||||
#pickBreakIndex(
|
||||
buffer: string,
|
||||
fenceSpans: FenceSpan[],
|
||||
minCharsOverride?: number,
|
||||
offset = 0,
|
||||
): BreakResult {
|
||||
const minChars = Math.max(1, Math.floor(minCharsOverride ?? this.#chunking.minChars));
|
||||
const maxChars = Math.max(minChars, Math.floor(this.#chunking.maxChars));
|
||||
if (buffer.length < minChars) {
|
||||
return { index: -1 };
|
||||
}
|
||||
const window = buffer.slice(0, Math.min(maxChars, buffer.length));
|
||||
const fenceSpans = parseFenceSpans(buffer);
|
||||
|
||||
const preference = this.#chunking.breakPreference ?? "paragraph";
|
||||
if (preference === "paragraph") {
|
||||
@@ -298,6 +325,7 @@ export class EmbeddedBlockChunker {
|
||||
fenceSpans,
|
||||
minChars,
|
||||
reverse: true,
|
||||
offset,
|
||||
});
|
||||
if (paragraphIdx !== -1) {
|
||||
return { index: paragraphIdx };
|
||||
@@ -310,6 +338,7 @@ export class EmbeddedBlockChunker {
|
||||
fenceSpans,
|
||||
minChars,
|
||||
reverse: true,
|
||||
offset,
|
||||
});
|
||||
if (newlineIdx !== -1) {
|
||||
return { index: newlineIdx };
|
||||
@@ -317,7 +346,7 @@ export class EmbeddedBlockChunker {
|
||||
}
|
||||
|
||||
if (preference !== "newline") {
|
||||
const sentenceIdx = findSafeSentenceBreakIndex(window, fenceSpans, minChars);
|
||||
const sentenceIdx = findSafeSentenceBreakIndex(window, fenceSpans, minChars, offset);
|
||||
if (sentenceIdx !== -1) {
|
||||
return { index: sentenceIdx };
|
||||
}
|
||||
@@ -328,22 +357,23 @@ export class EmbeddedBlockChunker {
|
||||
}
|
||||
|
||||
for (let i = window.length - 1; i >= minChars; i--) {
|
||||
if (/\s/.test(window[i]) && isSafeFenceBreak(fenceSpans, i)) {
|
||||
if (/\s/.test(window[i]) && isSafeFenceBreak(fenceSpans, offset + i)) {
|
||||
return { index: i };
|
||||
}
|
||||
}
|
||||
|
||||
if (buffer.length >= maxChars) {
|
||||
if (isSafeFenceBreak(fenceSpans, maxChars)) {
|
||||
if (isSafeFenceBreak(fenceSpans, offset + maxChars)) {
|
||||
return { index: maxChars };
|
||||
}
|
||||
const fence = findFenceSpanAt(fenceSpans, maxChars);
|
||||
const fence = findFenceSpanAt(fenceSpans, offset + maxChars);
|
||||
if (fence) {
|
||||
return {
|
||||
index: maxChars,
|
||||
fenceSplit: {
|
||||
closeFenceLine: `${fence.indent}${fence.marker}`,
|
||||
reopenFenceLine: fence.openLine,
|
||||
fence,
|
||||
},
|
||||
};
|
||||
}
|
||||
@@ -354,12 +384,17 @@ export class EmbeddedBlockChunker {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Returns the index of the first character at or after `start` that is not
 * `"\n"`, or `value.length` if only newlines remain.
 *
 * Working with an index lets callers advance a cursor without slicing.
 *
 * @param value - Text to scan.
 * @param start - Offset to begin scanning from (defaults to 0).
 */
function skipLeadingNewlines(value: string, start = 0): number {
  let i = start;
  while (i < value.length && value[i] === "\n") {
    i++;
  }
  return i;
}

/**
 * Returns `value` without its leading `"\n"` characters. The original string
 * is returned as-is when there is nothing to strip.
 */
function stripLeadingNewlines(value: string): string {
  const start = skipLeadingNewlines(value);
  return start > 0 ? value.slice(start) : value;
}
|
||||
|
||||
function findNextParagraphBreak(
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { describe, expect, it, vi } from "vitest";
|
||||
import * as fences from "../markdown/fences.js";
|
||||
import { hasBalancedFences } from "../test-utils/chunk-test-helpers.js";
|
||||
import {
|
||||
chunkByNewline,
|
||||
@@ -217,6 +218,17 @@ describe("chunkMarkdownText", () => {
|
||||
expect(chunks[0]?.length).toBe(20);
|
||||
expect(chunks.join("")).toBe(text);
|
||||
});
|
||||
|
||||
// Perf regression guard: splitting one long fenced payload into many chunks
// must parse the fence spans a single time rather than once per chunk.
it("parses fence spans once for long fenced payloads", () => {
  const parseSpy = vi.spyOn(fences, "parseFenceSpans");
  try {
    const text = `\`\`\`txt\n${"line\n".repeat(600)}\`\`\``;

    const chunks = chunkMarkdownText(text, 80);

    expect(chunks.length).toBeGreaterThan(2);
    expect(parseSpy).toHaveBeenCalledTimes(1);
  } finally {
    // Restore even when an expectation throws so the spy cannot leak into
    // later tests in this file.
    parseSpy.mockRestore();
  }
});
|
||||
});
|
||||
|
||||
describe("chunkByNewline", () => {
|
||||
|
||||
@@ -306,7 +306,7 @@ export function chunkText(text: string, limit: number): string[] {
|
||||
}
|
||||
return chunkTextByBreakResolver(text, limit, (window) => {
|
||||
// 1) Prefer a newline break inside the window (outside parentheses).
|
||||
const { lastNewline, lastWhitespace } = scanParenAwareBreakpoints(window);
|
||||
const { lastNewline, lastWhitespace } = scanParenAwareBreakpoints(window, 0, window.length);
|
||||
// 2) Otherwise prefer the last whitespace (word boundary) inside the window.
|
||||
return lastNewline > 0 ? lastNewline : lastWhitespace;
|
||||
});
|
||||
@@ -319,14 +319,24 @@ export function chunkMarkdownText(text: string, limit: number): string[] {
|
||||
}
|
||||
|
||||
const chunks: string[] = [];
|
||||
let remaining = text;
|
||||
const spans = parseFenceSpans(text);
|
||||
let start = 0;
|
||||
let reopenFence: ReturnType<typeof findFenceSpanAt> | undefined;
|
||||
|
||||
while (remaining.length > limit) {
|
||||
const spans = parseFenceSpans(remaining);
|
||||
const window = remaining.slice(0, limit);
|
||||
while (start < text.length) {
|
||||
const reopenPrefix = reopenFence ? `${reopenFence.openLine}\n` : "";
|
||||
const contentLimit = Math.max(1, limit - reopenPrefix.length);
|
||||
if (text.length - start <= contentLimit) {
|
||||
const finalChunk = `${reopenPrefix}${text.slice(start)}`;
|
||||
if (finalChunk.length > 0) {
|
||||
chunks.push(finalChunk);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
const softBreak = pickSafeBreakIndex(window, spans);
|
||||
let breakIdx = softBreak > 0 ? softBreak : limit;
|
||||
const windowEnd = Math.min(text.length, start + contentLimit);
|
||||
const softBreak = pickSafeBreakIndex(text, start, windowEnd, spans);
|
||||
let breakIdx = softBreak > start ? softBreak : windowEnd;
|
||||
|
||||
const initialFence = isSafeFenceBreak(spans, breakIdx)
|
||||
? undefined
|
||||
@@ -335,38 +345,38 @@ export function chunkMarkdownText(text: string, limit: number): string[] {
|
||||
let fenceToSplit = initialFence;
|
||||
if (initialFence) {
|
||||
const closeLine = `${initialFence.indent}${initialFence.marker}`;
|
||||
const maxIdxIfNeedNewline = limit - (closeLine.length + 1);
|
||||
const maxIdxIfNeedNewline = start + (contentLimit - (closeLine.length + 1));
|
||||
|
||||
if (maxIdxIfNeedNewline <= 0) {
|
||||
if (maxIdxIfNeedNewline <= start) {
|
||||
fenceToSplit = undefined;
|
||||
breakIdx = limit;
|
||||
breakIdx = windowEnd;
|
||||
} else {
|
||||
const minProgressIdx = Math.min(
|
||||
remaining.length,
|
||||
initialFence.start + initialFence.openLine.length + 2,
|
||||
text.length,
|
||||
Math.max(start + 1, initialFence.start + initialFence.openLine.length + 2),
|
||||
);
|
||||
const maxIdxIfAlreadyNewline = limit - closeLine.length;
|
||||
const maxIdxIfAlreadyNewline = start + (contentLimit - closeLine.length);
|
||||
|
||||
let pickedNewline = false;
|
||||
let lastNewline = remaining.lastIndexOf("\n", Math.max(0, maxIdxIfAlreadyNewline - 1));
|
||||
while (lastNewline !== -1) {
|
||||
let lastNewline = text.lastIndexOf("\n", Math.max(start, maxIdxIfAlreadyNewline - 1));
|
||||
while (lastNewline >= start) {
|
||||
const candidateBreak = lastNewline + 1;
|
||||
if (candidateBreak < minProgressIdx) {
|
||||
break;
|
||||
}
|
||||
const candidateFence = findFenceSpanAt(spans, candidateBreak);
|
||||
if (candidateFence && candidateFence.start === initialFence.start) {
|
||||
breakIdx = Math.max(1, candidateBreak);
|
||||
breakIdx = candidateBreak;
|
||||
pickedNewline = true;
|
||||
break;
|
||||
}
|
||||
lastNewline = remaining.lastIndexOf("\n", lastNewline - 1);
|
||||
lastNewline = text.lastIndexOf("\n", lastNewline - 1);
|
||||
}
|
||||
|
||||
if (!pickedNewline) {
|
||||
if (minProgressIdx > maxIdxIfAlreadyNewline) {
|
||||
fenceToSplit = undefined;
|
||||
breakIdx = limit;
|
||||
breakIdx = windowEnd;
|
||||
} else {
|
||||
breakIdx = Math.max(minProgressIdx, maxIdxIfNeedNewline);
|
||||
}
|
||||
@@ -378,68 +388,72 @@ export function chunkMarkdownText(text: string, limit: number): string[] {
|
||||
fenceAtBreak && fenceAtBreak.start === initialFence.start ? fenceAtBreak : undefined;
|
||||
}
|
||||
|
||||
let rawChunk = remaining.slice(0, breakIdx);
|
||||
if (!rawChunk) {
|
||||
const rawContent = text.slice(start, breakIdx);
|
||||
if (!rawContent) {
|
||||
break;
|
||||
}
|
||||
|
||||
const brokeOnSeparator = breakIdx < remaining.length && /\s/.test(remaining[breakIdx]);
|
||||
const nextStart = Math.min(remaining.length, breakIdx + (brokeOnSeparator ? 1 : 0));
|
||||
let next = remaining.slice(nextStart);
|
||||
let rawChunk = `${reopenPrefix}${rawContent}`;
|
||||
const brokeOnSeparator = breakIdx < text.length && /\s/.test(text[breakIdx]);
|
||||
let nextStart = Math.min(text.length, breakIdx + (brokeOnSeparator ? 1 : 0));
|
||||
|
||||
if (fenceToSplit) {
|
||||
const closeLine = `${fenceToSplit.indent}${fenceToSplit.marker}`;
|
||||
rawChunk = rawChunk.endsWith("\n") ? `${rawChunk}${closeLine}` : `${rawChunk}\n${closeLine}`;
|
||||
next = `${fenceToSplit.openLine}\n${next}`;
|
||||
reopenFence = fenceToSplit;
|
||||
} else {
|
||||
next = stripLeadingNewlines(next);
|
||||
nextStart = skipLeadingNewlines(text, nextStart);
|
||||
reopenFence = undefined;
|
||||
}
|
||||
|
||||
chunks.push(rawChunk);
|
||||
remaining = next;
|
||||
}
|
||||
|
||||
if (remaining.length) {
|
||||
chunks.push(remaining);
|
||||
start = nextStart;
|
||||
}
|
||||
return chunks;
|
||||
}
|
||||
|
||||
/**
 * Returns the index of the first character at or after `start` that is not
 * `"\n"`, or `value.length` if only newlines remain.
 *
 * @param value - Text to scan.
 * @param start - Offset to begin scanning from (defaults to 0).
 */
function skipLeadingNewlines(value: string, start = 0): number {
  let i = start;
  while (i < value.length && value[i] === "\n") {
    i++;
  }
  return i;
}
|
||||
|
||||
/**
 * Picks a soft break position inside `text[start, end)` that does not fall
 * within a fence span.
 *
 * Indices are absolute offsets into `text`, so `spans` parsed once for the
 * whole text can be reused for every window.
 *
 * @param text - The full text being chunked.
 * @param start - Inclusive start of the window to scan.
 * @param end - Exclusive end of the window to scan.
 * @param spans - Fence spans of `text`, as produced by `parseFenceSpans`.
 * @returns The last newline position found past `start`, else the last
 *   whitespace position past `start`, else `-1`.
 */
function pickSafeBreakIndex(
  text: string,
  start: number,
  end: number,
  spans: ReturnType<typeof parseFenceSpans>,
): number {
  // The scanner skips every index the fence check rejects.
  const { lastNewline, lastWhitespace } = scanParenAwareBreakpoints(text, start, end, (index) =>
    isSafeFenceBreak(spans, index),
  );

  // A break at `start` itself would make no progress, hence the strict `>`.
  if (lastNewline > start) {
    return lastNewline;
  }
  if (lastWhitespace > start) {
    return lastWhitespace;
  }
  return -1;
}
|
||||
|
||||
function scanParenAwareBreakpoints(
|
||||
window: string,
|
||||
text: string,
|
||||
start: number,
|
||||
end: number,
|
||||
isAllowed: (index: number) => boolean = () => true,
|
||||
): { lastNewline: number; lastWhitespace: number } {
|
||||
let lastNewline = -1;
|
||||
let lastWhitespace = -1;
|
||||
let depth = 0;
|
||||
|
||||
for (let i = 0; i < window.length; i++) {
|
||||
for (let i = start; i < end; i++) {
|
||||
if (!isAllowed(i)) {
|
||||
continue;
|
||||
}
|
||||
const char = window[i];
|
||||
const char = text[i];
|
||||
if (char === "(") {
|
||||
depth += 1;
|
||||
continue;
|
||||
|
||||
@@ -73,7 +73,27 @@ export function parseFenceSpans(buffer: string): FenceSpan[] {
|
||||
}
|
||||
|
||||
/**
 * Finds the fence span that strictly contains `index`
 * (`span.start < index < span.end`).
 *
 * Uses binary search, so it assumes `spans` is ordered by `start` and the
 * spans do not overlap — NOTE(review): confirm `parseFenceSpans` guarantees
 * this ordering.
 *
 * @param spans - Candidate fence spans.
 * @param index - Offset to look up.
 * @returns The containing span, or `undefined` when `index` is outside every
 *   span or sits exactly on a span boundary.
 */
export function findFenceSpanAt(spans: FenceSpan[], index: number): FenceSpan | undefined {
  let low = 0;
  let high = spans.length - 1;

  while (low <= high) {
    const mid = Math.floor((low + high) / 2);
    const span = spans[mid];
    if (!span) {
      // Defensive: a hole in the array ends the search.
      break;
    }
    if (index <= span.start) {
      high = mid - 1;
      continue;
    }
    if (index >= span.end) {
      low = mid + 1;
      continue;
    }
    return span;
  }

  return undefined;
}
|
||||
|
||||
export function isSafeFenceBreak(spans: FenceSpan[], index: number): boolean {
|
||||
|
||||
Reference in New Issue
Block a user