perf: harden chunking against quadratic scans

This commit is contained in:
Peter Steinberger
2026-03-07 16:50:15 +00:00
parent b393b9e8ff
commit 74912037dc
7 changed files with 324 additions and 139 deletions

View File

@@ -0,0 +1,90 @@
import { describe, expect, it, vi } from "vitest";
import { BATCH_SIZE, insertBlocksInBatches } from "./docx-batch-insert.js";
/**
 * Wraps an array in an iterable that records how many times iteration has
 * been started. Lets a test verify that a consumer scans the input exactly
 * once instead of re-iterating it per item.
 */
function createCountingIterable<T>(values: T[]) {
  let iterationCount = 0;

  const countingValues = {
    *[Symbol.iterator]() {
      iterationCount += 1;
      for (const value of values) {
        yield value;
      }
    },
  };

  return {
    values: countingValues,
    getIterations: () => iterationCount,
  };
}
describe("insertBlocksInBatches", () => {
// Regression test: the source-block map must be built once up front, not
// rebuilt for every first-level block id (which would make insertion
// quadratic in the number of blocks).
it("builds the source block map once for large flat trees", async () => {
const blockCount = BATCH_SIZE + 200;
const blocks = Array.from({ length: blockCount }, (_, index) => ({
block_id: `block_${index}`,
block_type: 2,
}));
const counting = createCountingIterable(blocks);
// Mock the SDK endpoint: echo the requested children ids back as blocks.
const createMock = vi.fn(async ({ data }: { data: { children_id: string[] } }) => ({
code: 0,
data: {
children: data.children_id.map((id) => ({ block_id: id })),
},
}));
const client = {
docx: {
documentBlockDescendant: {
create: createMock,
},
},
} as any;
const result = await insertBlocksInBatches(
client,
"doc_1",
counting.values as any[],
blocks.map((block) => block.block_id),
);
// Exactly one scan of the input proves the block map is not rebuilt per root.
expect(counting.getIterations()).toBe(1);
// BATCH_SIZE + 200 flat blocks must split into exactly two API calls.
expect(createMock).toHaveBeenCalledTimes(2);
expect(createMock.mock.calls[0]?.[0]?.data.children_id).toHaveLength(BATCH_SIZE);
expect(createMock.mock.calls[1]?.[0]?.data.children_id).toHaveLength(200);
expect(result.children).toHaveLength(blockCount);
});
// Batching must never separate a nested child from its root block, and the
// descendants payload must list each root followed by its own children.
it("keeps nested descendants grouped with their root blocks", async () => {
const createMock = vi.fn(
async ({
data,
}: {
data: { children_id: string[]; descendants: Array<{ block_id: string }> };
}) => ({
code: 0,
data: {
children: data.children_id.map((id) => ({ block_id: id })),
},
}),
);
const client = {
docx: {
documentBlockDescendant: {
create: createMock,
},
},
} as any;
const blocks = [
{ block_id: "root_a", block_type: 1, children: ["child_a"] },
{ block_id: "child_a", block_type: 2 },
{ block_id: "root_b", block_type: 1, children: ["child_b"] },
{ block_id: "child_b", block_type: 2 },
];
await insertBlocksInBatches(client, "doc_1", blocks as any[], ["root_a", "root_b"]);
// Both roots and their children fit comfortably in a single call.
expect(createMock).toHaveBeenCalledTimes(1);
expect(createMock.mock.calls[0]?.[0]?.data.children_id).toEqual(["root_a", "root_b"]);
expect(
createMock.mock.calls[0]?.[0]?.data.descendants.map(
(block: { block_id: string }) => block.block_id,
),
).toEqual(["root_a", "child_a", "root_b", "child_b"]);
});
});

View File

@@ -14,16 +14,11 @@ export const BATCH_SIZE = 1000; // Feishu API limit per request
type Logger = { info?: (msg: string) => void };
/**
* Collect all descendant blocks for a given set of first-level block IDs.
* Collect all descendant blocks for a given first-level block ID.
* Recursively traverses the block tree to gather all children.
*/
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- SDK block types
function collectDescendants(blocks: any[], firstLevelIds: string[]): any[] {
const blockMap = new Map<string, any>();
for (const block of blocks) {
blockMap.set(block.block_id, block);
}
function collectDescendants(blockMap: Map<string, any>, rootId: string): any[] {
const result: any[] = [];
const visited = new Set<string>();
@@ -47,9 +42,7 @@ function collectDescendants(blocks: any[], firstLevelIds: string[]): any[] {
}
}
for (const id of firstLevelIds) {
collect(id);
}
collect(rootId);
return result;
}
@@ -123,9 +116,13 @@ export async function insertBlocksInBatches(
const batches: { firstLevelIds: string[]; blocks: any[] }[] = [];
let currentBatch: { firstLevelIds: string[]; blocks: any[] } = { firstLevelIds: [], blocks: [] };
const usedBlockIds = new Set<string>();
const blockMap = new Map<string, any>();
for (const block of blocks) {
blockMap.set(block.block_id, block);
}
for (const firstLevelId of firstLevelBlockIds) {
const descendants = collectDescendants(blocks, [firstLevelId]);
const descendants = collectDescendants(blockMap, firstLevelId);
const newBlocks = descendants.filter((b) => !usedBlockIds.has(b.block_id));
// A single block whose subtree exceeds the API limit cannot be split

View File

@@ -1,4 +1,5 @@
import { describe, expect, it } from "vitest";
import { describe, expect, it, vi } from "vitest";
import * as fences from "../markdown/fences.js";
import { EmbeddedBlockChunker } from "./pi-embedded-block-chunker.js";
function createFlushOnParagraphChunker(params: { minChars: number; maxChars: number }) {
@@ -120,4 +121,20 @@ describe("EmbeddedBlockChunker", () => {
expect(chunks).toEqual(["Intro\n```js\nconst a = 1;\n\nconst b = 2;\n```"]);
expect(chunker.bufferedText).toBe("After fence");
});
// Guards against re-parsing fence spans on every loop iteration while
// draining a long fenced buffer: parseFenceSpans must run once per drain
// even when the buffer produces several chunks.
it("parses fence spans once per drain call for long fenced buffers", () => {
const parseSpy = vi.spyOn(fences, "parseFenceSpans");
const chunker = new EmbeddedBlockChunker({
minChars: 20,
maxChars: 80,
breakPreference: "paragraph",
});
// ~3000 chars of fenced content, well above maxChars, forces multiple emits.
chunker.append(`\`\`\`txt\n${"line\n".repeat(600)}\`\`\``);
const chunks = drainChunks(chunker);
expect(chunks.length).toBeGreaterThan(2);
expect(parseSpy).toHaveBeenCalledTimes(1);
parseSpy.mockRestore();
});
});

View File

@@ -12,6 +12,7 @@ export type BlockReplyChunking = {
type FenceSplit = {
closeFenceLine: string;
reopenFenceLine: string;
fence: FenceSpan;
};
type BreakResult = {
@@ -28,6 +29,7 @@ function findSafeSentenceBreakIndex(
text: string,
fenceSpans: FenceSpan[],
minChars: number,
offset = 0,
): number {
const matches = text.matchAll(/[.!?](?=\s|$)/g);
let sentenceIdx = -1;
@@ -37,7 +39,7 @@ function findSafeSentenceBreakIndex(
continue;
}
const candidate = at + 1;
if (isSafeFenceBreak(fenceSpans, candidate)) {
if (isSafeFenceBreak(fenceSpans, offset + candidate)) {
sentenceIdx = candidate;
}
}
@@ -49,8 +51,9 @@ function findSafeParagraphBreakIndex(params: {
fenceSpans: FenceSpan[];
minChars: number;
reverse: boolean;
offset?: number;
}): number {
const { text, fenceSpans, minChars, reverse } = params;
const { text, fenceSpans, minChars, reverse, offset = 0 } = params;
let paragraphIdx = reverse ? text.lastIndexOf("\n\n") : text.indexOf("\n\n");
while (reverse ? paragraphIdx >= minChars : paragraphIdx !== -1) {
const candidates = [paragraphIdx, paragraphIdx + 1];
@@ -61,7 +64,7 @@ function findSafeParagraphBreakIndex(params: {
if (candidate < 0 || candidate >= text.length) {
continue;
}
if (isSafeFenceBreak(fenceSpans, candidate)) {
if (isSafeFenceBreak(fenceSpans, offset + candidate)) {
return candidate;
}
}
@@ -77,11 +80,12 @@ function findSafeNewlineBreakIndex(params: {
fenceSpans: FenceSpan[];
minChars: number;
reverse: boolean;
offset?: number;
}): number {
const { text, fenceSpans, minChars, reverse } = params;
const { text, fenceSpans, minChars, reverse, offset = 0 } = params;
let newlineIdx = reverse ? text.lastIndexOf("\n") : text.indexOf("\n");
while (reverse ? newlineIdx >= minChars : newlineIdx !== -1) {
if (newlineIdx >= minChars && isSafeFenceBreak(fenceSpans, newlineIdx)) {
if (newlineIdx >= minChars && isSafeFenceBreak(fenceSpans, offset + newlineIdx)) {
return newlineIdx;
}
newlineIdx = reverse
@@ -125,14 +129,7 @@ export class EmbeddedBlockChunker {
const minChars = Math.max(1, Math.floor(this.#chunking.minChars));
const maxChars = Math.max(minChars, Math.floor(this.#chunking.maxChars));
// When flushOnParagraph is set (chunkMode="newline"), eagerly split on \n\n
// boundaries regardless of minChars so each paragraph is sent immediately.
if (this.#chunking.flushOnParagraph && !force) {
this.#drainParagraphs(emit, maxChars);
return;
}
if (this.#buffer.length < minChars && !force) {
if (this.#buffer.length < minChars && !force && !this.#chunking.flushOnParagraph) {
return;
}
@@ -144,108 +141,132 @@ export class EmbeddedBlockChunker {
return;
}
while (this.#buffer.length >= minChars || (force && this.#buffer.length > 0)) {
const source = this.#buffer;
const fenceSpans = parseFenceSpans(source);
let start = 0;
let reopenFence: FenceSpan | undefined;
while (start < source.length) {
const reopenPrefix = reopenFence ? `${reopenFence.openLine}\n` : "";
const remainingLength = reopenPrefix.length + (source.length - start);
if (!force && !this.#chunking.flushOnParagraph && remainingLength < minChars) {
break;
}
if (this.#chunking.flushOnParagraph && !force) {
const paragraphBreak = findNextParagraphBreak(source, fenceSpans, start);
const paragraphLimit = Math.max(1, maxChars - reopenPrefix.length);
if (paragraphBreak && paragraphBreak.index - start <= paragraphLimit) {
const chunk = `${reopenPrefix}${source.slice(start, paragraphBreak.index)}`;
if (chunk.trim().length > 0) {
emit(chunk);
}
start = skipLeadingNewlines(source, paragraphBreak.index + paragraphBreak.length);
reopenFence = undefined;
continue;
}
if (remainingLength < maxChars) {
break;
}
}
const view = source.slice(start);
const breakResult =
force && this.#buffer.length <= maxChars
? this.#pickSoftBreakIndex(this.#buffer, 1)
: this.#pickBreakIndex(this.#buffer, force ? 1 : undefined);
force && remainingLength <= maxChars
? this.#pickSoftBreakIndex(view, fenceSpans, 1, start)
: this.#pickBreakIndex(
view,
fenceSpans,
force || this.#chunking.flushOnParagraph ? 1 : undefined,
start,
);
if (breakResult.index <= 0) {
if (force) {
emit(this.#buffer);
this.#buffer = "";
emit(`${reopenPrefix}${source.slice(start)}`);
start = source.length;
reopenFence = undefined;
}
return;
break;
}
if (!this.#emitBreakResult(breakResult, emit)) {
const consumed = this.#emitBreakResult({
breakResult,
emit,
reopenPrefix,
source,
start,
});
if (consumed === null) {
continue;
}
start = consumed.start;
reopenFence = consumed.reopenFence;
if (this.#buffer.length < minChars && !force) {
return;
const nextLength =
(reopenFence ? `${reopenFence.openLine}\n`.length : 0) + (source.length - start);
if (nextLength < minChars && !force && !this.#chunking.flushOnParagraph) {
break;
}
if (this.#buffer.length < maxChars && !force) {
return;
if (nextLength < maxChars && !force && !this.#chunking.flushOnParagraph) {
break;
}
}
this.#buffer = reopenFence
? `${reopenFence.openLine}\n${source.slice(start)}`
: stripLeadingNewlines(source.slice(start));
}
/** Eagerly emit complete paragraphs (text before \n\n) regardless of minChars. */
#drainParagraphs(emit: (chunk: string) => void, maxChars: number) {
while (this.#buffer.length > 0) {
const fenceSpans = parseFenceSpans(this.#buffer);
const paragraphBreak = findNextParagraphBreak(this.#buffer, fenceSpans);
if (!paragraphBreak || paragraphBreak.index > maxChars) {
// No paragraph boundary yet (or the next boundary is too far). If the
// buffer exceeds maxChars, fall back to normal break logic to avoid
// oversized chunks or unbounded accumulation.
if (this.#buffer.length >= maxChars) {
const breakResult = this.#pickBreakIndex(this.#buffer, 1);
if (breakResult.index > 0) {
this.#emitBreakResult(breakResult, emit);
continue;
}
}
return;
}
const chunk = this.#buffer.slice(0, paragraphBreak.index);
if (chunk.trim().length > 0) {
emit(chunk);
}
this.#buffer = stripLeadingNewlines(
this.#buffer.slice(paragraphBreak.index + paragraphBreak.length),
);
}
}
#emitBreakResult(breakResult: BreakResult, emit: (chunk: string) => void): boolean {
#emitBreakResult(params: {
breakResult: BreakResult;
emit: (chunk: string) => void;
reopenPrefix: string;
source: string;
start: number;
}): { start: number; reopenFence?: FenceSpan } | null {
const { breakResult, emit, reopenPrefix, source, start } = params;
const breakIdx = breakResult.index;
if (breakIdx <= 0) {
return false;
return null;
}
let rawChunk = this.#buffer.slice(0, breakIdx);
const absoluteBreakIdx = start + breakIdx;
let rawChunk = `${reopenPrefix}${source.slice(start, absoluteBreakIdx)}`;
if (rawChunk.trim().length === 0) {
this.#buffer = stripLeadingNewlines(this.#buffer.slice(breakIdx)).trimStart();
return false;
return { start: skipLeadingNewlines(source, absoluteBreakIdx), reopenFence: undefined };
}
let nextBuffer = this.#buffer.slice(breakIdx);
const fenceSplit = breakResult.fenceSplit;
if (fenceSplit) {
const closeFence = rawChunk.endsWith("\n")
? `${fenceSplit.closeFenceLine}\n`
: `\n${fenceSplit.closeFenceLine}\n`;
rawChunk = `${rawChunk}${closeFence}`;
const reopenFence = fenceSplit.reopenFenceLine.endsWith("\n")
? fenceSplit.reopenFenceLine
: `${fenceSplit.reopenFenceLine}\n`;
nextBuffer = `${reopenFence}${nextBuffer}`;
}
emit(rawChunk);
if (fenceSplit) {
this.#buffer = nextBuffer;
} else {
const nextStart =
breakIdx < this.#buffer.length && /\s/.test(this.#buffer[breakIdx])
? breakIdx + 1
: breakIdx;
this.#buffer = stripLeadingNewlines(this.#buffer.slice(nextStart));
return { start: absoluteBreakIdx, reopenFence: fenceSplit.fence };
}
return true;
const nextStart =
absoluteBreakIdx < source.length && /\s/.test(source[absoluteBreakIdx])
? absoluteBreakIdx + 1
: absoluteBreakIdx;
return { start: skipLeadingNewlines(source, nextStart), reopenFence: undefined };
}
#pickSoftBreakIndex(buffer: string, minCharsOverride?: number): BreakResult {
#pickSoftBreakIndex(
buffer: string,
fenceSpans: FenceSpan[],
minCharsOverride?: number,
offset = 0,
): BreakResult {
const minChars = Math.max(1, Math.floor(minCharsOverride ?? this.#chunking.minChars));
if (buffer.length < minChars) {
return { index: -1 };
}
const fenceSpans = parseFenceSpans(buffer);
const preference = this.#chunking.breakPreference ?? "paragraph";
if (preference === "paragraph") {
@@ -254,6 +275,7 @@ export class EmbeddedBlockChunker {
fenceSpans,
minChars,
reverse: false,
offset,
});
if (paragraphIdx !== -1) {
return { index: paragraphIdx };
@@ -266,6 +288,7 @@ export class EmbeddedBlockChunker {
fenceSpans,
minChars,
reverse: false,
offset,
});
if (newlineIdx !== -1) {
return { index: newlineIdx };
@@ -273,7 +296,7 @@ export class EmbeddedBlockChunker {
}
if (preference !== "newline") {
const sentenceIdx = findSafeSentenceBreakIndex(buffer, fenceSpans, minChars);
const sentenceIdx = findSafeSentenceBreakIndex(buffer, fenceSpans, minChars, offset);
if (sentenceIdx !== -1) {
return { index: sentenceIdx };
}
@@ -282,14 +305,18 @@ export class EmbeddedBlockChunker {
return { index: -1 };
}
#pickBreakIndex(buffer: string, minCharsOverride?: number): BreakResult {
#pickBreakIndex(
buffer: string,
fenceSpans: FenceSpan[],
minCharsOverride?: number,
offset = 0,
): BreakResult {
const minChars = Math.max(1, Math.floor(minCharsOverride ?? this.#chunking.minChars));
const maxChars = Math.max(minChars, Math.floor(this.#chunking.maxChars));
if (buffer.length < minChars) {
return { index: -1 };
}
const window = buffer.slice(0, Math.min(maxChars, buffer.length));
const fenceSpans = parseFenceSpans(buffer);
const preference = this.#chunking.breakPreference ?? "paragraph";
if (preference === "paragraph") {
@@ -298,6 +325,7 @@ export class EmbeddedBlockChunker {
fenceSpans,
minChars,
reverse: true,
offset,
});
if (paragraphIdx !== -1) {
return { index: paragraphIdx };
@@ -310,6 +338,7 @@ export class EmbeddedBlockChunker {
fenceSpans,
minChars,
reverse: true,
offset,
});
if (newlineIdx !== -1) {
return { index: newlineIdx };
@@ -317,7 +346,7 @@ export class EmbeddedBlockChunker {
}
if (preference !== "newline") {
const sentenceIdx = findSafeSentenceBreakIndex(window, fenceSpans, minChars);
const sentenceIdx = findSafeSentenceBreakIndex(window, fenceSpans, minChars, offset);
if (sentenceIdx !== -1) {
return { index: sentenceIdx };
}
@@ -328,22 +357,23 @@ export class EmbeddedBlockChunker {
}
for (let i = window.length - 1; i >= minChars; i--) {
if (/\s/.test(window[i]) && isSafeFenceBreak(fenceSpans, i)) {
if (/\s/.test(window[i]) && isSafeFenceBreak(fenceSpans, offset + i)) {
return { index: i };
}
}
if (buffer.length >= maxChars) {
if (isSafeFenceBreak(fenceSpans, maxChars)) {
if (isSafeFenceBreak(fenceSpans, offset + maxChars)) {
return { index: maxChars };
}
const fence = findFenceSpanAt(fenceSpans, maxChars);
const fence = findFenceSpanAt(fenceSpans, offset + maxChars);
if (fence) {
return {
index: maxChars,
fenceSplit: {
closeFenceLine: `${fence.indent}${fence.marker}`,
reopenFenceLine: fence.openLine,
fence,
},
};
}
@@ -354,12 +384,17 @@ export class EmbeddedBlockChunker {
}
}
function stripLeadingNewlines(value: string): string {
let i = 0;
function skipLeadingNewlines(value: string, start = 0): number {
let i = start;
while (i < value.length && value[i] === "\n") {
i++;
}
return i > 0 ? value.slice(i) : value;
return i;
}
function stripLeadingNewlines(value: string): string {
const start = skipLeadingNewlines(value);
return start > 0 ? value.slice(start) : value;
}
function findNextParagraphBreak(

View File

@@ -1,4 +1,5 @@
import { describe, expect, it } from "vitest";
import { describe, expect, it, vi } from "vitest";
import * as fences from "../markdown/fences.js";
import { hasBalancedFences } from "../test-utils/chunk-test-helpers.js";
import {
chunkByNewline,
@@ -217,6 +218,17 @@ describe("chunkMarkdownText", () => {
expect(chunks[0]?.length).toBe(20);
expect(chunks.join("")).toBe(text);
});
// Guards against re-parsing fence spans per emitted chunk: a long fenced
// payload split into several chunks must still call parseFenceSpans once.
it("parses fence spans once for long fenced payloads", () => {
const parseSpy = vi.spyOn(fences, "parseFenceSpans");
const text = `\`\`\`txt\n${"line\n".repeat(600)}\`\`\``;
const chunks = chunkMarkdownText(text, 80);
expect(chunks.length).toBeGreaterThan(2);
expect(parseSpy).toHaveBeenCalledTimes(1);
parseSpy.mockRestore();
});
});
describe("chunkByNewline", () => {

View File

@@ -306,7 +306,7 @@ export function chunkText(text: string, limit: number): string[] {
}
return chunkTextByBreakResolver(text, limit, (window) => {
// 1) Prefer a newline break inside the window (outside parentheses).
const { lastNewline, lastWhitespace } = scanParenAwareBreakpoints(window);
const { lastNewline, lastWhitespace } = scanParenAwareBreakpoints(window, 0, window.length);
// 2) Otherwise prefer the last whitespace (word boundary) inside the window.
return lastNewline > 0 ? lastNewline : lastWhitespace;
});
@@ -319,14 +319,24 @@ export function chunkMarkdownText(text: string, limit: number): string[] {
}
const chunks: string[] = [];
let remaining = text;
const spans = parseFenceSpans(text);
let start = 0;
let reopenFence: ReturnType<typeof findFenceSpanAt> | undefined;
while (remaining.length > limit) {
const spans = parseFenceSpans(remaining);
const window = remaining.slice(0, limit);
while (start < text.length) {
const reopenPrefix = reopenFence ? `${reopenFence.openLine}\n` : "";
const contentLimit = Math.max(1, limit - reopenPrefix.length);
if (text.length - start <= contentLimit) {
const finalChunk = `${reopenPrefix}${text.slice(start)}`;
if (finalChunk.length > 0) {
chunks.push(finalChunk);
}
break;
}
const softBreak = pickSafeBreakIndex(window, spans);
let breakIdx = softBreak > 0 ? softBreak : limit;
const windowEnd = Math.min(text.length, start + contentLimit);
const softBreak = pickSafeBreakIndex(text, start, windowEnd, spans);
let breakIdx = softBreak > start ? softBreak : windowEnd;
const initialFence = isSafeFenceBreak(spans, breakIdx)
? undefined
@@ -335,38 +345,38 @@ export function chunkMarkdownText(text: string, limit: number): string[] {
let fenceToSplit = initialFence;
if (initialFence) {
const closeLine = `${initialFence.indent}${initialFence.marker}`;
const maxIdxIfNeedNewline = limit - (closeLine.length + 1);
const maxIdxIfNeedNewline = start + (contentLimit - (closeLine.length + 1));
if (maxIdxIfNeedNewline <= 0) {
if (maxIdxIfNeedNewline <= start) {
fenceToSplit = undefined;
breakIdx = limit;
breakIdx = windowEnd;
} else {
const minProgressIdx = Math.min(
remaining.length,
initialFence.start + initialFence.openLine.length + 2,
text.length,
Math.max(start + 1, initialFence.start + initialFence.openLine.length + 2),
);
const maxIdxIfAlreadyNewline = limit - closeLine.length;
const maxIdxIfAlreadyNewline = start + (contentLimit - closeLine.length);
let pickedNewline = false;
let lastNewline = remaining.lastIndexOf("\n", Math.max(0, maxIdxIfAlreadyNewline - 1));
while (lastNewline !== -1) {
let lastNewline = text.lastIndexOf("\n", Math.max(start, maxIdxIfAlreadyNewline - 1));
while (lastNewline >= start) {
const candidateBreak = lastNewline + 1;
if (candidateBreak < minProgressIdx) {
break;
}
const candidateFence = findFenceSpanAt(spans, candidateBreak);
if (candidateFence && candidateFence.start === initialFence.start) {
breakIdx = Math.max(1, candidateBreak);
breakIdx = candidateBreak;
pickedNewline = true;
break;
}
lastNewline = remaining.lastIndexOf("\n", lastNewline - 1);
lastNewline = text.lastIndexOf("\n", lastNewline - 1);
}
if (!pickedNewline) {
if (minProgressIdx > maxIdxIfAlreadyNewline) {
fenceToSplit = undefined;
breakIdx = limit;
breakIdx = windowEnd;
} else {
breakIdx = Math.max(minProgressIdx, maxIdxIfNeedNewline);
}
@@ -378,68 +388,72 @@ export function chunkMarkdownText(text: string, limit: number): string[] {
fenceAtBreak && fenceAtBreak.start === initialFence.start ? fenceAtBreak : undefined;
}
let rawChunk = remaining.slice(0, breakIdx);
if (!rawChunk) {
const rawContent = text.slice(start, breakIdx);
if (!rawContent) {
break;
}
const brokeOnSeparator = breakIdx < remaining.length && /\s/.test(remaining[breakIdx]);
const nextStart = Math.min(remaining.length, breakIdx + (brokeOnSeparator ? 1 : 0));
let next = remaining.slice(nextStart);
let rawChunk = `${reopenPrefix}${rawContent}`;
const brokeOnSeparator = breakIdx < text.length && /\s/.test(text[breakIdx]);
let nextStart = Math.min(text.length, breakIdx + (brokeOnSeparator ? 1 : 0));
if (fenceToSplit) {
const closeLine = `${fenceToSplit.indent}${fenceToSplit.marker}`;
rawChunk = rawChunk.endsWith("\n") ? `${rawChunk}${closeLine}` : `${rawChunk}\n${closeLine}`;
next = `${fenceToSplit.openLine}\n${next}`;
reopenFence = fenceToSplit;
} else {
next = stripLeadingNewlines(next);
nextStart = skipLeadingNewlines(text, nextStart);
reopenFence = undefined;
}
chunks.push(rawChunk);
remaining = next;
}
if (remaining.length) {
chunks.push(remaining);
start = nextStart;
}
return chunks;
}
function stripLeadingNewlines(value: string): string {
let i = 0;
function skipLeadingNewlines(value: string, start = 0): number {
let i = start;
while (i < value.length && value[i] === "\n") {
i++;
}
return i > 0 ? value.slice(i) : value;
return i;
}
function pickSafeBreakIndex(window: string, spans: ReturnType<typeof parseFenceSpans>): number {
const { lastNewline, lastWhitespace } = scanParenAwareBreakpoints(window, (index) =>
function pickSafeBreakIndex(
text: string,
start: number,
end: number,
spans: ReturnType<typeof parseFenceSpans>,
): number {
const { lastNewline, lastWhitespace } = scanParenAwareBreakpoints(text, start, end, (index) =>
isSafeFenceBreak(spans, index),
);
if (lastNewline > 0) {
if (lastNewline > start) {
return lastNewline;
}
if (lastWhitespace > 0) {
if (lastWhitespace > start) {
return lastWhitespace;
}
return -1;
}
function scanParenAwareBreakpoints(
window: string,
text: string,
start: number,
end: number,
isAllowed: (index: number) => boolean = () => true,
): { lastNewline: number; lastWhitespace: number } {
let lastNewline = -1;
let lastWhitespace = -1;
let depth = 0;
for (let i = 0; i < window.length; i++) {
for (let i = start; i < end; i++) {
if (!isAllowed(i)) {
continue;
}
const char = window[i];
const char = text[i];
if (char === "(") {
depth += 1;
continue;

View File

@@ -73,7 +73,27 @@ export function parseFenceSpans(buffer: string): FenceSpan[] {
}
export function findFenceSpanAt(spans: FenceSpan[], index: number): FenceSpan | undefined {
return spans.find((span) => index > span.start && index < span.end);
let low = 0;
let high = spans.length - 1;
while (low <= high) {
const mid = Math.floor((low + high) / 2);
const span = spans[mid];
if (!span) {
break;
}
if (index <= span.start) {
high = mid - 1;
continue;
}
if (index >= span.end) {
low = mid + 1;
continue;
}
return span;
}
return undefined;
}
export function isSafeFenceBreak(spans: FenceSpan[], index: number): boolean {