fix(postprocess): drop tiny carry-over text

Collapse one-word and very short caption carry-over fragments so that reprocessed YouTube transcripts do not retain residual prefix chains.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-05-11 23:14:31 +02:00
parent 672b161cda
commit 6beb436687
2 changed files with 27 additions and 2 deletions
--- a/src/lib/server/postprocess.ts
+++ b/src/lib/server/postprocess.ts
@@ -78,6 +78,10 @@ function isMeaningfulPhrase(words: string[]): boolean {
 	return words.length >= MIN_MEANINGFUL_WORDS && words.join(' ').length >= MIN_MEANINGFUL_CHARS;
 }
 /**
  * Reports whether a segment is small enough to be treated as a short carry-over fragment.
  *
  * Any single criterion suffices: a span (`end - start`) of at most 0.2, at most
  * two words, or at most 16 characters once the words are joined with single spaces.
  * NOTE(review): the span is presumably measured in seconds — confirm against `Segment`.
  *
  * @param seg - Segment whose `start`/`end` bounds are measured.
  * @param words - Word list tested for its count and its joined length.
  * @returns `true` when at least one of the three limits is met.
  */
 function isShortCarryover(seg: Segment, words: string[]): boolean {
 	const span = seg.end - seg.start;
 	// Checks run in the same order as a short-circuiting `||` chain would evaluate them.
 	if (span <= 0.2) return true;
 	if (words.length <= 2) return true;
 	return words.join(' ').length <= 16;
 }
 /**
  * Rebuilds `text` without its leading words.
  *
  * The text is split with `splitWords`, the result is sliced from index `count`,
  * and the remaining entries are joined with single spaces and trimmed.
  *
  * @param text - Text to shorten.
  * @param count - Start index passed to `slice` on the split words.
  * @returns The remaining words as one space-separated, trimmed string.
  */
 function trimLeadingWords(text: string, count: number): string {
 	const remaining = splitWords(text).slice(count);
 	return remaining.join(' ').trim();
 }
@@ -115,7 +119,7 @@ function collapseIncrementalSegments(segments: Segment[]): Segment[] {
 		if (
 			currentWords.length > lastWords.length &&
 			startsWithWords(currentWords, lastWords) &&
-			isMeaningfulPhrase(lastWords)
+			(isMeaningfulPhrase(lastWords) || isShortCarryover(last, lastWords))
 		) {
 			last.text = current.text;
 			last.end = current.end;
@@ -123,7 +127,10 @@ function collapseIncrementalSegments(segments: Segment[]): Segment[] {
 			continue;
 		}
-		if (endsWithWords(lastWords, currentWords) && isMeaningfulPhrase(currentWords)) {
+		if (
 			endsWithWords(lastWords, currentWords) &&
 			(isMeaningfulPhrase(currentWords) || isShortCarryover(current, currentWords))
 		) {
 			last.end = Math.max(last.end, current.end);
 			continue;
 		}
--- a/src/tests/postprocess.test.ts
+++ b/src/tests/postprocess.test.ts
@@ -108,6 +108,24 @@ describe('deduplicateSegments — rolling backend hypotheses', () => {
 		expect(result[0].text).toBe('Hello everyone.');
 		expect(result[1].text).toBe('Hello everyone. Welcome back.');
 	});
 	it('collapses tiny one-word carry-over segments from caption-style output', () => {
 		// Each short fragment ('world.', 'inference.', 'and') reappears as the prefix of the row after it.
 		const rows: Array<[number, number, string]> = [
 			[94.8, 96.4, 'world.'],
 			[96.4, 98.96, 'world. And that aspect that I overlooked was'],
 			[98.96, 100.72, 'inference.'],
 			[100.72, 103.92, 'inference. So, as someone who kind of wants to'],
 			[107.19, 107.2, 'and'],
 			[107.2, 109.56, 'and work to understand the problems and the']
 		];
 		const input = rows.map(([start, end, text], index) => seg(index, start, end, text));
 		const result = deduplicateSegments(input);
 		// Only the three extended segments should survive, in their original order.
 		const expectedTexts = [
 			'world. And that aspect that I overlooked was',
 			'inference. So, as someone who kind of wants to',
 			'and work to understand the problems and the'
 		];
 		expect(result).toHaveLength(3);
 		expectedTexts.forEach((text, position) => {
 			expect(result[position].text).toBe(text);
 		});
 	});
 });
 // ── ngramDedup ────────────────────────────────────────────────────────────────