fix(postprocess): drop tiny carry-over text

Collapse one-word and very short caption carry-over fragments so reprocessed YouTube transcripts do not retain residual prefix chains.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-05-11 23:14:31 +02:00
parent 672b161cda
commit 6beb436687
2 changed files with 27 additions and 2 deletions
--- a/src/lib/server/postprocess.ts
+++ b/src/lib/server/postprocess.ts
@@ -78,6 +78,10 @@ function isMeaningfulPhrase(words: string[]): boolean {
 	return words.length >= MIN_MEANINGFUL_WORDS && words.join(' ').length >= MIN_MEANINGFUL_CHARS;
 }

+function isShortCarryover(seg: Segment, words: string[]): boolean {
+	return seg.end - seg.start <= 0.2 || words.length <= 2 || words.join(' ').length <= 16;
+}
+
 function trimLeadingWords(text: string, count: number): string {
 	return splitWords(text).slice(count).join(' ').trim();
 }
@@ -115,7 +119,7 @@ function collapseIncrementalSegments(segments: Segment[]): Segment[] {
 		if (
 			currentWords.length > lastWords.length &&
 			startsWithWords(currentWords, lastWords) &&
-			isMeaningfulPhrase(lastWords)
+			(isMeaningfulPhrase(lastWords) || isShortCarryover(last, lastWords))
 		) {
 			last.text = current.text;
 			last.end = current.end;
@@ -123,7 +127,10 @@ function collapseIncrementalSegments(segments: Segment[]): Segment[] {
 			continue;
 		}

-		if (endsWithWords(lastWords, currentWords) && isMeaningfulPhrase(currentWords)) {
+		if (
+			endsWithWords(lastWords, currentWords) &&
+			(isMeaningfulPhrase(currentWords) || isShortCarryover(current, currentWords))
+		) {
 			last.end = Math.max(last.end, current.end);
 			continue;
 		}
--- a/src/tests/postprocess.test.ts
+++ b/src/tests/postprocess.test.ts
@@ -108,6 +108,24 @@ describe('deduplicateSegments — rolling backend hypotheses', () => {
 		expect(result[0].text).toBe('Hello everyone.');
 		expect(result[1].text).toBe('Hello everyone. Welcome back.');
 	});
+
+	it('collapses tiny one-word carry-over segments from caption-style output', () => {
+		const input = [
+			seg(0, 94.8, 96.4, 'world.'),
+			seg(1, 96.4, 98.96, 'world. And that aspect that I overlooked was'),
+			seg(2, 98.96, 100.72, 'inference.'),
+			seg(3, 100.72, 103.92, 'inference. So, as someone who kind of wants to'),
+			seg(4, 107.19, 107.2, 'and'),
+			seg(5, 107.2, 109.56, 'and work to understand the problems and the')
+		];
+
+		const result = deduplicateSegments(input);
+
+		expect(result).toHaveLength(3);
+		expect(result[0].text).toBe('world. And that aspect that I overlooked was');
+		expect(result[1].text).toBe('inference. So, as someone who kind of wants to');
+		expect(result[2].text).toBe('and work to understand the problems and the');
+	});
 });

 // ── ngramDedup ────────────────────────────────────────────────────────────────