fix(postprocess): trim adjacent word overlap

Remove residual one-word suffix-prefix carry-over between adjacent caption segments so reprocessed transcripts no longer repeat bridge words across lines.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-05-11 23:24:21 +02:00
parent 6beb436687
commit 3a72bb815f
2 changed files with 16 additions and 1 deletions
--- a/src/lib/server/postprocess.ts
+++ b/src/lib/server/postprocess.ts
@@ -42,7 +42,7 @@ function mergeConsecutive(segments: Segment[]): Segment[] {
 const MAX_CHAIN_GAP_SECS = 0.15;
 const MIN_MEANINGFUL_WORDS = 2;
 const MIN_MEANINGFUL_CHARS = 8;
-const MIN_OVERLAP_WORDS = 3;
+const MIN_OVERLAP_WORDS = 1;

 function splitWords(text: string): string[] {
 	return text.trim().split(/\s+/).filter(Boolean);
--- a/src/tests/postprocess.test.ts
+++ b/src/tests/postprocess.test.ts
@@ -126,6 +126,21 @@ describe('deduplicateSegments — rolling backend hypotheses', () => {
 		expect(result[1].text).toBe('inference. So, as someone who kind of wants to');
 		expect(result[2].text).toBe('and work to understand the problems and the');
 	});
+
+	it('trims single-word suffix-prefix overlap between adjacent segments', () => {
+		const input = [
+			seg(0, 94.8, 96.4, 'world.'),
+			seg(1, 96.4, 98.96, 'world. And that aspect that I overlooked was'),
+			seg(2, 120.12, 123.71, 'to find more about inference.'),
+			seg(3, 123.72, 126.92, "inference. So, I've done a lot of work with VLAM,")
+		];
+
+		const result = deduplicateSegments(input);
+
+		expect(result).toHaveLength(3);
+		expect(result[0].text).toBe('world. And that aspect that I overlooked was');
+		expect(result[2].text).toBe("So, I've done a lot of work with VLAM,");
+	});
 });

 // ── ngramDedup ────────────────────────────────────────────────────────────────