From 3a72bb815fe16d3547ac6abc1527653d19f68433 Mon Sep 17 00:00:00 2001
From: Giancarmine Salucci
Date: Mon, 11 May 2026 23:24:21 +0200
Subject: [PATCH] fix(postprocess): trim adjacent word overlap

Remove residual one-word suffix-prefix carry-over between adjacent caption
segments so reprocessed transcripts no longer repeat bridge words across
lines.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/lib/server/postprocess.ts |  2 +-
 src/tests/postprocess.test.ts | 15 +++++++++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/src/lib/server/postprocess.ts b/src/lib/server/postprocess.ts
index 0dad3ac..49c1cf1 100644
--- a/src/lib/server/postprocess.ts
+++ b/src/lib/server/postprocess.ts
@@ -42,7 +42,7 @@ function mergeConsecutive(segments: Segment[]): Segment[] {
 const MAX_CHAIN_GAP_SECS = 0.15;
 const MIN_MEANINGFUL_WORDS = 2;
 const MIN_MEANINGFUL_CHARS = 8;
-const MIN_OVERLAP_WORDS = 3;
+const MIN_OVERLAP_WORDS = 1;
 
 function splitWords(text: string): string[] {
 	return text.trim().split(/\s+/).filter(Boolean);
diff --git a/src/tests/postprocess.test.ts b/src/tests/postprocess.test.ts
index cf47ace..a74dd14 100644
--- a/src/tests/postprocess.test.ts
+++ b/src/tests/postprocess.test.ts
@@ -126,6 +126,21 @@ describe('deduplicateSegments — rolling backend hypotheses', () => {
 		expect(result[1].text).toBe('inference. So, as someone who kind of wants to');
 		expect(result[2].text).toBe('and work to understand the problems and the');
 	});
+
+	it('trims single-word suffix-prefix overlap between adjacent segments', () => {
+		const input = [
+			seg(0, 94.8, 96.4, 'world.'),
+			seg(1, 96.4, 98.96, 'world. And that aspect that I overlooked was'),
+			seg(2, 120.12, 123.71, 'to find more about inference.'),
+			seg(3, 123.72, 126.92, "inference. So, I've done a lot of work with VLAM,")
+		];
+
+		const result = deduplicateSegments(input);
+
+		expect(result).toHaveLength(3);
+		expect(result[0].text).toBe('world. And that aspect that I overlooked was');
+		expect(result[2].text).toBe("So, I've done a lot of work with VLAM,");
+	});
 });
 
 // ── ngramDedup ────────────────────────────────────────────────────────────────