fix(postprocess): trim adjacent word overlap
Some checks failed
Build & Push Docker Image / test (push) Failing after 11s
Build & Push Docker Image / build-and-push (push) Has been skipped

Remove residual one-word suffix-prefix carry-over between adjacent caption segments by lowering MIN_OVERLAP_WORDS from 3 to 1, so reprocessed transcripts no longer repeat bridge words across lines.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
2026-05-11 23:24:21 +02:00
parent 6beb436687
commit 3a72bb815f
2 changed files with 16 additions and 1 deletion

View File

@@ -42,7 +42,7 @@ function mergeConsecutive(segments: Segment[]): Segment[] {
const MAX_CHAIN_GAP_SECS = 0.15;
const MIN_MEANINGFUL_WORDS = 2;
const MIN_MEANINGFUL_CHARS = 8;
-const MIN_OVERLAP_WORDS = 3;
+const MIN_OVERLAP_WORDS = 1;
function splitWords(text: string): string[] {
return text.trim().split(/\s+/).filter(Boolean);

View File

@@ -126,6 +126,21 @@ describe('deduplicateSegments — rolling backend hypotheses', () => {
expect(result[1].text).toBe('inference. So, as someone who kind of wants to');
expect(result[2].text).toBe('and work to understand the problems and the');
});
// Checks that a single word shared between the end of one segment and the start of
// the next does not appear twice in the deduplicated output.
it('trims single-word suffix-prefix overlap between adjacent segments', () => {
// seg(...) arguments are presumably (index, start seconds, end seconds, text) — confirm against the helper.
const input = [
// Pair 1: the entire first segment ('world.') reappears as the first word of the second.
seg(0, 94.8, 96.4, 'world.'),
seg(1, 96.4, 98.96, 'world. And that aspect that I overlooked was'),
// Pair 2: only the last word of one segment ('inference.') is repeated at the start of the next.
seg(2, 120.12, 123.71, 'to find more about inference.'),
seg(3, 123.72, 126.92, "inference. So, I've done a lot of work with VLAM,")
];
const result = deduplicateSegments(input);
// Four inputs yield three outputs; the assertions below imply the standalone 'world.'
// segment does not survive as its own entry.
expect(result).toHaveLength(3);
expect(result[0].text).toBe('world. And that aspect that I overlooked was');
// NOTE(review): result[1] is not asserted; presumably it is 'to find more about inference.'
// unchanged — consider pinning it so the test shows which side of the overlap is trimmed.
// The duplicated leading 'inference.' is removed from the final segment.
expect(result[2].text).toBe("So, I've done a lot of work with VLAM,");
});
});
// ── ngramDedup ────────────────────────────────────────────────────────────────