fix(postprocess): trim adjacent word overlap
Remove residual one-word suffix-prefix carry-over between adjacent caption segments so reprocessed transcripts no longer repeat bridge words across lines.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -42,7 +42,7 @@ function mergeConsecutive(segments: Segment[]): Segment[] {
|
||||
const MAX_CHAIN_GAP_SECS = 0.15;
|
||||
const MIN_MEANINGFUL_WORDS = 2;
|
||||
const MIN_MEANINGFUL_CHARS = 8;
|
||||
-const MIN_OVERLAP_WORDS = 3;
|
||||
+const MIN_OVERLAP_WORDS = 1;
|
||||
|
||||
function splitWords(text: string): string[] {
|
||||
return text.trim().split(/\s+/).filter(Boolean);
|
||||
|
||||
@@ -126,6 +126,21 @@ describe('deduplicateSegments — rolling backend hypotheses', () => {
|
||||
expect(result[1].text).toBe('inference. So, as someone who kind of wants to');
|
||||
expect(result[2].text).toBe('and work to understand the problems and the');
|
||||
});
|
||||
|
||||
it('trims single-word suffix-prefix overlap between adjacent segments', () => {
|
||||
const input = [
|
||||
seg(0, 94.8, 96.4, 'world.'),
|
||||
seg(1, 96.4, 98.96, 'world. And that aspect that I overlooked was'),
|
||||
seg(2, 120.12, 123.71, 'to find more about inference.'),
|
||||
seg(3, 123.72, 126.92, "inference. So, I've done a lot of work with VLAM,")
|
||||
];
|
||||
|
||||
const result = deduplicateSegments(input);
|
||||
|
||||
expect(result).toHaveLength(3);
|
||||
expect(result[0].text).toBe('world. And that aspect that I overlooked was');
|
||||
expect(result[2].text).toBe("So, I've done a lot of work with VLAM,");
|
||||
});
|
||||
});
|
||||
|
||||
// ── ngramDedup ────────────────────────────────────────────────────────────────
|
||||
|
||||
Reference in New Issue
Block a user