fix(postprocess): drop tiny carry-over text
Collapse one-word and other very short caption carry-over fragments so that reprocessed YouTube transcripts do not retain residual prefix chains.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -108,6 +108,24 @@ describe('deduplicateSegments — rolling backend hypotheses', () => {
|
||||
expect(result[0].text).toBe('Hello everyone.');
|
||||
expect(result[1].text).toBe('Hello everyone. Welcome back.');
|
||||
});
|
||||
|
||||
it('collapses tiny one-word carry-over segments from caption-style output', () => {
  // Each short fragment ('world.', 'inference.', 'and') reappears as the
  // leading text of the segment that immediately follows it.
  // Rows hold the arguments passed to seg() after the index, in order.
  const rows = [
    [94.8, 96.4, 'world.'],
    [96.4, 98.96, 'world. And that aspect that I overlooked was'],
    [98.96, 100.72, 'inference.'],
    [100.72, 103.92, 'inference. So, as someone who kind of wants to'],
    [107.19, 107.2, 'and'],
    [107.2, 109.56, 'and work to understand the problems and the'],
  ];
  const input = rows.map((args, index) => seg(index, ...args));

  // Only the longer segment of each fragment/continuation pair should remain.
  const expectedTexts = [
    'world. And that aspect that I overlooked was',
    'inference. So, as someone who kind of wants to',
    'and work to understand the problems and the',
  ];

  const result = deduplicateSegments(input);

  expect(result).toHaveLength(3);
  expectedTexts.forEach((text, position) => {
    expect(result[position].text).toBe(text);
  });
});
|
||||
});
|
||||
|
||||
// ── ngramDedup ────────────────────────────────────────────────────────────────
|
||||
|
||||
Reference in New Issue
Block a user