fix(postprocess): drop tiny carry-over text
All checks were successful
Build & Push Docker Image / test (push) Successful in 11s
Build & Push Docker Image / build-and-push (push) Successful in 43s

Collapse one-word and very short caption carry-over fragments so reprocessed YouTube transcripts do not retain residual prefix chains.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
2026-05-11 23:14:31 +02:00
parent 672b161cda
commit 6beb436687
2 changed files with 27 additions and 2 deletions

View File

@@ -78,6 +78,10 @@ function isMeaningfulPhrase(words: string[]): boolean {
return words.length >= MIN_MEANINGFUL_WORDS && words.join(' ').length >= MIN_MEANINGFUL_CHARS;
}
/**
 * Reports whether a segment is small enough to be treated as a caption
 * carry-over fragment.
 *
 * The segment qualifies as soon as any one of these checks passes:
 *   1. `seg.end - seg.start` is at most 0.2,
 *   2. `words` holds at most 2 entries,
 *   3. `words` joined with single spaces is at most 16 characters long.
 *
 * @param seg - Segment whose `start` and `end` are used for the duration check.
 * @param words - Word list used for the word-count and joined-length checks.
 * @returns `true` when at least one check passes, otherwise `false`.
 */
function isShortCarryover(seg: Segment, words: string[]): boolean {
  // Near-instant segments count regardless of how much text they carry.
  if (seg.end - seg.start <= 0.2) {
    return true;
  }
  // One- and two-word fragments count regardless of duration.
  if (words.length <= 2) {
    return true;
  }
  // Otherwise fall back to the length of the space-joined text.
  return words.join(' ').length <= 16;
}
/**
 * Removes the first `count` words from `text`.
 *
 * The text is split with `splitWords`, the leading `count` entries are
 * discarded, and the rest is re-joined with single spaces and trimmed.
 *
 * @param text - Text to shorten.
 * @param count - Number of leading words to discard (passed to `Array.prototype.slice`).
 * @returns The remaining words joined by single spaces, with surrounding whitespace trimmed.
 */
function trimLeadingWords(text: string, count: number): string {
  const remainingWords = splitWords(text).slice(count);
  const rejoined = remainingWords.join(' ');
  return rejoined.trim();
}
@@ -115,7 +119,7 @@ function collapseIncrementalSegments(segments: Segment[]): Segment[] {
if (
currentWords.length > lastWords.length &&
startsWithWords(currentWords, lastWords) &&
isMeaningfulPhrase(lastWords)
(isMeaningfulPhrase(lastWords) || isShortCarryover(last, lastWords))
) {
last.text = current.text;
last.end = current.end;
@@ -123,7 +127,10 @@ function collapseIncrementalSegments(segments: Segment[]): Segment[] {
continue;
}
if (endsWithWords(lastWords, currentWords) && isMeaningfulPhrase(currentWords)) {
if (
endsWithWords(lastWords, currentWords) &&
(isMeaningfulPhrase(currentWords) || isShortCarryover(current, currentWords))
) {
last.end = Math.max(last.end, current.end);
continue;
}

View File

@@ -108,6 +108,24 @@ describe('deduplicateSegments — rolling backend hypotheses', () => {
expect(result[0].text).toBe('Hello everyone.');
expect(result[1].text).toBe('Hello everyone. Welcome back.');
});
// Regression test: each short fragment below is immediately followed by a
// longer segment whose text starts with that same fragment, and the expected
// output keeps only the three longer texts.
it('collapses tiny one-word carry-over segments from caption-style output', () => {
// NOTE(review): seg() arguments look like (index, start, end, text) — confirm
// against the helper defined elsewhere in this test file.
const input = [
seg(0, 94.8, 96.4, 'world.'),
seg(1, 96.4, 98.96, 'world. And that aspect that I overlooked was'),
seg(2, 98.96, 100.72, 'inference.'),
seg(3, 100.72, 103.92, 'inference. So, as someone who kind of wants to'),
// The second and third numeric arguments differ by only 0.01 here.
seg(4, 107.19, 107.2, 'and'),
seg(5, 107.2, 109.56, 'and work to understand the problems and the')
];
const result = deduplicateSegments(input);
// Six inputs are expected to reduce to three outputs, one per fragment/segment pair.
expect(result).toHaveLength(3);
expect(result[0].text).toBe('world. And that aspect that I overlooked was');
expect(result[1].text).toBe('inference. So, as someone who kind of wants to');
expect(result[2].text).toBe('and work to understand the problems and the');
});
});
// ── ngramDedup ────────────────────────────────────────────────────────────────