fix(postprocess): drop tiny carry-over text
Collapse one-word and very short caption carry-over fragments so that reprocessed YouTube transcripts do not retain residual prefix chains.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -78,6 +78,10 @@ function isMeaningfulPhrase(words: string[]): boolean {
|
|||||||
return words.length >= MIN_MEANINGFUL_WORDS && words.join(' ').length >= MIN_MEANINGFUL_CHARS;
|
return words.length >= MIN_MEANINGFUL_WORDS && words.join(' ').length >= MIN_MEANINGFUL_CHARS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Reports whether a segment is small enough to count as a short carry-over
 * fragment. The collapse logic in this diff uses it as an alternative to
 * `isMeaningfulPhrase` when deciding whether a prefix or suffix match may be
 * merged.
 *
 * @param seg - Segment whose `start` and `end` give its time span
 *   (presumably in seconds; confirm against the `Segment` definition).
 * @param words - The segment text already split into words by the caller.
 * @returns `true` if the span is at most 0.2, or there are at most 2 words,
 *   or the words joined by single spaces are at most 16 characters long.
 */
function isShortCarryover(seg: Segment, words: string[]): boolean {
|
||||||
|
// Any one condition is sufficient: a near-zero duration, a tiny word count, or a very short joined text.
return seg.end - seg.start <= 0.2 || words.length <= 2 || words.join(' ').length <= 16;
|
||||||
|
}
|
||||||
|
|
||||||
function trimLeadingWords(text: string, count: number): string {
|
function trimLeadingWords(text: string, count: number): string {
|
||||||
return splitWords(text).slice(count).join(' ').trim();
|
return splitWords(text).slice(count).join(' ').trim();
|
||||||
}
|
}
|
||||||
@@ -115,7 +119,7 @@ function collapseIncrementalSegments(segments: Segment[]): Segment[] {
|
|||||||
if (
|
if (
|
||||||
currentWords.length > lastWords.length &&
|
currentWords.length > lastWords.length &&
|
||||||
startsWithWords(currentWords, lastWords) &&
|
startsWithWords(currentWords, lastWords) &&
|
||||||
isMeaningfulPhrase(lastWords)
|
(isMeaningfulPhrase(lastWords) || isShortCarryover(last, lastWords))
|
||||||
) {
|
) {
|
||||||
last.text = current.text;
|
last.text = current.text;
|
||||||
last.end = current.end;
|
last.end = current.end;
|
||||||
@@ -123,7 +127,10 @@ function collapseIncrementalSegments(segments: Segment[]): Segment[] {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (endsWithWords(lastWords, currentWords) && isMeaningfulPhrase(currentWords)) {
|
if (
|
||||||
|
endsWithWords(lastWords, currentWords) &&
|
||||||
|
(isMeaningfulPhrase(currentWords) || isShortCarryover(current, currentWords))
|
||||||
|
) {
|
||||||
last.end = Math.max(last.end, current.end);
|
last.end = Math.max(last.end, current.end);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -108,6 +108,24 @@ describe('deduplicateSegments — rolling backend hypotheses', () => {
|
|||||||
expect(result[0].text).toBe('Hello everyone.');
|
expect(result[0].text).toBe('Hello everyone.');
|
||||||
expect(result[1].text).toBe('Hello everyone. Welcome back.');
|
expect(result[1].text).toBe('Hello everyone. Welcome back.');
|
||||||
});
|
});
|
||||||
|
|
||||||
|
it('collapses tiny one-word carry-over segments from caption-style output', () => {
|
||||||
|
const input = [
|
||||||
|
seg(0, 94.8, 96.4, 'world.'),
|
||||||
|
seg(1, 96.4, 98.96, 'world. And that aspect that I overlooked was'),
|
||||||
|
seg(2, 98.96, 100.72, 'inference.'),
|
||||||
|
seg(3, 100.72, 103.92, 'inference. So, as someone who kind of wants to'),
|
||||||
|
seg(4, 107.19, 107.2, 'and'),
|
||||||
|
seg(5, 107.2, 109.56, 'and work to understand the problems and the')
|
||||||
|
];
|
||||||
|
|
||||||
|
const result = deduplicateSegments(input);
|
||||||
|
|
||||||
|
expect(result).toHaveLength(3);
|
||||||
|
expect(result[0].text).toBe('world. And that aspect that I overlooked was');
|
||||||
|
expect(result[1].text).toBe('inference. So, as someone who kind of wants to');
|
||||||
|
expect(result[2].text).toBe('and work to understand the problems and the');
|
||||||
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
// ── ngramDedup ────────────────────────────────────────────────────────────────
|
// ── ngramDedup ────────────────────────────────────────────────────────────────
|
||||||
|
|||||||
Reference in New Issue
Block a user