fix(transcript): collapse rolling segment echoes
All checks were successful
Build & Push Docker Image / test (push) Successful in 12s
Build & Push Docker Image / build-and-push (push) Successful in 45s

Normalize incremental backend hypothesis chains before persistence and ignore stale or replayed webhook callbacks so duplicate transcript text does not survive ingest.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
2026-05-11 22:46:38 +02:00
parent 35a2d86dbb
commit 672b161cda
7 changed files with 246 additions and 33 deletions

View File

@@ -7,17 +7,18 @@ import { describe, it, expect, vi, afterEach } from 'vitest';
const execFileMock = vi.hoisted(() => {
const fn = vi.fn();
type ExecFilePromisifyArgs = [string, string[]];
type ExecFileCallback = (err: Error | null, stdout: string, stderr: string) => void;
type ExecFileMock = (...args: [...ExecFilePromisifyArgs, ExecFileCallback]) => void;
const invoke = fn as unknown as ExecFileMock;
Object.defineProperty(fn, Symbol.for('nodejs.util.promisify.custom'), {
configurable: true,
value: (...args: unknown[]) =>
value: (...args: ExecFilePromisifyArgs) =>
new Promise<{ stdout: string; stderr: string }>((resolve, reject) => {
(fn as ReturnType<typeof vi.fn>)(
...args,
(err: Error | null, stdout: string, stderr: string) => {
if (err) reject(err);
else resolve({ stdout, stderr });
}
);
invoke(...args, (err: Error | null, stdout: string, stderr: string) => {
if (err) reject(err);
else resolve({ stdout, stderr });
});
})
});
return fn;

View File

@@ -66,6 +66,50 @@ describe('deduplicateSegments — mergeConsecutive', () => {
});
});
// ── rolling prefix/suffix chain collapse ───────────────────────────────────────
/**
 * Rolling backend hypotheses: consecutive segments whose text keeps extending
 * or trimming the previous one while their timings touch end-to-start.
 * NOTE(review): `seg` is presumably (index, start, end, text) — confirm against
 * the helper defined earlier in this file.
 */
describe('deduplicateSegments — rolling backend hypotheses', () => {
	it('collapses prefix-growth chains from stored backend segments', () => {
		// Six back-to-back segments (each start equals the previous end) whose
		// text alternates between a grown phrase and its trailing remainder.
		const rollingChain = [
			seg(0, 15.24, 16.6, 'Hello everyone.'),
			seg(1, 16.6, 19.47, 'Hello everyone. Um, welcome to this talk.'),
			seg(2, 19.47, 19.48, 'Um, welcome to this talk.'),
			seg(3, 19.48, 21.67, "Um, welcome to this talk. I'll be speaking about small model"),
			seg(4, 21.67, 21.68, "I'll be speaking about small model"),
			seg(5, 21.68, 24.59, "I'll be speaking about small model inference and a gap that we've")
		];

		const collapsed = deduplicateSegments(rollingChain);

		// Only two segments are expected to remain, re-indexed from 0.
		expect(collapsed).toHaveLength(2);
		const [opening, continuation] = collapsed;
		expect(opening).toMatchObject({
			index: 0,
			start: 15.24,
			end: 19.48,
			text: 'Hello everyone. Um, welcome to this talk.'
		});
		expect(continuation).toMatchObject({
			index: 1,
			start: 19.48,
			end: 24.59,
			text: "I'll be speaking about small model inference and a gap that we've"
		});
	});

	it('does not collapse similar phrases when there is a real timing gap', () => {
		// The second segment starts at 2 while the first ends at 1, so the
		// shared prefix alone must not trigger a merge.
		const separatedPair = [
			seg(0, 0, 1, 'Hello everyone.'),
			seg(1, 2, 4, 'Hello everyone. Welcome back.')
		];

		const kept = deduplicateSegments(separatedPair);

		expect(kept).toHaveLength(2);
		const [earlier, later] = kept;
		expect(earlier.text).toBe('Hello everyone.');
		expect(later.text).toBe('Hello everyone. Welcome back.');
	});
});
// ── ngramDedup ────────────────────────────────────────────────────────────────
describe('deduplicateSegments — ngramDedup', () => {

View File

@@ -132,6 +132,43 @@ describe('POST /api/webhook/[jobId] — locally cancelled job', () => {
});
});
// ── Duplicate / stale callback guards ──────────────────────────────────────────
/**
 * Guards against webhook callbacks that must not touch persisted state:
 * a callback for a job that is already `done`, and a callback whose whisper
 * job id differs from the one currently recorded on the job.
 */
describe('POST /api/webhook/[jobId] — duplicate and stale callbacks', () => {
	it('ignores replayed success callbacks after the transcript is already done', async () => {
		// Job record that already holds a saved transcript.
		const finishedJob = {
			...makeJob('job-done'),
			status: 'done',
			segmentsJson: JSON.stringify([makeSeg(0, 'Already saved.')]),
			whisperJobId: 'whisper-id'
		};
		mockGetJob.mockReturnValue(finishedJob);

		// NOTE(review): makeWhisperJob() with no overrides presumably models a
		// successful whisper job — confirm against the helper.
		const replayedEvent = makeEvent('job-done', makeWhisperJob());
		const response = await POST(replayedEvent as any);

		expect(response.status).toBe(200);
		const body = await response.json();
		expect(body).toEqual({ ok: true, ignored: 'duplicate_webhook' });

		// None of the state-changing collaborators may have been invoked.
		for (const untouched of [mockSetJobStatus, mockUpdateJob, mockWriteOutputs]) {
			expect(untouched).not.toHaveBeenCalled();
		}
	});

	it('ignores stale callbacks from an older whisper job after retry', async () => {
		// The job now tracks 'current-whisper-job'; the callback below carries
		// a different whisper job id.
		const retriedJob = {
			...makeJob('job-stale'),
			status: 'transcribing',
			whisperJobId: 'current-whisper-job'
		};
		mockGetJob.mockReturnValue(retriedJob);

		const outdatedWhisperJob = makeWhisperJob({
			id: 'old-whisper-job',
			segments: [makeSeg(0, 'stale')]
		});
		const staleEvent = makeEvent('job-stale', outdatedWhisperJob);
		const response = await POST(staleEvent as any);

		expect(response.status).toBe(200);
		const body = await response.json();
		expect(body).toEqual({ ok: true, ignored: 'stale_whisper_job' });

		// None of the state-changing collaborators may have been invoked.
		for (const untouched of [mockSetJobStatus, mockUpdateJob, mockWriteOutputs]) {
			expect(untouched).not.toHaveBeenCalled();
		}
	});
});
// ── Whisper job failed / cancelled ───────────────────────────────────────────
describe('POST /api/webhook/[jobId] — whisper failure', () => {