fix(transcript): collapse rolling segment echoes
All checks were successful
Build & Push Docker Image / test (push) Successful in 12s
Build & Push Docker Image / build-and-push (push) Successful in 45s

Normalize incremental backend hypothesis chains before persistence and ignore stale or replayed webhook callbacks so duplicate transcript text does not survive ingest.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
2026-05-11 22:46:38 +02:00
parent 35a2d86dbb
commit 672b161cda
7 changed files with 246 additions and 33 deletions

View File

@@ -16,10 +16,10 @@
type RecordState = 'idle' | 'requesting' | 'recording' | 'stopping'; type RecordState = 'idle' | 'requesting' | 'recording' | 'stopping';
let state = $state<RecordState>('idle'); let recordState: RecordState = $state('idle');
let error = $state(''); let error: string = $state('');
let elapsed = $state(0); // seconds let elapsed: number = $state(0); // seconds
let liveData = $state<Float32Array | null>(null); let liveData: Float32Array | null = $state(null);
let mediaRecorder: MediaRecorder | null = null; let mediaRecorder: MediaRecorder | null = null;
let chunks: Blob[] = []; let chunks: Blob[] = [];
@@ -60,12 +60,12 @@
async function startRecording() { async function startRecording() {
error = ''; error = '';
state = 'requesting'; recordState = 'requesting';
try { try {
stream = await navigator.mediaDevices.getUserMedia({ audio: true }); stream = await navigator.mediaDevices.getUserMedia({ audio: true });
} catch { } catch {
error = 'Microphone access denied'; error = 'Microphone access denied';
state = 'idle'; recordState = 'idle';
return; return;
} }
@@ -81,11 +81,11 @@
elapsed = 0; elapsed = 0;
timerInterval = setInterval(() => elapsed++, 1000); timerInterval = setInterval(() => elapsed++, 1000);
state = 'recording'; recordState = 'recording';
} }
function stopRecording() { function stopRecording() {
state = 'stopping'; recordState = 'stopping';
mediaRecorder?.stop(); mediaRecorder?.stop();
if (timerInterval) clearInterval(timerInterval); if (timerInterval) clearInterval(timerInterval);
if (animFrame) cancelAnimationFrame(animFrame); if (animFrame) cancelAnimationFrame(animFrame);
@@ -99,7 +99,7 @@
const ext = mime.includes('ogg') ? 'ogg' : mime.includes('mp4') ? 'mp4' : 'webm'; const ext = mime.includes('ogg') ? 'ogg' : mime.includes('mp4') ? 'mp4' : 'webm';
const blob = new Blob(chunks, { type: mime }); const blob = new Blob(chunks, { type: mime });
const filename = `recording-${new Date().toISOString().slice(0, 19).replace(/[T:]/g, '-')}.${ext}`; const filename = `recording-${new Date().toISOString().slice(0, 19).replace(/[T:]/g, '-')}.${ext}`;
state = 'idle'; recordState = 'idle';
ondone?.(blob, filename); ondone?.(blob, filename);
} }
@@ -116,15 +116,18 @@
{ length: IDLE_BARS }, { length: IDLE_BARS },
(_, i) => 3 + Math.abs(Math.sin(i * 0.7) + Math.cos(i * 0.31)) * 20 (_, i) => 3 + Math.abs(Math.sin(i * 0.7) + Math.cos(i * 0.31)) * 20
); );
const liveBars = $derived.by<number[]>(() =>
liveData ? Array.from(liveData.slice(0, IDLE_BARS), (value) => Number(value)) : []
);
</script> </script>
<div class="recorder"> <div class="recorder">
<!-- Waveform display --> <!-- Waveform display -->
<div class="waveform-area" aria-hidden="true"> <div class="waveform-area" aria-hidden="true">
{#if state === 'recording' && liveData} {#if recordState === 'recording' && liveData}
<!-- Live waveform from AnalyserNode --> <!-- Live waveform from AnalyserNode -->
<svg viewBox="0 0 {IDLE_BARS * 5} 28" preserveAspectRatio="none" class="waveform-svg"> <svg viewBox="0 0 {IDLE_BARS * 5} 28" preserveAspectRatio="none" class="waveform-svg">
{#each Array.from(liveData).slice(0, IDLE_BARS) as v, i} {#each liveBars as v, i}
{@const h = 2 + v * 24} {@const h = 2 + v * 24}
<rect <rect
x={i * 5} x={i * 5}
@@ -147,8 +150,8 @@
width="3" width="3"
height={h} height={h}
rx="1.5" rx="1.5"
fill={state === 'idle' ? 'rgba(255,255,255,0.15)' : accent} fill={recordState === 'idle' ? 'rgba(255,255,255,0.15)' : accent}
opacity={state === 'idle' ? 1 : 0.3} opacity={recordState === 'idle' ? 1 : 0.3}
/> />
{/each} {/each}
</svg> </svg>
@@ -156,7 +159,7 @@
</div> </div>
<!-- Timer (recording only) --> <!-- Timer (recording only) -->
{#if state === 'recording'} {#if recordState === 'recording'}
<div class="timer" style="color: {accent}"> <div class="timer" style="color: {accent}">
<span class="rec-dot" style="background: {accent}"></span> <span class="rec-dot" style="background: {accent}"></span>
{formatTime(elapsed)} {formatTime(elapsed)}
@@ -170,15 +173,15 @@
<!-- Buttons --> <!-- Buttons -->
<div class="btn-row"> <div class="btn-row">
{#if state === 'idle' || state === 'requesting'} {#if recordState === 'idle' || recordState === 'requesting'}
<button <button
class="btn-record" class="btn-record"
style="background: {accent}; color: #0c0d10;" style="background: {accent}; color: #0c0d10;"
onclick={startRecording} onclick={startRecording}
disabled={state === 'requesting'} disabled={recordState === 'requesting'}
aria-label="Start recording" aria-label="Start recording"
> >
{#if state === 'requesting'} {#if recordState === 'requesting'}
<svg width="13" height="13" viewBox="0 0 13 13" style="animation: spin 1s linear infinite"> <svg width="13" height="13" viewBox="0 0 13 13" style="animation: spin 1s linear infinite">
<circle cx="6.5" cy="6.5" r="5" stroke="currentColor" stroke-width="1.5" fill="none" stroke-dasharray="20 12"/> <circle cx="6.5" cy="6.5" r="5" stroke="currentColor" stroke-width="1.5" fill="none" stroke-dasharray="20 12"/>
</svg> </svg>
@@ -190,7 +193,7 @@
Record Record
{/if} {/if}
</button> </button>
{:else if state === 'recording'} {:else if recordState === 'recording'}
<button <button
class="btn-stop" class="btn-stop"
onclick={stopRecording} onclick={stopRecording}

View File

@@ -37,6 +37,119 @@ function mergeConsecutive(segments: Segment[]): Segment[] {
return out; return out;
} }
// ── Collapse rolling prefix/suffix chains from backend segment hypotheses ──────
// Largest gap in seconds (current.start minus previous.end) at which a segment
// is still compared against the previously kept one.
const MAX_CHAIN_GAP_SECS = 0.15;
// Floors applied together by isMeaningfulPhrase(): minimum word count and
// minimum length of the space-joined normalised words.
const MIN_MEANINGFUL_WORDS = 2;
const MIN_MEANINGFUL_CHARS = 8;
// Fewest normalised words shared between the end of the previous segment and
// the start of the current one before the shared lead is trimmed.
const MIN_OVERLAP_WORDS = 3;
/**
 * Breaks `text` into its whitespace-delimited tokens.
 *
 * @param text - Raw segment text; may carry leading/trailing whitespace.
 * @returns The tokens in order, or an empty array when there are none.
 */
function splitWords(text: string): string[] {
  // A run of non-whitespace characters is exactly one token.
  const tokens = text.match(/\S+/g);
  return tokens === null ? [] : [...tokens];
}
/**
 * Tokenises `text` on whitespace and reduces each token to a comparison key:
 * NFC-normalised, lower-cased, and stripped of every character that is not a
 * letter, combining mark, digit or underscore. Tokens that end up empty
 * (punctuation only) are dropped.
 *
 * Unicode property classes are used instead of `\w` because `\w` only covers
 * ASCII: it would turn "für" into "fr" and erase non-Latin words entirely.
 * Results for pure-ASCII input are the same as with `\w`.
 *
 * @param text - Raw segment text.
 * @returns Comparison keys in token order; empty when no token contains a
 *          letter, digit or underscore.
 */
function normaliseWords(text: string): string[] {
  const nonWordChars = /[^\p{L}\p{M}\p{N}_]/gu;
  const keys: string[] = [];
  for (const token of text.split(/\s+/)) {
    // NFC first so composed and decomposed spellings produce the same key.
    const key = token.normalize('NFC').toLowerCase().replace(nonWordChars, '');
    if (key) keys.push(key);
  }
  return keys;
}
/**
 * Element-wise strict equality of two string arrays.
 *
 * @returns `true` only when both arrays have the same length and hold the
 *          same string at every position.
 */
function arraysEqual(a: string[], b: string[]): boolean {
  if (a.length !== b.length) return false;
  for (let i = 0; i < a.length; i += 1) {
    if (a[i] !== b[i]) return false;
  }
  return true;
}
/**
 * Tells whether the word list `full` begins with the word list `prefix`.
 * An empty `prefix` always matches.
 */
function startsWithWords(full: string[], prefix: string[]): boolean {
  if (prefix.length > full.length) return false;
  const head = full.slice(0, prefix.length);
  return arraysEqual(head, prefix);
}
/**
 * Tells whether the word list `full` finishes with the word list `suffix`.
 * An empty `suffix` always matches; identical lists match too.
 */
function endsWithWords(full: string[], suffix: string[]): boolean {
  const offset = full.length - suffix.length;
  if (offset < 0) return false;
  const tail = full.slice(offset);
  return arraysEqual(tail, suffix);
}
/**
 * Length of the longest run of words that both ends `left` and starts `right`.
 *
 * Candidate sizes are tried from the largest possible down to one, so the
 * first match is the longest.
 *
 * @returns The overlap size in words, or 0 when the lists share no such run.
 */
function suffixPrefixOverlap(left: string[], right: string[]): number {
  let size = Math.min(left.length, right.length);
  while (size > 0) {
    const tail = left.slice(-size);
    const head = right.slice(0, size);
    if (arraysEqual(tail, head)) return size;
    size -= 1;
  }
  return 0;
}
/**
 * Decides whether a normalised word list is substantial enough to be treated
 * as an echo: it needs at least MIN_MEANINGFUL_WORDS words and its
 * space-joined form must be at least MIN_MEANINGFUL_CHARS characters long.
 */
function isMeaningfulPhrase(words: string[]): boolean {
  if (words.length < MIN_MEANINGFUL_WORDS) return false;
  const joined = words.join(' ');
  return joined.length >= MIN_MEANINGFUL_CHARS;
}
/**
 * Removes the leading part of `text` that accounts for its first `count`
 * normalised words and returns the rest, joined with single spaces.
 *
 * `count` is measured in normalised words (as produced by normaliseWords),
 * which leave out punctuation-only tokens such as "-" or "...". Dropping
 * `count` raw tokens would therefore cut too little whenever such a token sits
 * inside the region being removed, leaving part of the duplicated lead behind.
 * Raw tokens are consumed until `count` of them have normalised to a word.
 *
 * @param text  - Raw segment text.
 * @param count - Number of leading normalised words to remove; a value of 0
 *                or less leaves the text's tokens untouched.
 * @returns The remaining text, or an empty string when nothing is left.
 */
function trimLeadingWords(text: string, count: number): string {
  const tokens = splitWords(text);
  let remaining = count;
  let cut = 0;
  for (const token of tokens) {
    if (remaining <= 0) break;
    // Only tokens that survive normalisation count towards `count`.
    if (normaliseWords(token).length > 0) remaining -= 1;
    cut += 1;
  }
  return tokens.slice(cut).join(' ');
}
/**
 * Collapses runs of near-contiguous segments whose text repeats or extends the
 * previously kept segment.
 *
 * Every input segment is shallow-copied with trimmed text; segments with empty
 * text are dropped. A segment starting more than MAX_CHAIN_GAP_SECS after the
 * previously kept segment ends is kept as-is. Otherwise the normalised words of
 * both are compared, in this order:
 *   1. current extends previous (previous is a meaningful prefix of it):
 *      previous takes over current's text, end and words;
 *   2. current is a meaningful suffix of previous (or identical to it):
 *      current is dropped and previous' end is extended;
 *   3. at least MIN_OVERLAP_WORDS words overlap between the end of previous and
 *      the start of current: the overlapping lead is trimmed from current.
 *
 * Input segment objects are not modified; only the copies held in the result
 * are updated (their `words` arrays may still be shared with the input).
 *
 * @param segments - Segments in the order they should be chained.
 * @returns A new array of collapsed segments.
 */
function collapseIncrementalSegments(segments: Segment[]): Segment[] {
const out: Segment[] = [];
for (const seg of segments) {
// Work on a copy so later in-place updates never touch the caller's objects.
let current: Segment = {
...seg,
text: seg.text.trim()
};
if (!current.text) continue;
const last = out[out.length - 1];
// First kept segment: nothing to compare against.
if (!last) {
out.push(current);
continue;
}
// A real pause between segments means this is not a rolling echo.
const gap = current.start - last.end;
if (gap > MAX_CHAIN_GAP_SECS) {
out.push(current);
continue;
}
const lastWords = normaliseWords(last.text);
const currentWords = normaliseWords(current.text);
// Without comparable words on both sides no chain can be detected.
if (lastWords.length === 0 || currentWords.length === 0) {
out.push(current);
continue;
}
// Prefix growth: current repeats all of previous and adds more.
if (
currentWords.length > lastWords.length &&
startsWithWords(currentWords, lastWords) &&
isMeaningfulPhrase(lastWords)
) {
last.text = current.text;
last.end = current.end;
last.words = current.words;
continue;
}
// Suffix echo: current only repeats the tail (or all) of previous.
if (endsWithWords(lastWords, currentWords) && isMeaningfulPhrase(currentWords)) {
last.end = Math.max(last.end, current.end);
continue;
}
// Partial overlap: current starts with the words previous ended on.
const overlapWords = suffixPrefixOverlap(lastWords, currentWords);
if (overlapWords >= MIN_OVERLAP_WORDS) {
const trimmedText = trimLeadingWords(current.text, overlapWords);
// Nothing left after trimming: treat current as a pure echo.
if (!trimmedText) {
last.end = Math.max(last.end, current.end);
continue;
}
// Keep only the new tail; start is clamped to previous' end and the
// word list is cleared.
current = {
...current,
start: Math.max(current.start, last.end),
text: trimmedText,
words: []
};
}
out.push(current);
}
return out;
}
// ── N-gram deduplication ───────────────────────────────────────────────────── // ── N-gram deduplication ─────────────────────────────────────────────────────
const NGRAM_N = 6; const NGRAM_N = 6;
@@ -93,16 +206,22 @@ export function deduplicateSegments(segments: Segment[]): Segment[] {
// 2. Remove empty segments // 2. Remove empty segments
result = result.filter((s) => s.text.length > 0); result = result.filter((s) => s.text.length > 0);
// 3. First merge pass // 3. Collapse rolling backend hypotheses before generic dedup
result = collapseIncrementalSegments(result);
// 4. First merge pass
result = mergeConsecutive(result); result = mergeConsecutive(result);
// 4. N-gram dedup // 5. N-gram dedup
result = ngramDedup(result); result = ngramDedup(result);
// 5. Second merge pass (catches new adjacencies after dedup) // 6. Re-run rolling collapse after removals create new adjacencies
result = collapseIncrementalSegments(result);
// 7. Second merge pass (catches new adjacencies after dedup)
result = mergeConsecutive(result); result = mergeConsecutive(result);
// 6. Re-index // 8. Re-index
result.forEach((s, i) => (s.index = i)); result.forEach((s, i) => (s.index = i));
return result; return result;

View File

@@ -144,7 +144,7 @@
<!-- Decorative waveform --> <!-- Decorative waveform -->
<div class="dropzone-wave"> <div class="dropzone-wave">
<Waveform bars={DROPZONE_BARS} progress={0} {ACCENT} height={38} /> <Waveform bars={DROPZONE_BARS} progress={0} accent={ACCENT} height={38} />
</div> </div>
<input <input
@@ -586,4 +586,3 @@
} }
} }
</style> </style>

View File

@@ -12,12 +12,22 @@ const jobId = params.jobId;
const job = getJob(jobId); const job = getJob(jobId);
if (!job) throw error(404, 'Job not found'); if (!job) throw error(404, 'Job not found');
const whisperJob = (await request.json()) as WhisperJob;
// Discard the result if the job was cancelled locally while whisper was running // Discard the result if the job was cancelled locally while whisper was running
if (job.status === 'cancelled') { if (job.status === 'cancelled') {
return json({ ok: true }); return json({ ok: true });
} }
const whisperJob = (await request.json()) as WhisperJob; // Ignore stale callbacks from a previous whisper job after a local retry/reset.
if (job.whisperJobId && whisperJob.id !== job.whisperJobId) {
return json({ ok: true, ignored: 'stale_whisper_job' });
}
// Ignore replayed success callbacks after the transcript is already persisted.
if (job.status === 'done' && job.segmentsJson) {
return json({ ok: true, ignored: 'duplicate_webhook' });
}
if (whisperJob.status === 'failed' || whisperJob.status === 'cancelled') { if (whisperJob.status === 'failed' || whisperJob.status === 'cancelled') {
const msg = whisperJob.error ?? `Whisper job ${whisperJob.status}`; const msg = whisperJob.error ?? `Whisper job ${whisperJob.status}`;

View File

@@ -7,17 +7,18 @@ import { describe, it, expect, vi, afterEach } from 'vitest';
const execFileMock = vi.hoisted(() => { const execFileMock = vi.hoisted(() => {
const fn = vi.fn(); const fn = vi.fn();
type ExecFilePromisifyArgs = [string, string[]];
type ExecFileCallback = (err: Error | null, stdout: string, stderr: string) => void;
type ExecFileMock = (...args: [...ExecFilePromisifyArgs, ExecFileCallback]) => void;
const invoke = fn as unknown as ExecFileMock;
Object.defineProperty(fn, Symbol.for('nodejs.util.promisify.custom'), { Object.defineProperty(fn, Symbol.for('nodejs.util.promisify.custom'), {
configurable: true, configurable: true,
value: (...args: unknown[]) => value: (...args: ExecFilePromisifyArgs) =>
new Promise<{ stdout: string; stderr: string }>((resolve, reject) => { new Promise<{ stdout: string; stderr: string }>((resolve, reject) => {
(fn as ReturnType<typeof vi.fn>)( invoke(...args, (err: Error | null, stdout: string, stderr: string) => {
...args, if (err) reject(err);
(err: Error | null, stdout: string, stderr: string) => { else resolve({ stdout, stderr });
if (err) reject(err); });
else resolve({ stdout, stderr });
}
);
}) })
}); });
return fn; return fn;

View File

@@ -66,6 +66,50 @@ describe('deduplicateSegments — mergeConsecutive', () => {
}); });
}); });
// ── rolling prefix/suffix chain collapse ───────────────────────────────────────
// Covers the rolling-hypothesis collapse performed by deduplicateSegments.
// NOTE(review): seg() appears to take (index, start, end, text), judging by the
// expectations below — confirm against the helper's definition.
describe('deduplicateSegments — rolling backend hypotheses', () => {
it('collapses prefix-growth chains from stored backend segments', () => {
// Back-to-back segments: each one either extends the previous text or
// repeats its tail in a 0.01 s sliver.
const input = [
seg(0, 15.24, 16.6, 'Hello everyone.'),
seg(1, 16.6, 19.47, 'Hello everyone. Um, welcome to this talk.'),
seg(2, 19.47, 19.48, 'Um, welcome to this talk.'),
seg(3, 19.48, 21.67, "Um, welcome to this talk. I'll be speaking about small model"),
seg(4, 21.67, 21.68, "I'll be speaking about small model"),
seg(5, 21.68, 24.59, "I'll be speaking about small model inference and a gap that we've")
];
const result = deduplicateSegments(input);
// Six hypotheses reduce to two segments with no repeated text.
expect(result).toHaveLength(2);
expect(result[0]).toMatchObject({
index: 0,
start: 15.24,
end: 19.48,
text: 'Hello everyone. Um, welcome to this talk.'
});
expect(result[1]).toMatchObject({
index: 1,
start: 19.48,
end: 24.59,
text: "I'll be speaking about small model inference and a gap that we've"
});
});
it('does not collapse similar phrases when there is a real timing gap', () => {
// One second separates the segments, so the shared prefix must be kept.
const input = [
seg(0, 0, 1, 'Hello everyone.'),
seg(1, 2, 4, 'Hello everyone. Welcome back.')
];
const result = deduplicateSegments(input);
expect(result).toHaveLength(2);
expect(result[0].text).toBe('Hello everyone.');
expect(result[1].text).toBe('Hello everyone. Welcome back.');
});
});
// ── ngramDedup ──────────────────────────────────────────────────────────────── // ── ngramDedup ────────────────────────────────────────────────────────────────
describe('deduplicateSegments — ngramDedup', () => { describe('deduplicateSegments — ngramDedup', () => {

View File

@@ -132,6 +132,43 @@ describe('POST /api/webhook/[jobId] — locally cancelled job', () => {
}); });
}); });
// ── Duplicate / stale callback guards ──────────────────────────────────────────
// Covers the webhook guards that answer 200 with an `ignored` reason and leave
// the job untouched.
// NOTE(review): makeJob / makeEvent / makeWhisperJob / makeSeg are defined
// outside this hunk — their defaults are assumed, not shown here.
describe('POST /api/webhook/[jobId] — duplicate and stale callbacks', () => {
it('ignores replayed success callbacks after the transcript is already done', async () => {
// Job is already 'done' and has persisted segments.
mockGetJob.mockReturnValue({
...makeJob('job-done'),
status: 'done',
segmentsJson: JSON.stringify([makeSeg(0, 'Already saved.')]),
whisperJobId: 'whisper-id'
});
const res = await POST(makeEvent('job-done', makeWhisperJob()) as any);
expect(res.status).toBe(200);
expect(await res.json()).toEqual({ ok: true, ignored: 'duplicate_webhook' });
// No status change, job update or output write may happen.
expect(mockSetJobStatus).not.toHaveBeenCalled();
expect(mockUpdateJob).not.toHaveBeenCalled();
expect(mockWriteOutputs).not.toHaveBeenCalled();
});
it('ignores stale callbacks from an older whisper job after retry', async () => {
// The stored whisperJobId differs from the id carried by the callback.
mockGetJob.mockReturnValue({
...makeJob('job-stale'),
status: 'transcribing',
whisperJobId: 'current-whisper-job'
});
const res = await POST(
makeEvent('job-stale', makeWhisperJob({ id: 'old-whisper-job', segments: [makeSeg(0, 'stale')] })) as any
);
expect(res.status).toBe(200);
expect(await res.json()).toEqual({ ok: true, ignored: 'stale_whisper_job' });
// No status change, job update or output write may happen.
expect(mockSetJobStatus).not.toHaveBeenCalled();
expect(mockUpdateJob).not.toHaveBeenCalled();
expect(mockWriteOutputs).not.toHaveBeenCalled();
});
});
// ── Whisper job failed / cancelled ─────────────────────────────────────────── // ── Whisper job failed / cancelled ───────────────────────────────────────────
describe('POST /api/webhook/[jobId] — whisper failure', () => { describe('POST /api/webhook/[jobId] — whisper failure', () => {