fix(transcript): collapse rolling segment echoes
Normalize incremental backend hypothesis chains before persistence, and ignore stale or replayed webhook callbacks, so that duplicate transcript text does not survive ingest.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -16,10 +16,10 @@
|
||||
|
||||
type RecordState = 'idle' | 'requesting' | 'recording' | 'stopping';
|
||||
|
||||
let state = $state<RecordState>('idle');
|
||||
let error = $state('');
|
||||
let elapsed = $state(0); // seconds
|
||||
let liveData = $state<Float32Array | null>(null);
|
||||
let recordState: RecordState = $state('idle');
|
||||
let error: string = $state('');
|
||||
let elapsed: number = $state(0); // seconds
|
||||
let liveData: Float32Array | null = $state(null);
|
||||
|
||||
let mediaRecorder: MediaRecorder | null = null;
|
||||
let chunks: Blob[] = [];
|
||||
@@ -60,12 +60,12 @@
|
||||
|
||||
async function startRecording() {
|
||||
error = '';
|
||||
state = 'requesting';
|
||||
recordState = 'requesting';
|
||||
try {
|
||||
stream = await navigator.mediaDevices.getUserMedia({ audio: true });
|
||||
} catch {
|
||||
error = 'Microphone access denied';
|
||||
state = 'idle';
|
||||
recordState = 'idle';
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -81,11 +81,11 @@
|
||||
|
||||
elapsed = 0;
|
||||
timerInterval = setInterval(() => elapsed++, 1000);
|
||||
state = 'recording';
|
||||
recordState = 'recording';
|
||||
}
|
||||
|
||||
function stopRecording() {
|
||||
state = 'stopping';
|
||||
recordState = 'stopping';
|
||||
mediaRecorder?.stop();
|
||||
if (timerInterval) clearInterval(timerInterval);
|
||||
if (animFrame) cancelAnimationFrame(animFrame);
|
||||
@@ -99,7 +99,7 @@
|
||||
const ext = mime.includes('ogg') ? 'ogg' : mime.includes('mp4') ? 'mp4' : 'webm';
|
||||
const blob = new Blob(chunks, { type: mime });
|
||||
const filename = `recording-${new Date().toISOString().slice(0, 19).replace(/[T:]/g, '-')}.${ext}`;
|
||||
state = 'idle';
|
||||
recordState = 'idle';
|
||||
ondone?.(blob, filename);
|
||||
}
|
||||
|
||||
@@ -116,15 +116,18 @@
|
||||
{ length: IDLE_BARS },
|
||||
(_, i) => 3 + Math.abs(Math.sin(i * 0.7) + Math.cos(i * 0.31)) * 20
|
||||
);
|
||||
const liveBars = $derived.by<number[]>(() =>
|
||||
liveData ? Array.from(liveData.slice(0, IDLE_BARS), (value) => Number(value)) : []
|
||||
);
|
||||
</script>
|
||||
|
||||
<div class="recorder">
|
||||
<!-- Waveform display -->
|
||||
<div class="waveform-area" aria-hidden="true">
|
||||
{#if state === 'recording' && liveData}
|
||||
{#if recordState === 'recording' && liveData}
|
||||
<!-- Live waveform from AnalyserNode -->
|
||||
<svg viewBox="0 0 {IDLE_BARS * 5} 28" preserveAspectRatio="none" class="waveform-svg">
|
||||
{#each Array.from(liveData).slice(0, IDLE_BARS) as v, i}
|
||||
{#each liveBars as v, i}
|
||||
{@const h = 2 + v * 24}
|
||||
<rect
|
||||
x={i * 5}
|
||||
@@ -147,8 +150,8 @@
|
||||
width="3"
|
||||
height={h}
|
||||
rx="1.5"
|
||||
fill={state === 'idle' ? 'rgba(255,255,255,0.15)' : accent}
|
||||
opacity={state === 'idle' ? 1 : 0.3}
|
||||
fill={recordState === 'idle' ? 'rgba(255,255,255,0.15)' : accent}
|
||||
opacity={recordState === 'idle' ? 1 : 0.3}
|
||||
/>
|
||||
{/each}
|
||||
</svg>
|
||||
@@ -156,7 +159,7 @@
|
||||
</div>
|
||||
|
||||
<!-- Timer (recording only) -->
|
||||
{#if state === 'recording'}
|
||||
{#if recordState === 'recording'}
|
||||
<div class="timer" style="color: {accent}">
|
||||
<span class="rec-dot" style="background: {accent}"></span>
|
||||
{formatTime(elapsed)}
|
||||
@@ -170,15 +173,15 @@
|
||||
|
||||
<!-- Buttons -->
|
||||
<div class="btn-row">
|
||||
{#if state === 'idle' || state === 'requesting'}
|
||||
{#if recordState === 'idle' || recordState === 'requesting'}
|
||||
<button
|
||||
class="btn-record"
|
||||
style="background: {accent}; color: #0c0d10;"
|
||||
onclick={startRecording}
|
||||
disabled={state === 'requesting'}
|
||||
disabled={recordState === 'requesting'}
|
||||
aria-label="Start recording"
|
||||
>
|
||||
{#if state === 'requesting'}
|
||||
{#if recordState === 'requesting'}
|
||||
<svg width="13" height="13" viewBox="0 0 13 13" style="animation: spin 1s linear infinite">
|
||||
<circle cx="6.5" cy="6.5" r="5" stroke="currentColor" stroke-width="1.5" fill="none" stroke-dasharray="20 12"/>
|
||||
</svg>
|
||||
@@ -190,7 +193,7 @@
|
||||
Record
|
||||
{/if}
|
||||
</button>
|
||||
{:else if state === 'recording'}
|
||||
{:else if recordState === 'recording'}
|
||||
<button
|
||||
class="btn-stop"
|
||||
onclick={stopRecording}
|
||||
|
||||
@@ -37,6 +37,119 @@ function mergeConsecutive(segments: Segment[]): Segment[] {
|
||||
return out;
|
||||
}
|
||||
|
||||
// ── Collapse rolling prefix/suffix chains from backend segment hypotheses ──────
|
||||
|
||||
// Largest allowed distance in seconds between the previous segment's end and
// the next segment's start for the pair to still be compared as one rolling chain.
const MAX_CHAIN_GAP_SECS = 0.15;
|
||||
// Minimum number of normalised words a phrase needs to count as meaningful.
const MIN_MEANINGFUL_WORDS = 2;
|
||||
// Minimum length of the space-joined normalised phrase for it to count as meaningful.
const MIN_MEANINGFUL_CHARS = 8;
|
||||
// Minimum suffix/prefix overlap, in normalised words, before the overlapping
// lead of the next segment is trimmed away.
const MIN_OVERLAP_WORDS = 3;
|
||||
|
||||
/**
 * Breaks `text` into its whitespace-separated tokens.
 *
 * Leading and trailing whitespace is ignored and runs of whitespace count as a
 * single separator, so the result never contains empty strings.
 *
 * @param text - Raw segment text.
 * @returns The tokens in their original order and casing; `[]` for blank input.
 */
function splitWords(text: string): string[] {
	const tokens: string[] = [];
	for (const token of text.trim().split(/\s+/)) {
		// Splitting an empty string yields [''], which must not become a word.
		if (token) tokens.push(token);
	}
	return tokens;
}
|
||||
|
||||
/**
 * Converts `text` into a list of comparison keys, one per word.
 *
 * Each whitespace-separated token is lower-cased and stripped of everything
 * that is not a letter, combining mark, digit or underscore. Tokens that end up
 * empty (pure punctuation such as "—" or "...") are dropped, so the result can
 * be shorter than the raw token list.
 *
 * The character class is Unicode-aware: the ASCII-only `\w` would delete
 * accented and non-Latin letters, turning "café" into "caf" and erasing
 * Cyrillic or CJK words entirely, which breaks matching for non-English text.
 *
 * @param text - Raw segment text.
 * @returns Normalised words in original order; `[]` when nothing remains.
 */
function normaliseWords(text: string): string[] {
	return text
		.trim()
		.split(/\s+/)
		.map((word) => word.toLowerCase().replace(/[^\p{L}\p{M}\p{N}_]/gu, ''))
		.filter(Boolean);
}
|
||||
|
||||
/**
 * Tests two word lists for element-wise equality.
 *
 * @param a - First list.
 * @param b - Second list.
 * @returns `true` when both lists have the same length and hold strictly equal
 *   strings at every position.
 */
function arraysEqual(a: string[], b: string[]): boolean {
	if (a.length !== b.length) return false;
	for (let i = 0; i < a.length; i++) {
		if (a[i] !== b[i]) return false;
	}
	return true;
}
|
||||
|
||||
/**
 * Checks whether `full` begins with the word sequence `prefix`.
 *
 * @param full - The list to inspect.
 * @param prefix - The candidate leading sequence.
 * @returns `true` when `prefix` is no longer than `full` and matches its
 *   leading words exactly (an empty `prefix` always matches).
 */
function startsWithWords(full: string[], prefix: string[]): boolean {
	if (prefix.length > full.length) return false;
	const head = full.slice(0, prefix.length);
	return arraysEqual(head, prefix);
}
|
||||
|
||||
/**
 * Checks whether `full` ends with the word sequence `suffix`.
 *
 * @param full - The list to inspect.
 * @param suffix - The candidate trailing sequence.
 * @returns `true` when `suffix` is no longer than `full` and matches its
 *   trailing words exactly (an empty `suffix` always matches).
 */
function endsWithWords(full: string[], suffix: string[]): boolean {
	if (suffix.length > full.length) return false;
	const tail = full.slice(full.length - suffix.length);
	return arraysEqual(tail, suffix);
}
|
||||
|
||||
/**
 * Finds the longest run of words that ends `left` and also starts `right`.
 *
 * Candidate sizes are tried from the largest possible down to one, so the
 * first hit is the longest overlap.
 *
 * @param left - Words of the earlier segment.
 * @param right - Words of the later segment.
 * @returns The overlap length in words, or `0` when there is none.
 */
function suffixPrefixOverlap(left: string[], right: string[]): number {
	let size = Math.min(left.length, right.length);
	while (size > 0) {
		const tail = left.slice(left.length - size);
		const head = right.slice(0, size);
		if (arraysEqual(tail, head)) return size;
		size -= 1;
	}
	return 0;
}
|
||||
|
||||
/**
 * Decides whether a word list is substantial enough to be trusted as evidence
 * of a repeated phrase.
 *
 * @param words - Normalised words of the phrase.
 * @returns `true` when the list has at least `MIN_MEANINGFUL_WORDS` entries and
 *   its space-joined form is at least `MIN_MEANINGFUL_CHARS` characters long.
 */
function isMeaningfulPhrase(words: string[]): boolean {
	if (words.length < MIN_MEANINGFUL_WORDS) return false;
	const phrase = words.join(' ');
	return phrase.length >= MIN_MEANINGFUL_CHARS;
}
|
||||
|
||||
/**
 * Drops the first `count` whitespace-separated tokens from `text`.
 *
 * @param text - Raw segment text.
 * @param count - Number of leading raw tokens to remove.
 * @returns The remaining tokens joined by single spaces; `''` when nothing is left.
 */
function trimLeadingWords(text: string, count: number): string {
	const remaining = splitWords(text).slice(count);
	return remaining.join(' ').trim();
}
|
||||
|
||||
/**
 * Counts how many raw whitespace tokens at the start of `text` are needed to
 * cover its first `normalisedCount` normalised words.
 *
 * Punctuation-only tokens normalise to nothing, so they occupy a raw position
 * without contributing a normalised word; they are included in the returned
 * count whenever they sit inside the covered region.
 */
function rawWordsSpanning(text: string, normalisedCount: number): number {
	let rawCount = 0;
	let remaining = normalisedCount;
	for (const token of splitWords(text)) {
		if (remaining <= 0) break;
		if (normaliseWords(token).length > 0) remaining--;
		rawCount++;
	}
	return rawCount;
}

/**
 * Collapses "rolling" chains in which each segment repeats part of the
 * previous one.
 *
 * Segments are visited in order and compared with the most recently kept
 * segment. Empty segments are skipped. A segment that starts more than
 * `MAX_CHAIN_GAP_SECS` after the previous end, or whose text (or the previous
 * text) has no normalised words, is kept untouched. Otherwise, in priority order:
 *
 * 1. Prefix growth: the current words strictly extend the previous words and
 *    the previous phrase is meaningful. The previous segment takes over the
 *    current text, end time and word list.
 * 2. Suffix echo: the current words are a meaningful tail of the previous
 *    words. The current segment is dropped and the previous end is extended.
 * 3. Overlap: at least `MIN_OVERLAP_WORDS` trailing words of the previous
 *    segment reappear at the start of the current one. The repeated lead is
 *    cut from the current text, its start is clamped to the previous end, and
 *    its word list is cleared.
 *
 * The input array and its segment objects are not modified; kept segments are
 * shallow copies.
 *
 * @param segments - Segments in playback order.
 * @returns A new array with chained duplicates folded together.
 */
function collapseIncrementalSegments(segments: Segment[]): Segment[] {
	const out: Segment[] = [];

	for (const seg of segments) {
		// Work on a copy so later in-place updates never touch the caller's objects.
		let current: Segment = {
			...seg,
			text: seg.text.trim()
		};

		if (!current.text) continue;

		const last = out[out.length - 1];
		if (!last) {
			out.push(current);
			continue;
		}

		// A real pause means this is new speech, not another hypothesis of the same audio.
		const gap = current.start - last.end;
		if (gap > MAX_CHAIN_GAP_SECS) {
			out.push(current);
			continue;
		}

		const lastWords = normaliseWords(last.text);
		const currentWords = normaliseWords(current.text);
		if (lastWords.length === 0 || currentWords.length === 0) {
			out.push(current);
			continue;
		}

		// 1. Prefix growth: the newer, longer hypothesis supersedes the older one.
		if (
			currentWords.length > lastWords.length &&
			startsWithWords(currentWords, lastWords) &&
			isMeaningfulPhrase(lastWords)
		) {
			last.text = current.text;
			last.end = current.end;
			last.words = current.words;
			continue;
		}

		// 2. Suffix echo: everything in the current segment was already said.
		if (endsWithWords(lastWords, currentWords) && isMeaningfulPhrase(currentWords)) {
			last.end = Math.max(last.end, current.end);
			continue;
		}

		// 3. Partial overlap: keep only the part of the current segment that is new.
		const overlapWords = suffixPrefixOverlap(lastWords, currentWords);
		if (overlapWords >= MIN_OVERLAP_WORDS) {
			// The overlap is measured in normalised words, but trimming works on raw
			// tokens. Translate the count so punctuation-only tokens inside the
			// overlap (e.g. a standalone dash) do not leave a repeated word behind.
			const rawOverlap = rawWordsSpanning(current.text, overlapWords);
			const trimmedText = trimLeadingWords(current.text, rawOverlap);
			if (!trimmedText) {
				last.end = Math.max(last.end, current.end);
				continue;
			}

			current = {
				...current,
				start: Math.max(current.start, last.end),
				text: trimmedText,
				// Word entries no longer line up with the shortened text.
				words: []
			};
		}

		out.push(current);
	}

	return out;
}
|
||||
|
||||
// ── N-gram deduplication ─────────────────────────────────────────────────────
|
||||
|
||||
const NGRAM_N = 6;
|
||||
@@ -93,16 +206,22 @@ export function deduplicateSegments(segments: Segment[]): Segment[] {
|
||||
// 2. Remove empty segments
|
||||
result = result.filter((s) => s.text.length > 0);
|
||||
|
||||
// 3. First merge pass
|
||||
// 3. Collapse rolling backend hypotheses before generic dedup
|
||||
result = collapseIncrementalSegments(result);
|
||||
|
||||
// 4. First merge pass
|
||||
result = mergeConsecutive(result);
|
||||
|
||||
// 4. N-gram dedup
|
||||
// 5. N-gram dedup
|
||||
result = ngramDedup(result);
|
||||
|
||||
// 5. Second merge pass (catches new adjacencies after dedup)
|
||||
// 6. Re-run rolling collapse after removals create new adjacencies
|
||||
result = collapseIncrementalSegments(result);
|
||||
|
||||
// 7. Second merge pass (catches new adjacencies after dedup)
|
||||
result = mergeConsecutive(result);
|
||||
|
||||
// 6. Re-index
|
||||
// 8. Re-index
|
||||
result.forEach((s, i) => (s.index = i));
|
||||
|
||||
return result;
|
||||
|
||||
@@ -144,7 +144,7 @@
|
||||
|
||||
<!-- Decorative waveform -->
|
||||
<div class="dropzone-wave">
|
||||
<Waveform bars={DROPZONE_BARS} progress={0} {ACCENT} height={38} />
|
||||
<Waveform bars={DROPZONE_BARS} progress={0} accent={ACCENT} height={38} />
|
||||
</div>
|
||||
|
||||
<input
|
||||
@@ -586,4 +586,3 @@
|
||||
}
|
||||
}
|
||||
</style>
|
||||
|
||||
|
||||
@@ -12,12 +12,22 @@ const jobId = params.jobId;
|
||||
const job = getJob(jobId);
|
||||
if (!job) throw error(404, 'Job not found');
|
||||
|
||||
const whisperJob = (await request.json()) as WhisperJob;
|
||||
|
||||
// Discard the result if the job was cancelled locally while whisper was running
|
||||
if (job.status === 'cancelled') {
|
||||
return json({ ok: true });
|
||||
}
|
||||
|
||||
const whisperJob = (await request.json()) as WhisperJob;
|
||||
// Ignore stale callbacks from a previous whisper job after a local retry/reset.
|
||||
if (job.whisperJobId && whisperJob.id !== job.whisperJobId) {
|
||||
return json({ ok: true, ignored: 'stale_whisper_job' });
|
||||
}
|
||||
|
||||
// Ignore replayed success callbacks after the transcript is already persisted.
|
||||
if (job.status === 'done' && job.segmentsJson) {
|
||||
return json({ ok: true, ignored: 'duplicate_webhook' });
|
||||
}
|
||||
|
||||
if (whisperJob.status === 'failed' || whisperJob.status === 'cancelled') {
|
||||
const msg = whisperJob.error ?? `Whisper job ${whisperJob.status}`;
|
||||
|
||||
@@ -7,17 +7,18 @@ import { describe, it, expect, vi, afterEach } from 'vitest';
|
||||
|
||||
const execFileMock = vi.hoisted(() => {
|
||||
const fn = vi.fn();
|
||||
type ExecFilePromisifyArgs = [string, string[]];
|
||||
type ExecFileCallback = (err: Error | null, stdout: string, stderr: string) => void;
|
||||
type ExecFileMock = (...args: [...ExecFilePromisifyArgs, ExecFileCallback]) => void;
|
||||
const invoke = fn as unknown as ExecFileMock;
|
||||
Object.defineProperty(fn, Symbol.for('nodejs.util.promisify.custom'), {
|
||||
configurable: true,
|
||||
value: (...args: unknown[]) =>
|
||||
value: (...args: ExecFilePromisifyArgs) =>
|
||||
new Promise<{ stdout: string; stderr: string }>((resolve, reject) => {
|
||||
(fn as ReturnType<typeof vi.fn>)(
|
||||
...args,
|
||||
(err: Error | null, stdout: string, stderr: string) => {
|
||||
if (err) reject(err);
|
||||
else resolve({ stdout, stderr });
|
||||
}
|
||||
);
|
||||
invoke(...args, (err: Error | null, stdout: string, stderr: string) => {
|
||||
if (err) reject(err);
|
||||
else resolve({ stdout, stderr });
|
||||
});
|
||||
})
|
||||
});
|
||||
return fn;
|
||||
|
||||
@@ -66,6 +66,50 @@ describe('deduplicateSegments — mergeConsecutive', () => {
|
||||
});
|
||||
});
|
||||
|
||||
// ── rolling prefix/suffix chain collapse ───────────────────────────────────────
|
||||
|
||||
describe('deduplicateSegments — rolling backend hypotheses', () => {
|
||||
it('collapses prefix-growth chains from stored backend segments', () => {
|
||||
const input = [
|
||||
seg(0, 15.24, 16.6, 'Hello everyone.'),
|
||||
seg(1, 16.6, 19.47, 'Hello everyone. Um, welcome to this talk.'),
|
||||
seg(2, 19.47, 19.48, 'Um, welcome to this talk.'),
|
||||
seg(3, 19.48, 21.67, "Um, welcome to this talk. I'll be speaking about small model"),
|
||||
seg(4, 21.67, 21.68, "I'll be speaking about small model"),
|
||||
seg(5, 21.68, 24.59, "I'll be speaking about small model inference and a gap that we've")
|
||||
];
|
||||
|
||||
const result = deduplicateSegments(input);
|
||||
|
||||
expect(result).toHaveLength(2);
|
||||
expect(result[0]).toMatchObject({
|
||||
index: 0,
|
||||
start: 15.24,
|
||||
end: 19.48,
|
||||
text: 'Hello everyone. Um, welcome to this talk.'
|
||||
});
|
||||
expect(result[1]).toMatchObject({
|
||||
index: 1,
|
||||
start: 19.48,
|
||||
end: 24.59,
|
||||
text: "I'll be speaking about small model inference and a gap that we've"
|
||||
});
|
||||
});
|
||||
|
||||
it('does not collapse similar phrases when there is a real timing gap', () => {
|
||||
const input = [
|
||||
seg(0, 0, 1, 'Hello everyone.'),
|
||||
seg(1, 2, 4, 'Hello everyone. Welcome back.')
|
||||
];
|
||||
|
||||
const result = deduplicateSegments(input);
|
||||
|
||||
expect(result).toHaveLength(2);
|
||||
expect(result[0].text).toBe('Hello everyone.');
|
||||
expect(result[1].text).toBe('Hello everyone. Welcome back.');
|
||||
});
|
||||
});
|
||||
|
||||
// ── ngramDedup ────────────────────────────────────────────────────────────────
|
||||
|
||||
describe('deduplicateSegments — ngramDedup', () => {
|
||||
|
||||
@@ -132,6 +132,43 @@ describe('POST /api/webhook/[jobId] — locally cancelled job', () => {
|
||||
});
|
||||
});
|
||||
|
||||
// ── Duplicate / stale callback guards ──────────────────────────────────────────
|
||||
|
||||
describe('POST /api/webhook/[jobId] — duplicate and stale callbacks', () => {
|
||||
it('ignores replayed success callbacks after the transcript is already done', async () => {
|
||||
mockGetJob.mockReturnValue({
|
||||
...makeJob('job-done'),
|
||||
status: 'done',
|
||||
segmentsJson: JSON.stringify([makeSeg(0, 'Already saved.')]),
|
||||
whisperJobId: 'whisper-id'
|
||||
});
|
||||
|
||||
const res = await POST(makeEvent('job-done', makeWhisperJob()) as any);
|
||||
expect(res.status).toBe(200);
|
||||
expect(await res.json()).toEqual({ ok: true, ignored: 'duplicate_webhook' });
|
||||
expect(mockSetJobStatus).not.toHaveBeenCalled();
|
||||
expect(mockUpdateJob).not.toHaveBeenCalled();
|
||||
expect(mockWriteOutputs).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it('ignores stale callbacks from an older whisper job after retry', async () => {
|
||||
mockGetJob.mockReturnValue({
|
||||
...makeJob('job-stale'),
|
||||
status: 'transcribing',
|
||||
whisperJobId: 'current-whisper-job'
|
||||
});
|
||||
|
||||
const res = await POST(
|
||||
makeEvent('job-stale', makeWhisperJob({ id: 'old-whisper-job', segments: [makeSeg(0, 'stale')] })) as any
|
||||
);
|
||||
expect(res.status).toBe(200);
|
||||
expect(await res.json()).toEqual({ ok: true, ignored: 'stale_whisper_job' });
|
||||
expect(mockSetJobStatus).not.toHaveBeenCalled();
|
||||
expect(mockUpdateJob).not.toHaveBeenCalled();
|
||||
expect(mockWriteOutputs).not.toHaveBeenCalled();
|
||||
});
|
||||
});
|
||||
|
||||
// ── Whisper job failed / cancelled ───────────────────────────────────────────
|
||||
|
||||
describe('POST /api/webhook/[jobId] — whisper failure', () => {
|
||||
|
||||
Reference in New Issue
Block a user