fix(worker): collapse incremental segments
Normalize rolling partial-hypothesis chains before final job persistence so downstream clients receive stable transcript segments instead of echoed continuations.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
501
src/worker.rs
501
src/worker.rs
@@ -16,8 +16,7 @@ use crate::{
|
||||
models::{Job, JobId, JobStatus, ModelEvent, ModelState, Segment},
|
||||
storage::Storage,
|
||||
transcriber::Transcriber,
|
||||
webhook,
|
||||
AppError,
|
||||
webhook, AppError,
|
||||
};
|
||||
|
||||
/// Per-job broadcast channel for SSE subscribers.
|
||||
@@ -26,7 +25,11 @@ pub type ProgressTx = broadcast::Sender<ProgressEvent>;
|
||||
#[derive(Debug, Clone)]
|
||||
pub enum ProgressEvent {
|
||||
/// `percent` — overall 0–100; `chunk` — 1-based; `total` — total chunks.
|
||||
Progress { percent: u8, chunk: usize, total: usize },
|
||||
Progress {
|
||||
percent: u8,
|
||||
chunk: usize,
|
||||
total: usize,
|
||||
},
|
||||
Done(Box<Job>),
|
||||
Error(String),
|
||||
}
|
||||
@@ -50,11 +53,11 @@ pub enum WorkerCmd {
|
||||
// ── Transcription request/response types ─────────────────────────────────────
|
||||
|
||||
pub struct TranscribeRequest {
|
||||
pub pcm: Vec<f32>,
|
||||
pub language: Option<String>,
|
||||
pub task: String,
|
||||
pub pcm: Vec<f32>,
|
||||
pub language: Option<String>,
|
||||
pub task: String,
|
||||
pub on_progress: Box<dyn Fn(u8) + Send + 'static>,
|
||||
pub reply: oneshot::Sender<crate::Result<(Vec<Segment>, String)>>,
|
||||
pub reply: oneshot::Sender<crate::Result<(Vec<Segment>, String)>>,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for TranscribeRequest {
|
||||
@@ -75,15 +78,15 @@ impl std::fmt::Debug for TranscribeRequest {
|
||||
/// trigger loading.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub fn start(
|
||||
job_rx: mpsc::UnboundedReceiver<JobId>,
|
||||
storage: Arc<Storage>,
|
||||
model_path: PathBuf,
|
||||
queue_depth: Arc<AtomicUsize>,
|
||||
gpu_device: u32,
|
||||
model_state: Arc<RwLock<ModelState>>,
|
||||
model_event_tx: broadcast::Sender<ModelEvent>,
|
||||
job_rx: mpsc::UnboundedReceiver<JobId>,
|
||||
storage: Arc<Storage>,
|
||||
model_path: PathBuf,
|
||||
queue_depth: Arc<AtomicUsize>,
|
||||
gpu_device: u32,
|
||||
model_state: Arc<RwLock<ModelState>>,
|
||||
model_event_tx: broadcast::Sender<ModelEvent>,
|
||||
webhook_registry: Arc<Mutex<HashSet<String>>>,
|
||||
idle_timeout: Duration,
|
||||
idle_timeout: Duration,
|
||||
gpu_poll_interval: Duration,
|
||||
) -> (ProgressRegistry, std::sync::mpsc::SyncSender<WorkerCmd>) {
|
||||
let registry: ProgressRegistry = Arc::new(dashmap::DashMap::new());
|
||||
@@ -126,15 +129,15 @@ pub fn start(
|
||||
/// separate thread.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn transcriber_thread(
|
||||
rx: std::sync::mpsc::Receiver<WorkerCmd>,
|
||||
model_path: PathBuf,
|
||||
gpu_device: u32,
|
||||
model_state: Arc<RwLock<ModelState>>,
|
||||
model_event_tx: broadcast::Sender<ModelEvent>,
|
||||
rx: std::sync::mpsc::Receiver<WorkerCmd>,
|
||||
model_path: PathBuf,
|
||||
gpu_device: u32,
|
||||
model_state: Arc<RwLock<ModelState>>,
|
||||
model_event_tx: broadcast::Sender<ModelEvent>,
|
||||
webhook_registry: Arc<Mutex<HashSet<String>>>,
|
||||
idle_timeout: Duration,
|
||||
idle_timeout: Duration,
|
||||
gpu_poll_interval: Duration,
|
||||
rt: tokio::runtime::Handle,
|
||||
rt: tokio::runtime::Handle,
|
||||
) {
|
||||
let mut transcriber: Option<Transcriber> = None;
|
||||
let mut last_job = Instant::now();
|
||||
@@ -162,14 +165,22 @@ fn transcriber_thread(
|
||||
}
|
||||
|
||||
Ok(WorkerCmd::Unload) => {
|
||||
do_unload(&mut transcriber, &model_state, &model_event_tx, &webhook_registry, &rt);
|
||||
do_unload(
|
||||
&mut transcriber,
|
||||
&model_state,
|
||||
&model_event_tx,
|
||||
&webhook_registry,
|
||||
&rt,
|
||||
);
|
||||
}
|
||||
|
||||
Ok(WorkerCmd::Transcribe(req)) => {
|
||||
let t = match &mut transcriber {
|
||||
Some(t) => t,
|
||||
None => {
|
||||
tracing::warn!("Transcribe cmd received but model is unloaded — failing job");
|
||||
tracing::warn!(
|
||||
"Transcribe cmd received but model is unloaded — failing job"
|
||||
);
|
||||
let _ = req.reply.send(Err(AppError::Internal(
|
||||
"model unloaded before job could run".into(),
|
||||
)));
|
||||
@@ -177,12 +188,9 @@ fn transcriber_thread(
|
||||
}
|
||||
};
|
||||
|
||||
let result = t.transcribe(
|
||||
&req.pcm,
|
||||
req.language.as_deref(),
|
||||
&req.task,
|
||||
move |p| (req.on_progress)(p),
|
||||
);
|
||||
let result = t.transcribe(&req.pcm, req.language.as_deref(), &req.task, move |p| {
|
||||
(req.on_progress)(p)
|
||||
});
|
||||
last_job = Instant::now();
|
||||
let _ = req.reply.send(result);
|
||||
}
|
||||
@@ -218,14 +226,14 @@ fn transcriber_thread(
|
||||
/// rejection. Returns `Some(Transcriber)` on success, `None` if cancelled.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
fn try_load_with_polling(
|
||||
rx: &std::sync::mpsc::Receiver<WorkerCmd>,
|
||||
model_path: &PathBuf,
|
||||
gpu_device: u32,
|
||||
model_state: &Arc<RwLock<ModelState>>,
|
||||
model_event_tx: &broadcast::Sender<ModelEvent>,
|
||||
rx: &std::sync::mpsc::Receiver<WorkerCmd>,
|
||||
model_path: &PathBuf,
|
||||
gpu_device: u32,
|
||||
model_state: &Arc<RwLock<ModelState>>,
|
||||
model_event_tx: &broadcast::Sender<ModelEvent>,
|
||||
webhook_registry: &Arc<Mutex<HashSet<String>>>,
|
||||
gpu_poll_interval: Duration,
|
||||
rt: &tokio::runtime::Handle,
|
||||
rt: &tokio::runtime::Handle,
|
||||
) -> Option<Transcriber> {
|
||||
loop {
|
||||
set_state(model_state, ModelState::Loading);
|
||||
@@ -253,25 +261,35 @@ fn try_load_with_polling(
|
||||
"insufficient VRAM — will retry"
|
||||
);
|
||||
|
||||
set_state(model_state, ModelState::WaitingForGpu {
|
||||
vram_needed_mb,
|
||||
vram_free_mb,
|
||||
retry_in_secs,
|
||||
});
|
||||
broadcast_event(model_event_tx, ModelEvent::ModelWaitingForGpu {
|
||||
vram_needed_mb,
|
||||
vram_free_mb,
|
||||
retry_in_secs,
|
||||
});
|
||||
set_state(
|
||||
model_state,
|
||||
ModelState::WaitingForGpu {
|
||||
vram_needed_mb,
|
||||
vram_free_mb,
|
||||
retry_in_secs,
|
||||
},
|
||||
);
|
||||
broadcast_event(
|
||||
model_event_tx,
|
||||
ModelEvent::ModelWaitingForGpu {
|
||||
vram_needed_mb,
|
||||
vram_free_mb,
|
||||
retry_in_secs,
|
||||
},
|
||||
);
|
||||
|
||||
// Interruptible sleep: drain rx while waiting for gpu_poll_interval.
|
||||
let deadline = Instant::now() + gpu_poll_interval;
|
||||
loop {
|
||||
let remaining = deadline.saturating_duration_since(Instant::now());
|
||||
if remaining.is_zero() { break; }
|
||||
if remaining.is_zero() {
|
||||
break;
|
||||
}
|
||||
match rx.recv_timeout(remaining.min(Duration::from_secs(1))) {
|
||||
Ok(WorkerCmd::Unload) => {
|
||||
tracing::info!("Unload received while waiting for GPU — cancelling load");
|
||||
tracing::info!(
|
||||
"Unload received while waiting for GPU — cancelling load"
|
||||
);
|
||||
set_state(model_state, ModelState::Unloaded);
|
||||
broadcast_event(model_event_tx, ModelEvent::ModelUnloaded);
|
||||
fire_webhooks(webhook_registry, ModelEvent::ModelUnloaded, rt);
|
||||
@@ -303,11 +321,11 @@ fn try_load_with_polling(
|
||||
}
|
||||
|
||||
fn do_unload(
|
||||
transcriber: &mut Option<Transcriber>,
|
||||
model_state: &Arc<RwLock<ModelState>>,
|
||||
model_event_tx: &broadcast::Sender<ModelEvent>,
|
||||
transcriber: &mut Option<Transcriber>,
|
||||
model_state: &Arc<RwLock<ModelState>>,
|
||||
model_event_tx: &broadcast::Sender<ModelEvent>,
|
||||
webhook_registry: &Arc<Mutex<HashSet<String>>>,
|
||||
rt: &tokio::runtime::Handle,
|
||||
rt: &tokio::runtime::Handle,
|
||||
) {
|
||||
*transcriber = None;
|
||||
set_state(model_state, ModelState::Unloaded);
|
||||
@@ -328,8 +346,8 @@ fn broadcast_event(tx: &broadcast::Sender<ModelEvent>, event: ModelEvent) {
|
||||
|
||||
fn fire_webhooks(
|
||||
registry: &Arc<Mutex<HashSet<String>>>,
|
||||
event: ModelEvent,
|
||||
rt: &tokio::runtime::Handle,
|
||||
event: ModelEvent,
|
||||
rt: &tokio::runtime::Handle,
|
||||
) {
|
||||
if !event.is_webhook_event() {
|
||||
return;
|
||||
@@ -341,11 +359,16 @@ fn fire_webhooks(
|
||||
.cloned()
|
||||
.collect();
|
||||
|
||||
if urls.is_empty() { return; }
|
||||
if urls.is_empty() {
|
||||
return;
|
||||
}
|
||||
|
||||
let payload = match serde_json::to_string(&event) {
|
||||
Ok(p) => p,
|
||||
Err(e) => { tracing::error!(error = %e, "failed to serialize model event"); return; }
|
||||
Ok(p) => p,
|
||||
Err(e) => {
|
||||
tracing::error!(error = %e, "failed to serialize model event");
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
for url in urls {
|
||||
@@ -356,7 +379,8 @@ fn fire_webhooks(
|
||||
.build()
|
||||
.expect("http client");
|
||||
for attempt in 0..3_u32 {
|
||||
match http.post(&url)
|
||||
match http
|
||||
.post(&url)
|
||||
.header("content-type", "application/json")
|
||||
.body(body.clone())
|
||||
.send()
|
||||
@@ -405,11 +429,11 @@ fn parse_oom_vram(msg: &str, gpu_device: u32) -> (u64, u64) {
|
||||
// ── Async job runner ──────────────────────────────────────────────────────────
|
||||
|
||||
async fn run(
|
||||
mut job_rx: mpsc::UnboundedReceiver<JobId>,
|
||||
storage: Arc<Storage>,
|
||||
mut job_rx: mpsc::UnboundedReceiver<JobId>,
|
||||
storage: Arc<Storage>,
|
||||
queue_depth: Arc<AtomicUsize>,
|
||||
registry: ProgressRegistry,
|
||||
cmd_tx: std::sync::mpsc::SyncSender<WorkerCmd>,
|
||||
registry: ProgressRegistry,
|
||||
cmd_tx: std::sync::mpsc::SyncSender<WorkerCmd>,
|
||||
) {
|
||||
let http = Client::builder()
|
||||
.timeout(Duration::from_secs(30))
|
||||
@@ -420,7 +444,7 @@ async fn run(
|
||||
queue_depth.fetch_sub(1, Ordering::Relaxed);
|
||||
|
||||
let mut job = match storage.get(&job_id).await {
|
||||
Ok(j) => j,
|
||||
Ok(j) => j,
|
||||
Err(e) => {
|
||||
tracing::warn!(job_id = %job_id, error = %e, "job vanished before processing");
|
||||
registry.remove(&job_id);
|
||||
@@ -461,19 +485,19 @@ async fn run(
|
||||
|
||||
match result {
|
||||
Ok((segments, language, duration_secs)) => {
|
||||
job.status = JobStatus::Done;
|
||||
job.segments = segments;
|
||||
job.language = Some(language);
|
||||
job.status = JobStatus::Done;
|
||||
job.segments = segments;
|
||||
job.language = Some(language);
|
||||
job.duration_secs = Some(duration_secs);
|
||||
job.progress = 100;
|
||||
job.completed_at = Some(Utc::now());
|
||||
job.progress = 100;
|
||||
job.completed_at = Some(Utc::now());
|
||||
let _ = progress_tx.send(ProgressEvent::Done(Box::new(job.clone())));
|
||||
}
|
||||
Err(e) => {
|
||||
let msg = e.to_string();
|
||||
tracing::error!(job_id = %job_id, error = %msg, "transcription failed");
|
||||
job.status = JobStatus::Failed;
|
||||
job.error = Some(msg.clone());
|
||||
job.status = JobStatus::Failed;
|
||||
job.error = Some(msg.clone());
|
||||
job.completed_at = Some(Utc::now());
|
||||
let _ = progress_tx.send(ProgressEvent::Error(msg));
|
||||
}
|
||||
@@ -485,9 +509,11 @@ async fn run(
|
||||
|
||||
if let Some(url) = &job.webhook_url.clone() {
|
||||
let http = http.clone();
|
||||
let url = url.clone();
|
||||
let job = job.clone();
|
||||
tokio::spawn(async move { webhook::fire(&http, &url, &job).await; });
|
||||
let url = url.clone();
|
||||
let job = job.clone();
|
||||
tokio::spawn(async move {
|
||||
webhook::fire(&http, &url, &job).await;
|
||||
});
|
||||
}
|
||||
|
||||
tokio::time::sleep(Duration::from_secs(30)).await;
|
||||
@@ -498,9 +524,9 @@ async fn run(
|
||||
// ── Silence-based chunking ────────────────────────────────────────────────────
|
||||
|
||||
const TARGET_CHUNK_SECS: f32 = 60.0;
|
||||
const SNAP_WINDOW_SECS: f32 = 30.0;
|
||||
const SILENCE_DB: &str = "-35dB";
|
||||
const SILENCE_DUR: &str = "0.4";
|
||||
const SNAP_WINDOW_SECS: f32 = 30.0;
|
||||
const SILENCE_DB: &str = "-35dB";
|
||||
const SILENCE_DUR: &str = "0.4";
|
||||
|
||||
async fn detect_silence_midpoints(path: &std::path::Path) -> Vec<f32> {
|
||||
use tokio::process::Command;
|
||||
@@ -509,15 +535,19 @@ async fn detect_silence_midpoints(path: &std::path::Path) -> Vec<f32> {
|
||||
let output = Command::new("ffmpeg")
|
||||
.args([
|
||||
"-nostdin",
|
||||
"-i", path.to_str().unwrap_or(""),
|
||||
"-af", &filter,
|
||||
"-f", "null", "-",
|
||||
"-i",
|
||||
path.to_str().unwrap_or(""),
|
||||
"-af",
|
||||
&filter,
|
||||
"-f",
|
||||
"null",
|
||||
"-",
|
||||
])
|
||||
.output()
|
||||
.await;
|
||||
|
||||
let output = match output {
|
||||
Ok(o) => o,
|
||||
Ok(o) => o,
|
||||
Err(e) => {
|
||||
tracing::warn!(error = %e, "silencedetect unavailable; using hard cuts");
|
||||
return Vec::new();
|
||||
@@ -526,7 +556,7 @@ async fn detect_silence_midpoints(path: &std::path::Path) -> Vec<f32> {
|
||||
|
||||
let stderr = String::from_utf8_lossy(&output.stderr);
|
||||
let mut starts: Vec<f32> = Vec::new();
|
||||
let mut ends: Vec<f32> = Vec::new();
|
||||
let mut ends: Vec<f32> = Vec::new();
|
||||
|
||||
for line in stderr.lines() {
|
||||
if let Some(i) = line.find("silence_start: ") {
|
||||
@@ -545,7 +575,9 @@ async fn detect_silence_midpoints(path: &std::path::Path) -> Vec<f32> {
|
||||
}
|
||||
}
|
||||
|
||||
let mids: Vec<f32> = starts.iter().zip(ends.iter())
|
||||
let mids: Vec<f32> = starts
|
||||
.iter()
|
||||
.zip(ends.iter())
|
||||
.map(|(s, e)| (s + e) / 2.0)
|
||||
.collect();
|
||||
|
||||
@@ -553,18 +585,15 @@ async fn detect_silence_midpoints(path: &std::path::Path) -> Vec<f32> {
|
||||
mids
|
||||
}
|
||||
|
||||
fn snap_to_silence(
|
||||
mids: &[f32],
|
||||
total_secs: f32,
|
||||
target_secs: f32,
|
||||
snap_window: f32,
|
||||
) -> Vec<f32> {
|
||||
fn snap_to_silence(mids: &[f32], total_secs: f32, target_secs: f32, snap_window: f32) -> Vec<f32> {
|
||||
let mut cuts: Vec<f32> = Vec::new();
|
||||
let mut pos = target_secs;
|
||||
|
||||
while pos < total_secs - target_secs * 0.25 {
|
||||
let prev_cut = cuts.last().copied().unwrap_or(0.0);
|
||||
let best = mids.iter().copied()
|
||||
let best = mids
|
||||
.iter()
|
||||
.copied()
|
||||
.filter(|&t| t > prev_cut + 10.0 && (t - pos).abs() <= snap_window)
|
||||
.min_by(|a, b| (a - pos).abs().partial_cmp(&(b - pos).abs()).unwrap());
|
||||
let cut = best.unwrap_or(pos);
|
||||
@@ -591,20 +620,165 @@ fn to_chunk_ranges(cuts: &[f32], total_secs: f32) -> Vec<(f32, f32)> {
|
||||
ranges
|
||||
}
|
||||
|
||||
/// Largest gap (in seconds, per the name) between the previous segment's end and the next
/// segment's start for the two to still be considered part of one incremental chain.
const MAX_CHAIN_GAP_SECS: f32 = 0.15;
|
||||
/// Minimum number of normalised words a phrase needs before it counts as meaningful.
const MIN_MEANINGFUL_WORDS: usize = 2;
|
||||
/// Minimum combined length of a phrase's normalised words before it counts as meaningful.
const MIN_MEANINGFUL_CHARS: usize = 8;
|
||||
/// Minimum number of words the tail of one segment must share with the head of the next
/// before the repeated head is trimmed off the later segment.
const MIN_OVERLAP_WORDS: usize = 3;
|
||||
|
||||
/// Reduces `text` to a list of comparison keys, one per whitespace-separated token.
///
/// Each token keeps only its alphanumeric characters and underscores, lowercased.
/// Tokens that end up empty (pure punctuation) are dropped from the result.
fn normalised_words(text: &str) -> Vec<String> {
    let mut keys: Vec<String> = Vec::new();

    for token in text.split_whitespace() {
        let mut key = String::new();
        for ch in token.chars() {
            if ch == '_' || ch.is_alphanumeric() {
                // `to_lowercase` may expand to several chars; keep all of them.
                key.extend(ch.to_lowercase());
            }
        }
        if !key.is_empty() {
            keys.push(key);
        }
    }

    keys
}
|
||||
|
||||
/// Returns `true` when `prefix` matches the leading words of `full`, element for element.
///
/// An empty `prefix` always matches; a `prefix` longer than `full` never does.
fn starts_with_words(full: &[String], prefix: &[String]) -> bool {
    full.starts_with(prefix)
}
|
||||
|
||||
/// Returns `true` when `suffix` matches the trailing words of `full`, element for element.
///
/// An empty `suffix` always matches; a `suffix` longer than `full` never does.
fn ends_with_words(full: &[String], suffix: &[String]) -> bool {
    full.ends_with(suffix)
}
|
||||
|
||||
/// Returns the length of the longest run of words that is both a suffix of `left`
/// and a prefix of `right`, or `0` when the two do not overlap at all.
fn suffix_prefix_overlap(left: &[String], right: &[String]) -> usize {
    let longest = left.len().min(right.len());

    // Try the largest candidate first so the first hit is the longest overlap.
    (1..=longest)
        .rev()
        .find(|&len| left.ends_with(&right[..len]))
        .unwrap_or(0)
}
|
||||
|
||||
fn is_meaningful_phrase(words: &[String]) -> bool {
|
||||
words.len() >= MIN_MEANINGFUL_WORDS
|
||||
&& words.iter().map(|word| word.len()).sum::<usize>() >= MIN_MEANINGFUL_CHARS
|
||||
}
|
||||
|
||||
/// Drops the first `count` words from `text` and returns the rest joined by single spaces.
///
/// `count` is expressed in *normalised* words: a whitespace-separated token only counts
/// as a word if it contains at least one alphanumeric character or underscore. Tokens made
/// purely of punctuation (for example a free-standing dash) are skipped along with the
/// words around them but do not use up `count`. This keeps the function in step with
/// overlap lengths computed on normalised word lists, where such tokens do not appear.
///
/// Returns an empty string when `text` has `count` or fewer words.
fn trim_leading_words(text: &str, count: usize) -> String {
    let mut remaining = count;

    text.split_whitespace()
        .skip_while(|token| {
            if remaining == 0 {
                return false;
            }
            if token.chars().any(|ch| ch.is_alphanumeric() || ch == '_') {
                remaining -= 1;
            }
            true
        })
        .collect::<Vec<_>>()
        .join(" ")
}
|
||||
|
||||
fn merge_identical_segments(segments: Vec<Segment>) -> Vec<Segment> {
|
||||
let mut out: Vec<Segment> = Vec::with_capacity(segments.len());
|
||||
|
||||
for seg in segments {
|
||||
if let Some(last) = out.last_mut() {
|
||||
if normalised_words(&last.text) == normalised_words(&seg.text) {
|
||||
last.end = last.end.max(seg.end);
|
||||
if !seg.words.is_empty() {
|
||||
last.words = seg.words;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
out.push(seg);
|
||||
}
|
||||
|
||||
out
|
||||
}
|
||||
|
||||
fn collapse_incremental_segments(segments: Vec<Segment>) -> Vec<Segment> {
|
||||
let mut out: Vec<Segment> = Vec::with_capacity(segments.len());
|
||||
|
||||
for mut seg in segments {
|
||||
seg.text = seg.text.trim().to_string();
|
||||
if seg.text.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let Some(last) = out.last_mut() else {
|
||||
out.push(seg);
|
||||
continue;
|
||||
};
|
||||
|
||||
let gap = seg.start - last.end;
|
||||
if gap > MAX_CHAIN_GAP_SECS {
|
||||
out.push(seg);
|
||||
continue;
|
||||
}
|
||||
|
||||
let last_words = normalised_words(&last.text);
|
||||
let seg_words = normalised_words(&seg.text);
|
||||
if last_words.is_empty() || seg_words.is_empty() {
|
||||
out.push(seg);
|
||||
continue;
|
||||
}
|
||||
|
||||
if seg_words.len() > last_words.len()
|
||||
&& starts_with_words(&seg_words, &last_words)
|
||||
&& is_meaningful_phrase(&last_words)
|
||||
{
|
||||
last.text = seg.text;
|
||||
last.end = seg.end;
|
||||
last.words = seg.words;
|
||||
continue;
|
||||
}
|
||||
|
||||
if ends_with_words(&last_words, &seg_words) && is_meaningful_phrase(&seg_words) {
|
||||
last.end = last.end.max(seg.end);
|
||||
continue;
|
||||
}
|
||||
|
||||
let overlap = suffix_prefix_overlap(&last_words, &seg_words);
|
||||
if overlap >= MIN_OVERLAP_WORDS {
|
||||
let trimmed_text = trim_leading_words(&seg.text, overlap);
|
||||
if trimmed_text.is_empty() {
|
||||
last.end = last.end.max(seg.end);
|
||||
continue;
|
||||
}
|
||||
|
||||
seg.start = seg.start.max(last.end);
|
||||
seg.text = trimmed_text;
|
||||
seg.words.clear();
|
||||
}
|
||||
|
||||
out.push(seg);
|
||||
}
|
||||
|
||||
out
|
||||
}
|
||||
|
||||
fn normalise_segments(segments: Vec<Segment>) -> Vec<Segment> {
|
||||
let mut result = collapse_incremental_segments(segments);
|
||||
result = merge_identical_segments(result);
|
||||
result = collapse_incremental_segments(result);
|
||||
merge_identical_segments(result)
|
||||
}
|
||||
|
||||
// ── Job processing ────────────────────────────────────────────────────────────
|
||||
|
||||
async fn process_job(
|
||||
job: &Job,
|
||||
audio_path: &std::path::Path,
|
||||
job: &Job,
|
||||
audio_path: &std::path::Path,
|
||||
progress_tx: &ProgressTx,
|
||||
cmd_tx: &std::sync::mpsc::SyncSender<WorkerCmd>,
|
||||
storage: &Arc<Storage>,
|
||||
cmd_tx: &std::sync::mpsc::SyncSender<WorkerCmd>,
|
||||
storage: &Arc<Storage>,
|
||||
) -> crate::Result<(Vec<Segment>, String, f32)> {
|
||||
let pcm = decode_audio(audio_path).await?;
|
||||
let total_secs = pcm.len() as f32 / 16_000.0;
|
||||
|
||||
let silence_mids = detect_silence_midpoints(audio_path).await;
|
||||
let cuts = snap_to_silence(&silence_mids, total_secs, TARGET_CHUNK_SECS, SNAP_WINDOW_SECS);
|
||||
let cuts = snap_to_silence(
|
||||
&silence_mids,
|
||||
total_secs,
|
||||
TARGET_CHUNK_SECS,
|
||||
SNAP_WINDOW_SECS,
|
||||
);
|
||||
let chunks = to_chunk_ranges(&cuts, total_secs);
|
||||
let n = chunks.len();
|
||||
|
||||
@@ -620,12 +794,12 @@ async fn process_job(
|
||||
|
||||
for (ci, (chunk_start, chunk_end)) in chunks.iter().enumerate() {
|
||||
let s0 = (*chunk_start * 16_000.0) as usize;
|
||||
let s1 = ((*chunk_end * 16_000.0) as usize).min(pcm.len());
|
||||
let s1 = ((*chunk_end * 16_000.0) as usize).min(pcm.len());
|
||||
let mut chunk_pcm = pcm[s0..s1].to_vec();
|
||||
trim_trailing_silence(&mut chunk_pcm);
|
||||
|
||||
let base = (ci * 100 / n) as u8;
|
||||
let span = (100usize / n).max(1) as u8;
|
||||
let base = (ci * 100 / n) as u8;
|
||||
let span = (100usize / n).max(1) as u8;
|
||||
|
||||
// Save progress to disk before emitting SSE — polling clients who respond
|
||||
// immediately to the SSE event will then see consistent state.
|
||||
@@ -637,49 +811,52 @@ async fn process_job(
|
||||
|
||||
let _ = progress_tx.send(ProgressEvent::Progress {
|
||||
percent: base,
|
||||
chunk: ci + 1,
|
||||
total: n,
|
||||
chunk: ci + 1,
|
||||
total: n,
|
||||
});
|
||||
|
||||
let tx = progress_tx.clone();
|
||||
let tx = progress_tx.clone();
|
||||
let chunk_num = ci + 1;
|
||||
let on_progress = Box::new(move |p: u8| {
|
||||
let overall = base.saturating_add(p.saturating_mul(span) / 100);
|
||||
let _ = tx.send(ProgressEvent::Progress {
|
||||
percent: overall,
|
||||
chunk: chunk_num,
|
||||
total: n,
|
||||
chunk: chunk_num,
|
||||
total: n,
|
||||
});
|
||||
});
|
||||
|
||||
let (reply_tx, reply_rx) = oneshot::channel();
|
||||
cmd_tx.send(WorkerCmd::Transcribe(TranscribeRequest {
|
||||
pcm: chunk_pcm,
|
||||
language: job.language.clone(),
|
||||
task: job.task.clone(),
|
||||
on_progress,
|
||||
reply: reply_tx,
|
||||
})).map_err(|_| AppError::Internal("worker command channel closed".into()))?;
|
||||
cmd_tx
|
||||
.send(WorkerCmd::Transcribe(TranscribeRequest {
|
||||
pcm: chunk_pcm,
|
||||
language: job.language.clone(),
|
||||
task: job.task.clone(),
|
||||
on_progress,
|
||||
reply: reply_tx,
|
||||
}))
|
||||
.map_err(|_| AppError::Internal("worker command channel closed".into()))?;
|
||||
|
||||
let (mut segs, lang) = reply_rx.await
|
||||
let (mut segs, lang) = reply_rx
|
||||
.await
|
||||
.map_err(|_| AppError::Internal("transcriber thread dropped reply".into()))??;
|
||||
|
||||
let offset = *chunk_start;
|
||||
for seg in &mut segs {
|
||||
seg.start += offset;
|
||||
seg.end += offset;
|
||||
seg.end += offset;
|
||||
for word in &mut seg.words {
|
||||
word.start += offset;
|
||||
word.end += offset;
|
||||
word.end += offset;
|
||||
}
|
||||
}
|
||||
|
||||
tracing::debug!(
|
||||
chunk = ci + 1,
|
||||
of = n,
|
||||
of = n,
|
||||
start = chunk_start,
|
||||
end = chunk_end,
|
||||
segs = segs.len(),
|
||||
end = chunk_end,
|
||||
segs = segs.len(),
|
||||
"chunk done"
|
||||
);
|
||||
|
||||
@@ -689,24 +866,30 @@ async fn process_job(
|
||||
}
|
||||
}
|
||||
|
||||
all_segments = normalise_segments(all_segments);
|
||||
|
||||
for (i, seg) in all_segments.iter_mut().enumerate() {
|
||||
seg.index = i as i32;
|
||||
}
|
||||
|
||||
let _ = progress_tx.send(ProgressEvent::Progress { percent: 100, chunk: n, total: n });
|
||||
let _ = progress_tx.send(ProgressEvent::Progress {
|
||||
percent: 100,
|
||||
chunk: n,
|
||||
total: n,
|
||||
});
|
||||
Ok((all_segments, language, total_secs))
|
||||
}
|
||||
|
||||
fn trim_trailing_silence(pcm: &mut Vec<f32>) {
|
||||
const THRESHOLD: f32 = 0.017_8;
|
||||
const PADDING: usize = 8_000;
|
||||
const PADDING: usize = 8_000;
|
||||
|
||||
if let Some(last_loud) = pcm.iter().rposition(|&s| s.abs() > THRESHOLD) {
|
||||
let new_len = (last_loud + 1 + PADDING).min(pcm.len());
|
||||
if new_len < pcm.len() {
|
||||
tracing::trace!(
|
||||
original_samples = pcm.len(),
|
||||
trimmed_samples = pcm.len() - new_len,
|
||||
trimmed_samples = pcm.len() - new_len,
|
||||
"trimmed trailing silence"
|
||||
);
|
||||
pcm.truncate(new_len);
|
||||
@@ -719,11 +902,17 @@ async fn decode_audio(path: &std::path::Path) -> crate::Result<Vec<f32>> {
|
||||
|
||||
let output = Command::new("ffmpeg")
|
||||
.args([
|
||||
"-nostdin", "-threads", "0",
|
||||
"-i", path.to_str().unwrap_or(""),
|
||||
"-f", "f32le",
|
||||
"-ac", "1",
|
||||
"-ar", "16000",
|
||||
"-nostdin",
|
||||
"-threads",
|
||||
"0",
|
||||
"-i",
|
||||
path.to_str().unwrap_or(""),
|
||||
"-f",
|
||||
"f32le",
|
||||
"-ac",
|
||||
"1",
|
||||
"-ar",
|
||||
"16000",
|
||||
"-",
|
||||
])
|
||||
.output()
|
||||
@@ -760,13 +949,28 @@ pub fn audio_path_for(id: &JobId) -> PathBuf {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::models::Word;
|
||||
|
||||
fn segment(index: i32, start: f32, end: f32, text: &str) -> Segment {
|
||||
Segment {
|
||||
index,
|
||||
start,
|
||||
end,
|
||||
text: text.into(),
|
||||
words: Vec::<Word>::new(),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_snap_to_silence_uses_nearest_midpoint() {
|
||||
let mids = vec![55.0, 58.0, 62.0];
|
||||
let cuts = snap_to_silence(&mids, 120.0, 60.0, 30.0);
|
||||
assert!(!cuts.is_empty());
|
||||
assert!((cuts[0] - 58.0).abs() < 0.01, "expected ~58.0, got {}", cuts[0]);
|
||||
assert!(
|
||||
(cuts[0] - 58.0).abs() < 0.01,
|
||||
"expected ~58.0, got {}",
|
||||
cuts[0]
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -801,4 +1005,53 @@ mod tests {
|
||||
trim_trailing_silence(&mut pcm);
|
||||
assert_eq!(pcm.len(), (10_001 + 8_000).min(32_000));
|
||||
}
|
||||
|
||||
#[test]
fn test_normalise_segments_collapses_prefix_growth_chain() {
    // A rolling-hypothesis chain: each entry either grows the previous text or
    // re-emits its tail as a near-zero-length segment.
    let chain: [(f32, f32, &str); 6] = [
        (15.24, 16.6, "Hello everyone."),
        (16.6, 19.47, "Hello everyone. Um, welcome to this talk."),
        (19.47, 19.48, "Um, welcome to this talk."),
        (
            19.48,
            21.67,
            "Um, welcome to this talk. I'll be speaking about small model",
        ),
        (21.67, 21.68, "I'll be speaking about small model"),
        (
            21.68,
            24.59,
            "I'll be speaking about small model inference and a gap that we've",
        ),
    ];
    let input: Vec<Segment> = chain
        .iter()
        .enumerate()
        .map(|(i, &(start, end, text))| segment(i as i32, start, end, text))
        .collect();

    let result = normalise_segments(input);

    let close = |actual: f32, expected: f32| (actual - expected).abs() < 0.01;

    assert_eq!(result.len(), 2);

    let first = &result[0];
    assert_eq!(first.text, "Hello everyone. Um, welcome to this talk.");
    assert!(close(first.start, 15.24));
    assert!(close(first.end, 19.48));

    let second = &result[1];
    assert_eq!(
        second.text,
        "I'll be speaking about small model inference and a gap that we've"
    );
    assert!(close(second.start, 19.48));
    assert!(close(second.end, 24.59));
}
|
||||
|
||||
#[test]
fn test_normalise_segments_keeps_real_gap() {
    // The second segment repeats the first as a prefix, but starts a full second
    // after it ends, so the two must stay separate.
    let before_gap = segment(0, 0.0, 1.0, "Hello everyone.");
    let after_gap = segment(1, 2.0, 4.0, "Hello everyone. Welcome back.");

    let result = normalise_segments(vec![before_gap, after_gap]);

    assert_eq!(result.len(), 2);
    assert_eq!(result[0].text, "Hello everyone.");
    assert_eq!(result[1].text, "Hello everyone. Welcome back.");
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user