#!/usr/bin/env bash
#
# End-to-end smoke test for the Whisper transcription HTTP API.
#
# Exercises, in order: /health, /docs, /model/status, /model/load, the /jobs
# endpoints (submit, poll, SSE stream, delete, cancel), /model/unload and the
# completion webhook. Each check prints [PASS] or [FAIL]; the script exits 1
# if at least one check failed.
#
# Environment overrides:
#   WHISPER_BASE_URL  base URL of the API              (default http://localhost:8080)
#   TEST_AUDIO        WAV file uploaded to POST /jobs  (default: see AUDIO below)
#   WEBHOOK_HOST      host name placed in webhook_url  (default localhost)
#   WEBHOOK_PORT      port of the local webhook server (default 9999)
#
# Requires: bash, curl, python3.
set -euo pipefail

# ── Config — override via env vars ───────────────────────────────────────────
BASE="${WHISPER_BASE_URL:-http://localhost:8080}"
AUDIO="${TEST_AUDIO:-/home/moze/Sources/youtube-transcriber/docker/tmp/audio-b2167046-a236-4fcd-b739-78177542fd23.wav}"
WEBHOOK_HOST="${WEBHOOK_HOST:-localhost}"
WEBHOOK_PORT="${WEBHOOK_PORT:-9999}"

GREEN='\033[0;32m'
RED='\033[0;31m'
NC='\033[0m'
FAILS=0

# State consumed by cleanup(); initialised up front so `set -u` never trips.
WEBHOOK_PID=""
SSE_PID=""
WORK_DIR=""

# ── Helpers ──────────────────────────────────────────────────────────────────

# Prints a green [PASS] line. Arguments: message words.
ok() {
  printf '%b[PASS]%b %s\n' "$GREEN" "$NC" "$*"
}

# Prints a red [FAIL] line and bumps the global FAILS counter.
fail() {
  printf '%b[FAIL]%b %s\n' "$RED" "$NC" "$*"
  FAILS=$((FAILS + 1))
}

# Records a failure and aborts; used where later steps cannot run without
# the result of this one. The EXIT trap still performs cleanup.
die() {
  fail "$*"
  exit 1
}

# Reads a JSON object on stdin and prints one top-level field.
# Arguments: $1 - key; $2 - optional default printed when the key is absent.
# Returns:   non-zero on invalid JSON or on a missing key without default.
json_field() {
  python3 -c '
import json, sys
doc = json.load(sys.stdin)
key = sys.argv[1]
if len(sys.argv) > 2:
    print(doc.get(key, sys.argv[2]))
else:
    print(doc[key])
' "$@"
}

# GETs URL $1 and prints top-level JSON field $2 of the response body.
# Returns non-zero if the request fails or the field cannot be read.
fetch_field() {
  local body
  body=$(curl -sf "$1") || return 1
  json_field "$2" <<<"$body" 2>/dev/null
}

# Pretty-prints JSON text $1 for the log; falls back to the raw text so a
# malformed body is still visible instead of aborting the run.
pretty_json() {
  python3 -m json.tool <<<"$1" 2>/dev/null || printf '%s\n' "$1"
}

# Runs the transcription sanity checks on the job JSON stored in file $1.
# Returns non-zero only when the job has zero segments; repetition loops,
# duplicate phrases and blank segments are printed as warnings.
check_quality() {
  python3 - "$1" <<'PYCHECK'
import sys, json, re

with open(sys.argv[1]) as f:
    data = json.load(f)

segments = data.get("segments", [])
print(f" Language : {data.get('language')}")
print(f" Duration : {data.get('duration_secs')}s")
print(f" Segments : {len(segments)}")

if not segments:
    print(" ✗ ZERO SEGMENTS — transcription likely failed silently")
    sys.exit(1)

issues = []
for i, seg in enumerate(segments):
    text = seg.get("text", "")
    words = text.strip().split()
    if len(words) >= 6:
        half = len(words) // 2
        # First half of the segment repeated verbatim as the second half.
        if words[:half] == words[half:half+half]:
            issues.append(f" [seg {i}] REPETITION LOOP: {text[:80]}")
    phrases = re.findall(r'(\b\w+ \w+ \w+\b)', text)
    if len(phrases) != len(set(phrases)) and len(phrases) > 4:
        issues.append(f" [seg {i}] DUPLICATE PHRASE: {text[:80]}")
    if not text.strip():
        issues.append(f" [seg {i}] BLANK SEGMENT")

if issues:
    print("\n ⚠ Quality issues found:")
    for iss in issues[:10]:
        print(iss)
else:
    print("\n ✓ No repetition loops or blank segments detected")

print("\n Sample output (first 5 segments):")
for seg in segments[:5]:
    print(f" [{seg['start']:.1f}–{seg['end']:.1f}] {seg['text'][:100]}")
PYCHECK
}

# Stops background helpers and removes the scratch directory. Installed as
# the EXIT trap so it runs on every exit path, including `set -e` aborts.
cleanup() {
  local pid
  for pid in "$SSE_PID" "$WEBHOOK_PID"; do
    if [[ -n "$pid" ]]; then
      kill "$pid" 2>/dev/null || true
    fi
  done
  if [[ -n "$WORK_DIR" ]]; then
    rm -rf -- "$WORK_DIR"
  fi
}
trap cleanup EXIT
trap 'exit 130' INT TERM

# ── Banner and preflight ─────────────────────────────────────────────────────
echo "=== Whisper API test suite ==="
echo " BASE : $BASE"
echo " AUDIO : $AUDIO"
echo ""

command -v curl >/dev/null || die "curl not found in PATH"
command -v python3 >/dev/null || die "python3 not found in PATH"
[[ -r "$AUDIO" ]] || die "audio file not readable: $AUDIO"

# Private scratch directory instead of fixed names under /tmp.
WORK_DIR=$(mktemp -d "${TMPDIR:-/tmp}/whisper_test_XXXXXX")
WEBHOOK_LOG="$WORK_DIR/webhook.log"
RESULT_JSON="$WORK_DIR/result.json"

echo "=== 1. GET /health ==="
HEALTH=$(curl -sf "$BASE/health") || die "cannot reach $BASE/health"
pretty_json "$HEALTH"
# The response is passed on stdin only; it is never spliced into Python source.
if python3 -c '
import json, sys
d = json.load(sys.stdin)
assert d["status"] == "ok", "status={}".format(d["status"])
assert "model_state" in d, "model_state field missing from health response"
' <<<"$HEALTH"; then
  ok "health ok + model_state present"
else
  fail "health check"
fi

echo ""
echo "=== 2. GET /docs (Swagger UI reachable) ==="
# Capture first, then grep: `curl | grep -q` can fail under pipefail when grep
# exits early and curl is killed by SIGPIPE.
if DOCS=$(curl -sf "$BASE/docs") && grep -qi "swagger" <<<"$DOCS"; then
  ok "swagger UI reachable"
else
  fail "swagger UI"
fi

echo ""
echo "=== 3. Webhook receiver (background Python HTTP server) ==="
cat >"$WORK_DIR/webhook_receiver.py" <<'PYEOF'
import http.server, json, sys, signal

PORT = int(sys.argv[1])
LOG_PATH = sys.argv[2]


class H(http.server.BaseHTTPRequestHandler):
    def do_POST(self):
        n = int(self.headers.get('Content-Length', 0))
        body = self.rfile.read(n)
        data = json.loads(body)
        # `or []` also covers an explicit "segments": null in the payload.
        line = f"[WEBHOOK] status={data.get('status')} segments={len(data.get('segments') or [])}"
        print("\n" + line, flush=True)
        # One line per delivery so the test script can verify the webhook.
        with open(LOG_PATH, "a") as log:
            log.write(line + "\n")
        self.send_response(200)
        self.end_headers()

    def log_message(self, *a):
        pass


signal.signal(signal.SIGTERM, lambda *_: sys.exit(0))
print(f"[WEBHOOK] listening on :{PORT}", flush=True)
http.server.HTTPServer(('', PORT), H).serve_forever()
PYEOF
: >"$WEBHOOK_LOG"
python3 "$WORK_DIR/webhook_receiver.py" "$WEBHOOK_PORT" "$WEBHOOK_LOG" &
WEBHOOK_PID=$!
sleep 1
# The receiver exits immediately if it cannot bind the port.
if kill -0 "$WEBHOOK_PID" 2>/dev/null; then
  echo "Webhook receiver started (PID $WEBHOOK_PID)"
else
  fail "webhook receiver failed to start (port $WEBHOOK_PORT in use?)"
  WEBHOOK_PID=""
fi

echo ""
echo "=== 4. GET /model/status — expect unloaded on fresh start ==="
MODEL_STATUS=$(curl -sf "$BASE/model/status") || die "GET /model/status failed"
pretty_json "$MODEL_STATUS"
if python3 -c '
import json, sys
d = json.load(sys.stdin)
assert "state" in d, "state field missing from /model/status"
print(" model state: {}".format(d["state"]))
' <<<"$MODEL_STATUS"; then
  ok "/model/status has state field"
else
  fail "/model/status schema"
fi

echo ""
echo "=== 5. POST /model/load — trigger model load ==="
if LOAD_RESP=$(curl -sf -X POST "$BASE/model/load"); then
  echo "$LOAD_RESP"
  ok "POST /model/load accepted"
else
  fail "POST /model/load rejected"
fi

echo ""
echo "=== 6. Poll /model/status until ready (max 3 min) ==="
LOAD_ELAPSED=0
while true; do
  sleep 5
  LOAD_ELAPSED=$((LOAD_ELAPSED + 5))
  # A transient request error shows up as "unknown" instead of aborting.
  STATE=$(fetch_field "$BASE/model/status" state) || STATE="unknown"
  echo " [${LOAD_ELAPSED}s] model_state=${STATE}"
  if [[ "$STATE" == "ready" ]]; then
    ok "model loaded and ready in ${LOAD_ELAPSED}s"
    break
  fi
  if ((LOAD_ELAPSED >= 180)); then
    fail "model failed to load within 3 minutes"
    break
  fi
done

echo ""
echo "=== 7. DELETE a non-existent job → 404 ==="
STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X DELETE \
  "$BASE/jobs/00000000-0000-0000-0000-000000000000") || true
if [[ "$STATUS" == "404" ]]; then
  ok "DELETE unknown job → 404"
else
  fail "expected 404, got $STATUS"
fi

echo ""
echo "=== 8. POST /jobs — submit audio ==="
# WEBHOOK_HOST must resolve to this machine from the API server's viewpoint.
SUBMIT=$(curl -sf -X POST "$BASE/jobs" \
  -F "audio=@${AUDIO};type=audio/wav" \
  -F "task=transcribe" \
  -F "webhook_url=http://${WEBHOOK_HOST}:${WEBHOOK_PORT}/webhook") \
  || die "POST /jobs failed"
echo "$SUBMIT"
JOB_ID=$(json_field job_id <<<"$SUBMIT") || die "no job_id in submit response"
ok "submitted job $JOB_ID"

echo ""
echo "=== 9. GET /jobs/{id} immediately after submit ==="
JOB=$(curl -sf "$BASE/jobs/$JOB_ID") || JOB=""
if python3 -c '
import json, sys
d = json.load(sys.stdin)
assert d["status"] in ("queued", "running"), "unexpected status: {}".format(d["status"])
' <<<"$JOB"; then
  ok "status is queued/running"
else
  fail "initial status check"
fi

echo ""
echo "=== 10. SSE stream (observe first 30 events then detach) ==="
echo "Subscribing to SSE stream for $JOB_ID …"
# $! is the PID of `head` (last pipeline stage); curl ends on its next write
# after head exits, or at --max-time at the latest.
curl -sN --max-time 90 "$BASE/jobs/$JOB_ID/stream" | head -60 &
SSE_PID=$!

echo ""
echo "=== 11. Poll until done (max 20 min) ==="
ELAPSED=0
while true; do
  sleep 15
  ELAPSED=$((ELAPSED + 15))
  JOB=$(curl -sf "$BASE/jobs/$JOB_ID") || JOB=""
  STATUS=$(json_field status <<<"$JOB" 2>/dev/null) || STATUS="unknown"
  PROGRESS=$(json_field progress 0 <<<"$JOB" 2>/dev/null) || PROGRESS="?"
  echo " [${ELAPSED}s] status=$STATUS progress=${PROGRESS}%"
  if [[ "$STATUS" == "done" ]]; then
    ok "job finished in ${ELAPSED}s"
    break
  elif [[ "$STATUS" == "failed" ]]; then
    pretty_json "$JOB"
    fail "job failed"
    break
  fi
  if ((ELAPSED >= 1200)); then
    fail "timeout after 20 minutes"
    break
  fi
done
kill "$SSE_PID" 2>/dev/null || true
SSE_PID=""

echo ""
echo "=== 12. Inspect transcription quality ==="
# Run inside `if` so a non-zero exit is reported as a failed check instead of
# terminating the whole script under `set -e`.
if curl -sf "$BASE/jobs/$JOB_ID" >"$RESULT_JSON" && check_quality "$RESULT_JSON"; then
  ok "quality check passed"
else
  fail "quality check"
fi

echo ""
echo "=== 13. DELETE completed job → 409 Conflict ==="
DEL_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X DELETE "$BASE/jobs/$JOB_ID") || true
if [[ "$DEL_STATUS" == "409" ]]; then
  ok "DELETE completed job → 409 Conflict (expected)"
else
  echo " [INFO] DELETE returned $DEL_STATUS"
fi

echo ""
echo "=== 14. Submit + cancel a queued job ==="
if JOB2=$(curl -sf -X POST "$BASE/jobs" \
  -F "audio=@${AUDIO};type=audio/wav" \
  -F "language=en" \
  -F "task=transcribe") \
  && JOB2_ID=$(json_field job_id <<<"$JOB2"); then
  sleep 1
  curl -s -X DELETE "$BASE/jobs/$JOB2_ID" >/dev/null || true
  CANCEL_STATUS=$(fetch_field "$BASE/jobs/$JOB2_ID" status) || CANCEL_STATUS="unknown"
  if [[ "$CANCEL_STATUS" == "cancelled" ]]; then
    ok "cancel works → status=cancelled"
  else
    echo " [INFO] cancel status: $CANCEL_STATUS (may be running — worker ignores cancel mid-chunk)"
  fi
else
  fail "could not submit job to cancel"
fi

echo ""
echo "=== 15. POST /model/unload ==="
if UNLOAD_RESP=$(curl -sf -X POST "$BASE/model/unload"); then
  echo "$UNLOAD_RESP"
else
  fail "POST /model/unload failed"
fi
sleep 2
UNLOAD_STATE=$(fetch_field "$BASE/model/status" state) || UNLOAD_STATE="unknown"
if [[ "$UNLOAD_STATE" == "unloaded" ]]; then
  ok "model unloaded → state=unloaded"
else
  echo " [INFO] state after unload: $UNLOAD_STATE"
fi

echo ""
echo "=== 16. Verify webhook fired ==="
sleep 3
if [[ -n "$WEBHOOK_PID" ]]; then
  kill "$WEBHOOK_PID" 2>/dev/null || true
  WEBHOOK_PID=""
  echo "webhook server stopped"
fi
# The receiver appends one line per POST it handled; empty means none arrived.
if [[ -s "$WEBHOOK_LOG" ]]; then
  ok "webhook delivery received"
else
  fail "no webhook delivery received on port $WEBHOOK_PORT"
fi

echo ""
if ((FAILS == 0)); then
  echo -e "${GREEN}=== ALL TESTS PASSED ===${NC}"
else
  echo -e "${RED}=== $FAILS TEST(S) FAILED ===${NC}"
  exit 1
fi