feat: replace Playwright extractor with yt-dlp subprocess

- Add instagram-extractor.ts: yt-dlp subprocess backend for Instagram
  caption extraction. It keeps no in-process browser state, is maintained
  against Instagram frontend churn, and supports cookies.txt for
  auth-walled reels.
- Add feature flag EXTRACTOR_BACKEND (ytdlp|playwright) in QueueProcessor
  so the old Playwright path remains available as fallback.
- Add 9 unit tests and 2 live-network integration tests for the new extractor.
- Dockerfile: install yt-dlp via pip3 alongside existing Chromium deps.
- docker-compose: expose EXTRACTOR_BACKEND env var (default: ytdlp).

Also in this commit:
- LLM: add configurable per-request timeout via LLM_REQUEST_TIMEOUT_MS (default 120s);
  set maxRetries=0 to surface errors immediately; add llama-swap /running health probe.
- QueueProcessor: thread progress callback through parser phase.
- LlmHealthIndicator: surface llama-swap loaded-model name.
- Logging: improve error serialization in queue-processor tests.
- .env.example: document llama-swap endpoint and model options.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
Giancarmine Salucci
2026-05-12 20:46:31 +02:00
parent 6849a1fb26
commit 5b5bb947ef
14 changed files with 628 additions and 50 deletions

View File

@@ -1,9 +1,12 @@
<script lang="ts">
import { onMount } from 'svelte';
type HealthStatus = 'checking' | 'ok' | 'warming' | 'error';
interface HealthState {
status: 'checking' | 'healthy' | 'unhealthy' | 'error';
status: HealthStatus;
message: string;
configuredModel: string;
lastChecked: Date | null;
}
@@ -14,6 +17,7 @@
let health = $state<HealthState>({
status: 'checking',
message: '',
configuredModel: '',
lastChecked: null
});
@@ -21,24 +25,26 @@
try {
const res = await fetch('/api/llm-health');
const data = await res.json();
const status: HealthStatus =
data.status === 'ok' ? 'ok' : data.status === 'warming' ? 'warming' : 'error';
health = {
status: data.status === 'healthy' ? 'healthy' : 'unhealthy',
message: data.message,
status,
message: data.message ?? '',
configuredModel: data.configuredModel ?? '',
lastChecked: new Date()
};
} catch (e) {
health = {
status: 'error',
message: e instanceof Error ? e.message : 'Network error',
configuredModel: '',
lastChecked: new Date()
};
}
}
// Use onMount instead of $effect for timer-based side effects
// onMount only runs in browser, no SSR guard needed
onMount(() => {
checkHealth(); // Initial check
checkHealth();
const interval = setInterval(checkHealth, pollInterval);
return () => clearInterval(interval);
});
@@ -48,12 +54,12 @@
<div class="flex items-center gap-1">
{#if health.status === 'checking'}
🟡 <span>Checking LLM...</span>
{:else if health.status === 'healthy'}
{:else if health.status === 'ok'}
🟢 <span class="text-green-600">LLM Ready</span>
{:else if health.status === 'unhealthy'}
🔴 <span class="text-red-600">LLM Unavailable</span>
{:else if health.status === 'warming'}
🟡 <span class="text-yellow-600">LLM Cold ({health.configuredModel})</span>
{:else}
🔴 <span class="text-red-600">LLM Error</span>
🔴 <span class="text-red-600">LLM Unavailable</span>
{/if}
</div>
<div class="text-xs text-gray-500" title={health.message}>