feat: replace Playwright extractor with yt-dlp subprocess

- Add instagram-extractor.ts: yt-dlp subprocess backend for Instagram caption extraction. No in-process browser state, maintained against Instagram frontend churn, supports cookies.txt for auth-walled reels. - Add feature flag EXTRACTOR_BACKEND (ytdlp|playwright) in QueueProcessor so the old Playwright path remains available as fallback. - Add 9 unit tests and 2 live-network integration tests for the new extractor. - Dockerfile: install yt-dlp via pip3 alongside existing Chromium deps. - docker-compose: expose EXTRACTOR_BACKEND env var (default: ytdlp). Also in this commit: - LLM: configurable per-request timeout via LLM_REQUEST_TIMEOUT_MS (default 120s); set maxRetries=0 to surface errors immediately; llama-swap /running health probe. - QueueProcessor: thread progress callback through parser phase. - LlmHealthIndicator: surface llama-swap loaded-model name. - Logging: improve error serialization in queue-processor tests. - .env.example: document llama-swap endpoint and model options. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-05-12 20:46:31 +02:00
parent 6849a1fb26
commit 5b5bb947ef
14 changed files with 628 additions and 50 deletions
--- a/src/lib/server/extraction.ts
+++ b/src/lib/server/extraction.ts
@@ -26,7 +26,14 @@ type CaptionCandidate = {
 	brCount: number;
 };

-export type ProgressEventType = 'status' | 'method' | 'retry' | 'error' | 'thumbnail' | 'complete';
+export type ProgressEventType =
+	| 'status'
+	| 'method'
+	| 'retry'
+	| 'error'
+	| 'thumbnail'
+	| 'complete'
+	| 'model_loading';

 export interface ProgressEvent {
 	type: ProgressEventType;
--- a/src/lib/server/instagram-extractor.ts
+++ b/src/lib/server/instagram-extractor.ts
@@ -0,0 +1,193 @@
+/**
+ * Instagram extractor — yt-dlp subprocess implementation.
+ *
+ * Replaces the Playwright-based scraper. yt-dlp is maintained against
+ * Instagram's frontend churn, has no in-process state, and works on public
+ * reels without authentication. Login-walled reels can be supported by
+ * dropping a Netscape-format cookies file at the path under SECRETS_DIR.
+ */
+
+import { execFile } from 'node:child_process';
+import { promisify } from 'node:util';
+import { existsSync } from 'node:fs';
+import { logError } from './utils/logger';
+import type { ExtractedContent, ProgressCallback } from './extraction';
+
+const execFileAsync = promisify(execFile);
+
+const YTDLP_TIMEOUT_MS = 60_000;
+const IMAGE_FETCH_TIMEOUT_MS = 10_000;
+const USER_AGENT =
+	'Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1';
+
+const COOKIE_PATHS = ['/app/secrets/cookies.txt', './secrets/cookies.txt'];
+
+function resolveCookiePath(): string | null {
+	for (const p of COOKIE_PATHS) {
+		if (existsSync(p)) return p;
+	}
+	return null;
+}
+
+interface YtDlpJson {
+	description?: string | null;
+	title?: string | null;
+	thumbnail?: string | null;
+	thumbnails?: Array<{ url?: string }>;
+}
+
+function pickThumbnailUrl(data: YtDlpJson): string | null {
+	if (data.thumbnail) return data.thumbnail;
+	const first = (data.thumbnails ?? []).find((t) => t?.url);
+	return first?.url ?? null;
+}
+
+async function fetchImageAsBase64(imageUrl: string): Promise<string | null> {
+	try {
+		const response = await fetch(imageUrl, {
+			signal: AbortSignal.timeout(IMAGE_FETCH_TIMEOUT_MS)
+		});
+		if (response.status !== 200) return null;
+		const contentType = response.headers.get('content-type') ?? '';
+		if (!contentType.startsWith('image/')) return null;
+		const buf = Buffer.from(await response.arrayBuffer());
+		return `data:${contentType};base64,${buf.toString('base64')}`;
+	} catch (e) {
+		logError('[ytdlp] Thumbnail fetch failed', e);
+		return null;
+	}
+}
+
+function classifyYtDlpError(stderr: string): { recoverable: boolean; reason: string } {
+	const lower = stderr.toLowerCase();
+	if (
+		lower.includes('login required') ||
+		lower.includes('login_required') ||
+		lower.includes('private') ||
+		lower.includes('rate-limit') ||
+		lower.includes('rate limit')
+	) {
+		return {
+			recoverable: false,
+			reason:
+				'Instagram requires authentication for this reel. Drop a Netscape cookies.txt at secrets/cookies.txt and retry.'
+		};
+	}
+	if (lower.includes('unsupported url')) {
+		return { recoverable: false, reason: 'URL not recognised by yt-dlp.' };
+	}
+	if (lower.includes('http error 404') || lower.includes('does not exist')) {
+		return { recoverable: false, reason: 'Reel not found (404).' };
+	}
+	return { recoverable: true, reason: stderr.split('\n').filter(Boolean).slice(-2).join(' ') };
+}
+
+/**
+ * Extract caption text + thumbnail data-URL from an Instagram reel.
+ *
+ * Mirrors the signature of the legacy Playwright extractor so QueueProcessor
+ * needs no contract change. ProgressCallback events use existing types
+ * (`status`, `method`, `error`) so the SSE consumers do not need updates.
+ */
+export async function extractTextAndThumbnail(
+	url: string,
+	progressCallback?: ProgressCallback
+): Promise<ExtractedContent> {
+	progressCallback?.({
+		type: 'status',
+		message: 'Invoking yt-dlp...',
+		timestamp: new Date().toISOString()
+	});
+
+	const cookies = resolveCookiePath();
+	if (cookies) {
+		progressCallback?.({
+			type: 'status',
+			message: `Using cookies from ${cookies}`,
+			timestamp: new Date().toISOString()
+		});
+	}
+
+	const args = [
+		'--dump-single-json',
+		'--skip-download',
+		'--no-warnings',
+		'--no-call-home',
+		'--socket-timeout',
+		'20',
+		'--user-agent',
+		USER_AGENT,
+		...(cookies ? ['--cookies', cookies] : []),
+		url
+	];
+
+	let stdout: string;
+	try {
+		const result = await execFileAsync('yt-dlp', args, {
+			timeout: YTDLP_TIMEOUT_MS,
+			maxBuffer: 10 * 1024 * 1024
+		});
+		stdout = result.stdout;
+	} catch (e: any) {
+		const stderr = String(e?.stderr ?? e?.message ?? '');
+		const code = e?.code;
+		if (code === 'ENOENT') {
+			throw new Error(
+				'yt-dlp is not installed in this container. Add it to the Dockerfile.'
+			);
+		}
+		const { recoverable, reason } = classifyYtDlpError(stderr);
+		progressCallback?.({
+			type: 'error',
+			message: `yt-dlp failed: ${reason}`,
+			timestamp: new Date().toISOString()
+		});
+		const err = new Error(`yt-dlp extraction failed: ${reason}`);
+		// QueueProcessor.isRecoverableError() classifies on message; surface keywords.
+		if (!recoverable) (err as any).nonRecoverable = true;
+		throw err;
+	}
+
+	let data: YtDlpJson;
+	try {
+		data = JSON.parse(stdout);
+	} catch (e) {
+		logError('[ytdlp] Failed to parse yt-dlp JSON output', e);
+		throw new Error('yt-dlp returned invalid JSON');
+	}
+
+	const bodyText = (data.description ?? data.title ?? '').trim();
+	if (!bodyText) {
+		throw new Error('yt-dlp returned no description for this reel');
+	}
+
+	progressCallback?.({
+		type: 'status',
+		message: `Caption extracted (${bodyText.length} chars)`,
+		timestamp: new Date().toISOString()
+	});
+
+	let thumbnail: string | null = null;
+	const thumbUrl = pickThumbnailUrl(data);
+	if (thumbUrl) {
+		progressCallback?.({
+			type: 'thumbnail',
+			message: 'Fetching thumbnail...',
+			timestamp: new Date().toISOString()
+		});
+		thumbnail = await fetchImageAsBase64(thumbUrl);
+		progressCallback?.({
+			type: 'status',
+			message: thumbnail ? 'Thumbnail fetched' : 'Thumbnail fetch failed (continuing without)',
+			timestamp: new Date().toISOString()
+		});
+	}
+
+	progressCallback?.({
+		type: 'complete',
+		message: 'Extraction complete',
+		timestamp: new Date().toISOString()
+	});
+
+	return { bodyText, thumbnail };
+}
--- a/src/lib/server/llm.ts
+++ b/src/lib/server/llm.ts
@@ -2,15 +2,24 @@ import OpenAI from 'openai';
 import { env } from '$env/dynamic/private';
 import { logError } from './utils/logger';

+const DEFAULT_REQUEST_TIMEOUT_MS = 120_000;
+
+const parseTimeoutMs = (raw: string | undefined): number => {
+	if (!raw) return DEFAULT_REQUEST_TIMEOUT_MS;
+	const n = Number(raw);
+	return Number.isFinite(n) && n > 0 ? n : DEFAULT_REQUEST_TIMEOUT_MS;
+};
+
 export const createLLM = () => {
-	// Detect if we are using Ollama or OpenAI based on URL
 	const baseURL = env.OPENAI_BASE_URL;
 	const apiKey = env.OPENAI_API_KEY;
 	const model = env.LLM_MODEL || 'gpt-4o';
+	const timeout = parseTimeoutMs(env.LLM_REQUEST_TIMEOUT_MS);

 	console.log('[LLM] Initializing client...');
 	console.log('[LLM] Base URL:', baseURL);
 	console.log('[LLM] Model:', model);
+	console.log('[LLM] Request timeout (ms):', timeout);

 	if (!baseURL) {
 		throw new Error('OPENAI_BASE_URL environment variable is not set');
@@ -22,7 +31,9 @@ export const createLLM = () => {

 	const client = new OpenAI({
 		apiKey,
-		baseURL
+		baseURL,
+		timeout,
+		maxRetries: 0
 	});

 	return { client, model };
@@ -43,6 +54,47 @@ export async function checkLLMHealth(): Promise<boolean> {
 	}
 }

+/**
+ * Strip a trailing /v1 (or /v1/) from a base URL to get the llama-swap root.
+ * llama-swap exposes both /v1/* (OpenAI-compatible) and /running, /upstream, etc.
+ * at the bare root.
+ */
+function llamaSwapRoot(baseURL: string): string {
+	return baseURL.replace(/\/v1\/?$/, '').replace(/\/$/, '');
+}
+
+interface RunningModelEntry {
+	model: string;
+	state?: string;
+}
+
+/**
+ * Query llama-swap's /running endpoint and report whether `model` is currently
+ * loaded and ready to serve. Returns false on any error (treat as cold).
+ *
+ * Why we don't fold this into checkModelAvailability(): /v1/models lists every
+ * model llama-swap is configured to swap to (not just loaded ones), while
+ * /running returns only the in-VRAM instance. Both signals are useful.
+ */
+export async function isModelLoaded(model: string): Promise<boolean> {
+	const baseURL = env.OPENAI_BASE_URL;
+	if (!baseURL) return false;
+
+	try {
+		const url = `${llamaSwapRoot(baseURL)}/running`;
+		const response = await fetch(url, {
+			signal: AbortSignal.timeout(5_000)
+		});
+		if (!response.ok) return false;
+		const data = (await response.json()) as { running?: RunningModelEntry[] };
+		const running = data.running ?? [];
+		return running.some((m) => m.model === model && (m.state ?? 'ready') === 'ready');
+	} catch (e) {
+		logError('[LLM] isModelLoaded check failed', e);
+		return false;
+	}
+}
+
 /**
 * Check if a specific model is available in the OpenAI-compatible API
 * @param model - The model ID to check for availability
--- a/src/lib/server/parser.ts
+++ b/src/lib/server/parser.ts
@@ -1,8 +1,9 @@
-import { createLLM, checkModelAvailability } from './llm';
+import { createLLM, checkModelAvailability, isModelLoaded } from './llm';
 import { zodResponseFormat } from 'openai/helpers/zod';
 import { z } from 'zod';
 import { RECIPE_DETECTION_PROMPT, RECIPE_EXTRACTION_PROMPT } from './prompts/recipe-extraction';
 import { logError } from './utils/logger';
+import type { ProgressCallback } from './extraction';

 const RecipeSchema = z.object({
 	name: z.string(),
@@ -144,11 +145,33 @@ export async function parseRecipe(text: string): Promise<Recipe> {
 }

 /**
- * Complete workflow: detect recipe and parse if found
+ * Complete workflow: detect recipe and parse if found.
+ *
+ * Emits a `model_loading` progress event (if a callback is supplied) when the
+ * configured llama-swap model is not yet warm — the first request after idle
+ * blocks for several seconds while llama-swap loads the model into VRAM.
+ *
 * @param text - The text to analyze
+ * @param progressCallback - Optional callback for surfacing cold-load state
 * @returns Parsed recipe object if detected, null otherwise
 */
-export async function extractRecipe(text: string): Promise<Recipe | null> {
+export async function extractRecipe(
+	text: string,
+	progressCallback?: ProgressCallback
+): Promise<Recipe | null> {
+	if (progressCallback) {
+		const { model } = createLLM();
+		const warm = await isModelLoaded(model);
+		if (!warm) {
+			progressCallback({
+				type: 'model_loading',
+				message: `Inference server cold — loading ${model} into VRAM (5–30s)...`,
+				data: { model },
+				timestamp: new Date().toISOString()
+			});
+		}
+	}
+
 	const isRecipe = await detectRecipe(text);

 	if (!isRecipe) {
--- a/src/lib/server/queue/QueueProcessor.ts
+++ b/src/lib/server/queue/QueueProcessor.ts
@@ -12,15 +12,30 @@
 */

 import { queueManager } from './QueueManager';
-import { extractTextAndThumbnail } from '$lib/server/extraction';
+import { extractTextAndThumbnail as extractWithPlaywright } from '$lib/server/extraction';
+import { extractTextAndThumbnail as extractWithYtDlp } from '$lib/server/instagram-extractor';
 import { extractRecipe } from '$lib/server/parser';
 import { uploadRecipeWithIngredientsDTO, uploadRecipeImage } from '$lib/server/tandoor';
 import { pushNotificationService } from '$lib/server/notifications/PushNotificationService';
 import { queueConfig } from './config';
 import { logError } from '../utils/logger';
-import type { ProgressEvent } from '$lib/server/extraction';
+import { env } from '$env/dynamic/private';
+import type { ProgressEvent, ExtractedContent, ProgressCallback } from '$lib/server/extraction';
 import type { QueueItem } from './types';

+// Feature flag: pick which Instagram extractor backend to invoke.
+// Default to yt-dlp; set EXTRACTOR_BACKEND=playwright to fall back to the
+// legacy stealth scraper while we verify the new path.
+const extractTextAndThumbnail = (
+	url: string,
+	cb?: ProgressCallback
+): Promise<ExtractedContent> => {
+	const backend = (env.EXTRACTOR_BACKEND ?? 'ytdlp').toLowerCase();
+	return backend === 'playwright'
+		? extractWithPlaywright(url, cb)
+		: extractWithYtDlp(url, cb);
+};
+
 /**
 * Queue processor with configurable concurrency
 *
@@ -250,7 +265,9 @@ export class QueueProcessor {
 		});

 		console.log(`[QueueProcessor] Parsing recipe: ${item.id}`);
-		const recipe = await extractRecipe(item.extractedText);
+		const recipe = await extractRecipe(item.extractedText, (event) => {
+			queueManager.addProgressEvent(item.id, event);
+		});

 		if (!recipe) {
 			throw new Error('Failed to parse recipe from extracted text');