feat: replace Playwright extractor with yt-dlp subprocess

- Add instagram-extractor.ts: yt-dlp subprocess backend for Instagram caption extraction. No in-process browser state, maintained against Instagram frontend churn, supports cookies.txt for auth-walled reels.
- Add feature flag EXTRACTOR_BACKEND (ytdlp|playwright) in QueueProcessor so the old Playwright path remains available as fallback.
- Add 9 unit tests and 2 live-network integration tests for the new extractor.
- Dockerfile: install yt-dlp via pip3 alongside existing Chromium deps.
- docker-compose: expose EXTRACTOR_BACKEND env var (default: ytdlp).

Also in this commit:
- LLM: configurable per-request timeout via LLM_REQUEST_TIMEOUT_MS (default 120s); set maxRetries=0 to surface errors immediately; llama-swap /running health probe.
- QueueProcessor: thread progress callback through parser phase.
- LlmHealthIndicator: surface llama-swap loaded-model name.
- Logging: improve error serialization in queue-processor tests.
- .env.example: document llama-swap endpoint and model options.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-05-12 20:46:31 +02:00
parent 6849a1fb26
commit 5b5bb947ef
14 changed files with 628 additions and 50 deletions
--- a/.env.example
+++ b/.env.example
@@ -7,15 +7,23 @@
 # ==============================================================================
 # LLM Configuration (REQUIRED)
 # ==============================================================================
-# OpenAI-compatible API endpoint (OpenAI, LM Studio, Ollama, LiteLLM, etc.)
-OPENAI_BASE_URL=http://localhost:1234/v1
+# OpenAI-compatible API endpoint. Production: llama-swap on ideapad.
+# llama-swap loads models on demand and unloads them after globalTTL (10 min).
+OPENAI_BASE_URL=http://192.168.1.50:8080/v1

-# API key for authentication
-OPENAI_API_KEY=your-api-key-here
+# API key for authentication (llama-swap accepts any non-empty value).
+OPENAI_API_KEY=sk-llama-local

-# Model to use for recipe extraction
-# Examples: gpt-4o, gpt-4o-mini, llama-3.1, mistral, etc.
-LLM_MODEL=google/gemma-3-4b
+# Model to use for recipe extraction. Available on the ideapad llama-swap stack:
+#   gemma4-e4b-q6k       (recommended — 4B, 65k ctx, 31 TPS)
+#   gemma4-e2b-q8_0      (faster — 2B, 65k ctx, 55 TPS)
+#   qwen3.5-4b-q8_0      (fallback — 22 TPS)
+#   phi4-mini-q8_0, granite-3.3-8b-q6k, plus larger MoE variants
+LLM_MODEL=gemma4-e4b-q6k
+
+# Per-request LLM timeout in ms. Must cover llama-swap cold-load (~5–30s for
+# small models) plus generation time. Default 120000.
+LLM_REQUEST_TIMEOUT_MS=120000

 # ==============================================================================
 # Queue Configuration (OPTIONAL)
@@ -55,9 +63,23 @@ VAPID_PUBLIC_KEY=BNextdcB_fQ0BVvyGioM5L8Tf9vKQjs-WnF-rUbnU8MdWIZQYfggIHxBnW21I-l
 VAPID_PRIVATE_KEY=JwxI_KcsBcehYcTOufMcbVWJjCq1QbH5FJmSyQuG680

 # ==============================================================================
-# Authentication Scheduler (OPTIONAL)
+# Instagram Extraction Backend
 # ==============================================================================
-# Enable automatic Instagram authentication renewal
+# Which extractor to use:
+#   ytdlp      (default) — yt-dlp subprocess, stateless, Sablier-safe
+#   playwright           — legacy Playwright stealth scraper, requires
+#                          secrets/auth.json + AUTH_SCHEDULER_* below
+EXTRACTOR_BACKEND=ytdlp
+
+# Optional Netscape-format cookies file for login-walled reels.
+# The extractor passes it to yt-dlp (--cookies) if it exists at
+# /app/secrets/cookies.txt (Docker) or ./secrets/cookies.txt (local). Not
+# automated: export one from a browser when extractions start hitting login walls.
+
+# ==============================================================================
+# Authentication Scheduler (LEGACY — only relevant when EXTRACTOR_BACKEND=playwright)
+# ==============================================================================
+# Enable automatic Instagram authentication renewal (Playwright backend only)
 AUTH_SCHEDULER_ENABLED=true

 # Renewal interval in minutes (default: 720 = 12 hours)
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,12 +1,15 @@
 FROM node:24-alpine
 WORKDIR /app

-# Install Playwright system dependencies
+# Install yt-dlp (primary Instagram extractor) and Playwright system dependencies (fallback)
 RUN apk add --no-cache \
+    python3 \
+    py3-pip \
    chromium \
    font-liberation \
    font-noto \
-    font-noto-cjk
+    font-noto-cjk && \
+    pip3 install --break-system-packages yt-dlp

 COPY package*.json ./
 RUN npm ci
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -32,6 +32,9 @@ services:
      # Playwright Configuration
      - DISPLAY=:99

+      # Extractor backend: 'ytdlp' (default) or 'playwright' (legacy fallback)
+      - EXTRACTOR_BACKEND=${EXTRACTOR_BACKEND:-ytdlp}
+
      # Node.js Environment
      - NODE_ENV=production
    security_opt:
--- a/src/lib/server/extraction.ts
+++ b/src/lib/server/extraction.ts
@@ -26,7 +26,14 @@ type CaptionCandidate = {
 	brCount: number;
 };

-export type ProgressEventType = 'status' | 'method' | 'retry' | 'error' | 'thumbnail' | 'complete';
+export type ProgressEventType =
+	| 'status'
+	| 'method'
+	| 'retry'
+	| 'error'
+	| 'thumbnail'
+	| 'complete'
+	| 'model_loading';

 export interface ProgressEvent {
 	type: ProgressEventType;
--- a/src/lib/server/instagram-extractor.ts
+++ b/src/lib/server/instagram-extractor.ts
@@ -0,0 +1,193 @@
+/**
+ * Instagram extractor — yt-dlp subprocess implementation.
+ *
+ * Replaces the Playwright-based scraper. yt-dlp is maintained against
+ * Instagram's frontend churn, has no in-process state, and works on public
+ * reels without authentication. Login-walled reels can be supported by
+ * dropping a Netscape-format cookies file at one of the COOKIE_PATHS below.
+ */
+
+import { execFile } from 'node:child_process';
+import { promisify } from 'node:util';
+import { existsSync } from 'node:fs';
+import { logError } from './utils/logger';
+import type { ExtractedContent, ProgressCallback } from './extraction';
+
+const execFileAsync = promisify(execFile);
+
+const YTDLP_TIMEOUT_MS = 60_000;
+const IMAGE_FETCH_TIMEOUT_MS = 10_000;
+const USER_AGENT =
+	'Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1';
+
+const COOKIE_PATHS = ['/app/secrets/cookies.txt', './secrets/cookies.txt'];
+
+function resolveCookiePath(): string | null {
+	for (const p of COOKIE_PATHS) {
+		if (existsSync(p)) return p;
+	}
+	return null;
+}
+
+interface YtDlpJson {
+	description?: string | null;
+	title?: string | null;
+	thumbnail?: string | null;
+	thumbnails?: Array<{ url?: string }>;
+}
+
+function pickThumbnailUrl(data: YtDlpJson): string | null {
+	if (data.thumbnail) return data.thumbnail;
+	const first = (data.thumbnails ?? []).find((t) => t?.url);
+	return first?.url ?? null;
+}
+
+async function fetchImageAsBase64(imageUrl: string): Promise<string | null> {
+	try {
+		const response = await fetch(imageUrl, {
+			signal: AbortSignal.timeout(IMAGE_FETCH_TIMEOUT_MS)
+		});
+		if (response.status !== 200) return null;
+		const contentType = response.headers.get('content-type') ?? '';
+		if (!contentType.startsWith('image/')) return null;
+		const buf = Buffer.from(await response.arrayBuffer());
+		return `data:${contentType};base64,${buf.toString('base64')}`;
+	} catch (e) {
+		logError('[ytdlp] Thumbnail fetch failed', e);
+		return null;
+	}
+}
+
+function classifyYtDlpError(stderr: string): { recoverable: boolean; reason: string } {
+	const lower = stderr.toLowerCase();
+	if (
+		lower.includes('login required') ||
+		lower.includes('login_required') ||
+		lower.includes('private') ||
+		lower.includes('rate-limit') ||
+		lower.includes('rate limit')
+	) {
+		return {
+			recoverable: false,
+			reason:
+				'Instagram requires authentication for this reel. Drop a Netscape cookies.txt at secrets/cookies.txt and retry.'
+		};
+	}
+	if (lower.includes('unsupported url')) {
+		return { recoverable: false, reason: 'URL not recognised by yt-dlp.' };
+	}
+	if (lower.includes('http error 404') || lower.includes('does not exist')) {
+		return { recoverable: false, reason: 'Reel not found (404).' };
+	}
+	return { recoverable: true, reason: stderr.split('\n').filter(Boolean).slice(-2).join(' ') };
+}
+
+/**
+ * Extract caption text + thumbnail data-URL from an Instagram reel.
+ *
+ * Mirrors the signature of the legacy Playwright extractor so QueueProcessor
+ * needs no contract change. ProgressCallback events use existing types
+ * (`status`, `error`, `thumbnail`, `complete`) so the SSE consumers do not need updates.
+ */
+export async function extractTextAndThumbnail(
+	url: string,
+	progressCallback?: ProgressCallback
+): Promise<ExtractedContent> {
+	progressCallback?.({
+		type: 'status',
+		message: 'Invoking yt-dlp...',
+		timestamp: new Date().toISOString()
+	});
+
+	const cookies = resolveCookiePath();
+	if (cookies) {
+		progressCallback?.({
+			type: 'status',
+			message: `Using cookies from ${cookies}`,
+			timestamp: new Date().toISOString()
+		});
+	}
+
+	const args = [
+		'--dump-single-json',
+		'--skip-download',
+		'--no-warnings',
+		'--no-call-home',
+		'--socket-timeout',
+		'20',
+		'--user-agent',
+		USER_AGENT,
+		...(cookies ? ['--cookies', cookies] : []),
+		url
+	];
+
+	let stdout: string;
+	try {
+		const result = await execFileAsync('yt-dlp', args, {
+			timeout: YTDLP_TIMEOUT_MS,
+			maxBuffer: 10 * 1024 * 1024
+		});
+		stdout = result.stdout;
+	} catch (e: any) {
+		const stderr = String(e?.stderr ?? e?.message ?? '');
+		const code = e?.code;
+		if (code === 'ENOENT') {
+			throw new Error(
+				'yt-dlp is not installed in this container. Add it to the Dockerfile.'
+			);
+		}
+		const { recoverable, reason } = classifyYtDlpError(stderr);
+		progressCallback?.({
+			type: 'error',
+			message: `yt-dlp failed: ${reason}`,
+			timestamp: new Date().toISOString()
+		});
+		const err = new Error(`yt-dlp extraction failed: ${reason}`);
+		// QueueProcessor.isRecoverableError() classifies on message, so the reason is kept in the message; also flag non-recoverable errors explicitly.
+		if (!recoverable) (err as any).nonRecoverable = true;
+		throw err;
+	}
+
+	let data: YtDlpJson;
+	try {
+		data = JSON.parse(stdout);
+	} catch (e) {
+		logError('[ytdlp] Failed to parse yt-dlp JSON output', e);
+		throw new Error('yt-dlp returned invalid JSON');
+	}
+
+	const bodyText = (data.description ?? data.title ?? '').trim();
+	if (!bodyText) {
+		throw new Error('yt-dlp returned no description for this reel');
+	}
+
+	progressCallback?.({
+		type: 'status',
+		message: `Caption extracted (${bodyText.length} chars)`,
+		timestamp: new Date().toISOString()
+	});
+
+	let thumbnail: string | null = null;
+	const thumbUrl = pickThumbnailUrl(data);
+	if (thumbUrl) {
+		progressCallback?.({
+			type: 'thumbnail',
+			message: 'Fetching thumbnail...',
+			timestamp: new Date().toISOString()
+		});
+		thumbnail = await fetchImageAsBase64(thumbUrl);
+		progressCallback?.({
+			type: 'status',
+			message: thumbnail ? 'Thumbnail fetched' : 'Thumbnail fetch failed (continuing without)',
+			timestamp: new Date().toISOString()
+		});
+	}
+
+	progressCallback?.({
+		type: 'complete',
+		message: 'Extraction complete',
+		timestamp: new Date().toISOString()
+	});
+
+	return { bodyText, thumbnail };
+}
--- a/src/lib/server/llm.ts
+++ b/src/lib/server/llm.ts
@@ -2,15 +2,24 @@ import OpenAI from 'openai';
 import { env } from '$env/dynamic/private';
 import { logError } from './utils/logger';

+const DEFAULT_REQUEST_TIMEOUT_MS = 120_000;
+
+const parseTimeoutMs = (raw: string | undefined): number => {
+	if (!raw) return DEFAULT_REQUEST_TIMEOUT_MS;
+	const n = Number(raw);
+	return Number.isFinite(n) && n > 0 ? n : DEFAULT_REQUEST_TIMEOUT_MS;
+};
+
 export const createLLM = () => {
-	// Detect if we are using Ollama or OpenAI based on URL
 	const baseURL = env.OPENAI_BASE_URL;
 	const apiKey = env.OPENAI_API_KEY;
 	const model = env.LLM_MODEL || 'gpt-4o';
+	const timeout = parseTimeoutMs(env.LLM_REQUEST_TIMEOUT_MS);

 	console.log('[LLM] Initializing client...');
 	console.log('[LLM] Base URL:', baseURL);
 	console.log('[LLM] Model:', model);
+	console.log('[LLM] Request timeout (ms):', timeout);

 	if (!baseURL) {
 		throw new Error('OPENAI_BASE_URL environment variable is not set');
@@ -22,7 +31,9 @@ export const createLLM = () => {

 	const client = new OpenAI({
 		apiKey,
-		baseURL
+		baseURL,
+		timeout,
+		maxRetries: 0
 	});

 	return { client, model };
@@ -43,6 +54,47 @@ export async function checkLLMHealth(): Promise<boolean> {
 	}
 }

+/**
+ * Strip a trailing /v1 (or /v1/) from a base URL to get the llama-swap root.
+ * llama-swap exposes both /v1/* (OpenAI-compatible) and /running, /upstream, etc.
+ * at the bare root.
+ */
+function llamaSwapRoot(baseURL: string): string {
+	return baseURL.replace(/\/v1\/?$/, '').replace(/\/$/, '');
+}
+
+interface RunningModelEntry {
+	model: string;
+	state?: string;
+}
+
+/**
+ * Query llama-swap's /running endpoint and report whether `model` is currently
+ * loaded and ready to serve. Returns false on any error (treat as cold).
+ *
+ * Why we don't fold this into checkModelAvailability(): /v1/models lists every
+ * model llama-swap is configured to swap to (not just loaded ones), while
+ * /running returns only the in-VRAM instance. Both signals are useful.
+ */
+export async function isModelLoaded(model: string): Promise<boolean> {
+	const baseURL = env.OPENAI_BASE_URL;
+	if (!baseURL) return false;
+
+	try {
+		const url = `${llamaSwapRoot(baseURL)}/running`;
+		const response = await fetch(url, {
+			signal: AbortSignal.timeout(5_000)
+		});
+		if (!response.ok) return false;
+		const data = (await response.json()) as { running?: RunningModelEntry[] };
+		const running = data.running ?? [];
+		return running.some((m) => m.model === model && (m.state ?? 'ready') === 'ready');
+	} catch (e) {
+		logError('[LLM] isModelLoaded check failed', e);
+		return false;
+	}
+}
+
 /**
 * Check if a specific model is available in the OpenAI-compatible API
 * @param model - The model ID to check for availability
--- a/src/lib/server/parser.ts
+++ b/src/lib/server/parser.ts
@@ -1,8 +1,9 @@
-import { createLLM, checkModelAvailability } from './llm';
+import { createLLM, checkModelAvailability, isModelLoaded } from './llm';
 import { zodResponseFormat } from 'openai/helpers/zod';
 import { z } from 'zod';
 import { RECIPE_DETECTION_PROMPT, RECIPE_EXTRACTION_PROMPT } from './prompts/recipe-extraction';
 import { logError } from './utils/logger';
+import type { ProgressCallback } from './extraction';

 const RecipeSchema = z.object({
 	name: z.string(),
@@ -144,11 +145,33 @@ export async function parseRecipe(text: string): Promise<Recipe> {
 }

 /**
- * Complete workflow: detect recipe and parse if found
+ * Complete workflow: detect recipe and parse if found.
+ *
+ * Emits a `model_loading` progress event (if a callback is supplied) when the
+ * configured llama-swap model is not yet warm — the first request after idle
+ * blocks for several seconds while llama-swap loads the model into VRAM.
+ *
 * @param text - The text to analyze
+ * @param progressCallback - Optional callback for surfacing cold-load state
 * @returns Parsed recipe object if detected, null otherwise
 */
-export async function extractRecipe(text: string): Promise<Recipe | null> {
+export async function extractRecipe(
+	text: string,
+	progressCallback?: ProgressCallback
+): Promise<Recipe | null> {
+	if (progressCallback) {
+		const { model } = createLLM();
+		const warm = await isModelLoaded(model);
+		if (!warm) {
+			progressCallback({
+				type: 'model_loading',
+				message: `Inference server cold — loading ${model} into VRAM (5–30s)...`,
+				data: { model },
+				timestamp: new Date().toISOString()
+			});
+		}
+	}
+
 	const isRecipe = await detectRecipe(text);

 	if (!isRecipe) {
--- a/src/lib/server/queue/QueueProcessor.ts
+++ b/src/lib/server/queue/QueueProcessor.ts
@@ -12,15 +12,30 @@
 */

 import { queueManager } from './QueueManager';
-import { extractTextAndThumbnail } from '$lib/server/extraction';
+import { extractTextAndThumbnail as extractWithPlaywright } from '$lib/server/extraction';
+import { extractTextAndThumbnail as extractWithYtDlp } from '$lib/server/instagram-extractor';
 import { extractRecipe } from '$lib/server/parser';
 import { uploadRecipeWithIngredientsDTO, uploadRecipeImage } from '$lib/server/tandoor';
 import { pushNotificationService } from '$lib/server/notifications/PushNotificationService';
 import { queueConfig } from './config';
 import { logError } from '../utils/logger';
-import type { ProgressEvent } from '$lib/server/extraction';
+import { env } from '$env/dynamic/private';
+import type { ProgressEvent, ExtractedContent, ProgressCallback } from '$lib/server/extraction';
 import type { QueueItem } from './types';

+// Feature flag: pick which Instagram extractor backend to invoke.
+// Default to yt-dlp; set EXTRACTOR_BACKEND=playwright to fall back to the
+// legacy stealth scraper while we verify the new path.
+const extractTextAndThumbnail = (
+	url: string,
+	cb?: ProgressCallback
+): Promise<ExtractedContent> => {
+	const backend = (env.EXTRACTOR_BACKEND ?? 'ytdlp').toLowerCase();
+	return backend === 'playwright'
+		? extractWithPlaywright(url, cb)
+		: extractWithYtDlp(url, cb);
+};
+
 /**
 * Queue processor with configurable concurrency
 *
@@ -250,7 +265,9 @@ export class QueueProcessor {
 		});

 		console.log(`[QueueProcessor] Parsing recipe: ${item.id}`);
-		const recipe = await extractRecipe(item.extractedText);
+		const recipe = await extractRecipe(item.extractedText, (event) => {
+			queueManager.addProgressEvent(item.id, event);
+		});

 		if (!recipe) {
 			throw new Error('Failed to parse recipe from extracted text');
--- a/src/routes/api/llm-health/+server.ts
+++ b/src/routes/api/llm-health/+server.ts
@@ -1,34 +1,48 @@
 import { json } from '@sveltejs/kit';
-import { checkLLMHealth } from '$lib/server/llm';
+import { env } from '$env/dynamic/private';
+import { checkLLMHealth, isModelLoaded } from '$lib/server/llm';

 /**
- * Health check endpoint for LLM service
- * Tests connectivity to LM Studio or OpenAI-compatible endpoint
+ * Health check endpoint for the LLM service (llama-swap on ideapad).
+ *
+ * Three states:
+ *  - ok      → endpoint reachable AND configured model is loaded in VRAM
+ *  - warming → endpoint reachable but configured model not yet loaded
+ *              (next request will trigger a cold load)
+ *  - error   → endpoint unreachable
 */
 export async function GET() {
 	try {
-		const isHealthy = await checkLLMHealth();
+		const reachable = await checkLLMHealth();
+		const configuredModel = env.LLM_MODEL || 'gpt-4o';

-		if (isHealthy) {
-			return json({
-				status: 'healthy',
-				message: 'LLM service is accessible'
-			});
-		} else {
+		if (!reachable) {
 			return json(
 				{
-					status: 'unhealthy',
-					message: 'LLM service is not accessible'
+					status: 'error',
+					message: 'LLM service is not accessible',
+					configuredModel
 				},
 				{ status: 503 }
 			);
 		}
+
+		const warm = await isModelLoaded(configuredModel);
+		return json({
+			status: warm ? 'ok' : 'warming',
+			message: warm
+				? `Model ${configuredModel} loaded and ready`
+				: `Model ${configuredModel} configured; next request will trigger a cold load`,
+			configuredModel,
+			loaded: warm
+		});
 	} catch (error) {
 		const errorMessage = error instanceof Error ? error.message : 'Unknown error';
 		return json(
 			{
 				status: 'error',
-				message: errorMessage
+				message: errorMessage,
+				configuredModel: env.LLM_MODEL || 'gpt-4o'
 			},
 			{ status: 500 }
 		);
--- a/src/routes/share/components/LlmHealthIndicator.svelte
+++ b/src/routes/share/components/LlmHealthIndicator.svelte
@@ -1,9 +1,12 @@
 <script lang="ts">
 	import { onMount } from 'svelte';

+	type HealthStatus = 'checking' | 'ok' | 'warming' | 'error';
+
 	interface HealthState {
-		status: 'checking' | 'healthy' | 'unhealthy' | 'error';
+		status: HealthStatus;
 		message: string;
+		configuredModel: string;
 		lastChecked: Date | null;
 	}

@@ -14,6 +17,7 @@
 	let health = $state<HealthState>({
 		status: 'checking',
 		message: '',
+		configuredModel: '',
 		lastChecked: null
 	});

@@ -21,24 +25,26 @@
 		try {
 			const res = await fetch('/api/llm-health');
 			const data = await res.json();
+			const status: HealthStatus =
+				data.status === 'ok' ? 'ok' : data.status === 'warming' ? 'warming' : 'error';
 			health = {
-				status: data.status === 'healthy' ? 'healthy' : 'unhealthy',
-				message: data.message,
+				status,
+				message: data.message ?? '',
+				configuredModel: data.configuredModel ?? '',
 				lastChecked: new Date()
 			};
 		} catch (e) {
 			health = {
 				status: 'error',
 				message: e instanceof Error ? e.message : 'Network error',
+				configuredModel: '',
 				lastChecked: new Date()
 			};
 		}
 	}

-	// Use onMount instead of $effect for timer-based side effects
-	// onMount only runs in browser, no SSR guard needed
 	onMount(() => {
-		checkHealth(); // Initial check
+		checkHealth();
 		const interval = setInterval(checkHealth, pollInterval);
 		return () => clearInterval(interval);
 	});
@@ -48,12 +54,12 @@
 	<div class="flex items-center gap-1">
 		{#if health.status === 'checking'}
 			🟡 <span>Checking LLM...</span>
-		{:else if health.status === 'healthy'}
+		{:else if health.status === 'ok'}
 			🟢 <span class="text-green-600">LLM Ready</span>
-		{:else if health.status === 'unhealthy'}
-			🔴 <span class="text-red-600">LLM Unavailable</span>
+		{:else if health.status === 'warming'}
+			🟡 <span class="text-yellow-600">LLM Cold ({health.configuredModel})</span>
 		{:else}
-			🔴 <span class="text-red-600">LLM Error</span>
+			🔴 <span class="text-red-600">LLM Unavailable</span>
 		{/if}
 	</div>
 	<div class="text-xs text-gray-500" title={health.message}>
--- a/src/tests/instagram-extractor.integration.spec.ts
+++ b/src/tests/instagram-extractor.integration.spec.ts
@@ -0,0 +1,49 @@
+/**
+ * E2E integration test for the yt-dlp Instagram extractor.
+ *
+ * Makes real network calls (yt-dlp + Instagram CDN). Requires:
+ *   - yt-dlp installed on PATH
+ *   - Network access to instagram.com
+ *   - EXTRACTOR_E2E=1 env var (safety guard to avoid running in normal test runs)
+ *
+ * Run with:
+ *   EXTRACTOR_E2E=1 npm test -- src/tests/instagram-extractor.integration.spec.ts
+ */
+
+import { describe, it, expect } from 'vitest';
+import { extractTextAndThumbnail } from '$lib/server/instagram-extractor';
+
+const E2E = !!process.env.EXTRACTOR_E2E;
+
+describe.skipIf(!E2E)('instagram-extractor E2E (requires yt-dlp + network)', () => {
+	// Public reels that have previously been in the app queue
+	const TEST_REELS = [
+		{
+			url: 'https://www.instagram.com/reel/DX4XEDZt3qT/',
+			expectKeyword: 'pizza'
+		},
+		{
+			url: 'https://www.instagram.com/reel/DUtHm2EiD26/',
+			expectKeyword: 'noodles'
+		}
+	];
+
+	for (const { url, expectKeyword } of TEST_REELS) {
+		it(`extracts caption from ${url}`, async () => {
+			const events: { type: string; message: string }[] = [];
+			const result = await extractTextAndThumbnail(url, (e) =>
+				events.push(e as { type: string; message: string })
+			);
+
+			expect(result.bodyText.length).toBeGreaterThan(20);
+			expect(result.bodyText.toLowerCase()).toContain(expectKeyword);
+
+			if (result.thumbnail !== null) {
+				expect(result.thumbnail).toMatch(/^data:image\//);
+			}
+
+			expect(events.some((e) => e.type === 'complete')).toBe(true);
+			expect(events.some((e) => e.type === 'status' && e.message.includes('yt-dlp'))).toBe(true);
+		}, 90_000);
+	}
+});
--- a/src/tests/instagram-extractor.spec.ts
+++ b/src/tests/instagram-extractor.spec.ts
@@ -0,0 +1,171 @@
+import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
+
+// Mock node:child_process before importing the SUT. The SUT uses
+// promisify(execFile); without the Node-internal special handling, promisify
+// would only forward the first callback arg. We sidestep that by returning a
+// pre-promisified function tagged with util.promisify.custom that resolves
+// to {stdout, stderr}.
+import * as util from 'node:util';
+const execFileMock = vi.fn();
+vi.mock('node:child_process', () => {
+	const execFile: any = () => {
+		throw new Error('callback form not used in tests');
+	};
+	execFile[util.promisify.custom] = (cmd: string, args: string[], opts: any) =>
+		execFileMock(cmd, args, opts);
+	return { execFile };
+});
+
+const existsSyncMock = vi.fn();
+vi.mock('node:fs', () => ({
+	existsSync: (p: string) => existsSyncMock(p)
+}));
+
+import { extractTextAndThumbnail } from '../lib/server/instagram-extractor';
+
+describe('instagram-extractor (yt-dlp backend)', () => {
+	const originalFetch = globalThis.fetch;
+
+	beforeEach(() => {
+		execFileMock.mockReset();
+		existsSyncMock.mockReset();
+		existsSyncMock.mockReturnValue(false);
+	});
+
+	afterEach(() => {
+		globalThis.fetch = originalFetch;
+	});
+
+	it('parses yt-dlp JSON and returns bodyText + thumbnail data URI', async () => {
+		execFileMock.mockResolvedValue({
+			stdout: JSON.stringify({
+				description: 'Pasta carbonara: 200g spaghetti, 100g pancetta, 2 eggs.',
+				thumbnail: 'https://example.com/thumb.jpg'
+			}),
+			stderr: ''
+		});
+
+		globalThis.fetch = vi.fn().mockResolvedValue({
+			status: 200,
+			headers: { get: () => 'image/jpeg' },
+			arrayBuffer: () => Promise.resolve(new Uint8Array([1, 2, 3]).buffer)
+		}) as unknown as typeof fetch;
+
+		const result = await extractTextAndThumbnail('https://www.instagram.com/reel/abc123/');
+
+		expect(result.bodyText).toContain('carbonara');
+		expect(result.thumbnail).toMatch(/^data:image\/jpeg;base64,/);
+	});
+
+	it('falls back to first thumbnails entry when top-level thumbnail is absent', async () => {
+		execFileMock.mockResolvedValue({
+			stdout: JSON.stringify({
+				description: 'Recipe text',
+				thumbnails: [{ url: 'https://example.com/alt-thumb.jpg' }]
+			}),
+			stderr: ''
+		});
+
+		globalThis.fetch = vi.fn().mockResolvedValue({
+			status: 200,
+			headers: { get: () => 'image/png' },
+			arrayBuffer: () => Promise.resolve(new Uint8Array([4, 5, 6]).buffer)
+		}) as unknown as typeof fetch;
+
+		const result = await extractTextAndThumbnail('https://www.instagram.com/reel/abc/');
+		expect(result.thumbnail).toMatch(/^data:image\/png;base64,/);
+	});
+
+	it('returns null thumbnail when fetch fails', async () => {
+		execFileMock.mockResolvedValue({
+			stdout: JSON.stringify({
+				description: 'Recipe text',
+				thumbnail: 'https://example.com/missing.jpg'
+			}),
+			stderr: ''
+		});
+		globalThis.fetch = vi.fn().mockResolvedValue({
+			status: 404,
+			headers: { get: () => 'text/html' },
+			arrayBuffer: () => Promise.resolve(new ArrayBuffer(0))
+		}) as unknown as typeof fetch;
+
+		const result = await extractTextAndThumbnail('https://www.instagram.com/reel/abc/');
+		expect(result.bodyText).toBe('Recipe text');
+		expect(result.thumbnail).toBeNull();
+	});
+
+	it('passes --cookies flag when secrets/cookies.txt exists', async () => {
+		existsSyncMock.mockImplementation((p: string) => p.endsWith('cookies.txt'));
+		execFileMock.mockResolvedValue({
+			stdout: JSON.stringify({ description: 'x', thumbnail: null }),
+			stderr: ''
+		});
+
+		await extractTextAndThumbnail('https://www.instagram.com/reel/abc/');
+
+		const [, args] = execFileMock.mock.calls[0];
+		expect(args).toContain('--cookies');
+		const idx = (args as string[]).indexOf('--cookies');
+		expect((args as string[])[idx + 1]).toMatch(/cookies\.txt$/);
+	});
+
+	it('omits --cookies flag when no cookie file is present', async () => {
+		existsSyncMock.mockReturnValue(false);
+		execFileMock.mockResolvedValue({
+			stdout: JSON.stringify({ description: 'x', thumbnail: null }),
+			stderr: ''
+		});
+
+		await extractTextAndThumbnail('https://www.instagram.com/reel/abc/');
+
+		const [, args] = execFileMock.mock.calls[0];
+		expect(args).not.toContain('--cookies');
+	});
+
+	it('throws non-recoverable error on "Login required" stderr', async () => {
+		const err: any = new Error('yt-dlp failed');
+		err.stderr = 'ERROR: [Instagram] xyz: Login required to access this post.';
+		execFileMock.mockRejectedValue(err);
+
+		await expect(
+			extractTextAndThumbnail('https://www.instagram.com/reel/private/')
+		).rejects.toThrow(/authentication/);
+	});
+
+	it('throws clear error when yt-dlp binary is missing (ENOENT)', async () => {
+		const err: any = new Error('not found');
+		err.code = 'ENOENT';
+		execFileMock.mockRejectedValue(err);
+
+		await expect(
+			extractTextAndThumbnail('https://www.instagram.com/reel/abc/')
+		).rejects.toThrow(/yt-dlp is not installed/);
+	});
+
+	it('throws when description is empty', async () => {
+		execFileMock.mockResolvedValue({
+			stdout: JSON.stringify({ description: '', thumbnail: null }),
+			stderr: ''
+		});
+
+		await expect(
+			extractTextAndThumbnail('https://www.instagram.com/reel/empty/')
+		).rejects.toThrow(/no description/);
+	});
+
+	it('emits progress events through the callback', async () => {
+		execFileMock.mockResolvedValue({
+			stdout: JSON.stringify({ description: 'x', thumbnail: null }),
+			stderr: ''
+		});
+
+		const events: any[] = [];
+		await extractTextAndThumbnail('https://www.instagram.com/reel/abc/', (e) =>
+			events.push(e)
+		);
+
+		expect(events.some((e) => e.type === 'status' && e.message.includes('yt-dlp'))).toBe(true);
+		expect(events.some((e) => e.type === 'complete')).toBe(true);
+	});
+});
--- a/src/tests/queue-processor-logging.spec.ts
+++ b/src/tests/queue-processor-logging.spec.ts
@@ -18,7 +18,7 @@ vi.mock('$lib/server/tandoor', () => ({
 }));

 import { queueManager } from '$lib/server/queue/QueueManager';
-import * as extraction from '$lib/server/extraction';
+import * as instagramExtractor from '$lib/server/instagram-extractor';
 import { queueProcessor } from '$lib/server/queue/QueueProcessor';

 describe('QueueProcessor logging', () => {
@@ -50,8 +50,8 @@ describe('QueueProcessor logging', () => {
 		(complexError as any).code = 'ERR_TEST';
 		(complexError as any).details = { phase: 'extraction', retries: 3 };

-		// Mock extraction to fail BEFORE starting processor
-		const extractSpy = vi.spyOn(extraction, 'extractTextAndThumbnail');
+		// Mock extraction to fail BEFORE starting processor (default backend = ytdlp)
+		const extractSpy = vi.spyOn(instagramExtractor, 'extractTextAndThumbnail');
 		extractSpy.mockRejectedValueOnce(complexError);

 		const item = queueManager.enqueue('https://instagram.com/p/TEST');
--- a/src/tests/queue-processor.spec.ts
+++ b/src/tests/queue-processor.spec.ts
@@ -35,13 +35,21 @@ vi.mock('$lib/server/queue/config', () => ({
 	}
 }));

-// Mock external dependencies BEFORE importing QueueProcessor
+// Mock external dependencies BEFORE importing QueueProcessor.
+// QueueProcessor.extractionPhase picks between two extractor modules based on
+// EXTRACTOR_BACKEND; mock both so behavior is identical regardless of default.
 vi.mock('$lib/server/extraction', () => ({
 	extractTextAndThumbnail: vi.fn().mockResolvedValue({
 		bodyText: 'Default recipe text',
 		thumbnail: null
 	})
 }));
+vi.mock('$lib/server/instagram-extractor', () => ({
+	extractTextAndThumbnail: vi.fn().mockResolvedValue({
+		bodyText: 'Default recipe text',
+		thumbnail: null
+	})
+}));

 vi.mock('$lib/server/parser', () => ({
 	extractRecipe: vi.fn().mockResolvedValue({
@@ -62,11 +70,16 @@ vi.mock('$lib/server/tandoor', () => ({
 	})
 }));

-import { extractTextAndThumbnail } from '$lib/server/extraction';
+import { extractTextAndThumbnail as extractFromExtraction } from '$lib/server/extraction';
+import { extractTextAndThumbnail as extractFromYtDlp } from '$lib/server/instagram-extractor';
 import { extractRecipe } from '$lib/server/parser';
 import { uploadRecipeWithIngredientsDTO, uploadRecipeImage } from '$lib/server/tandoor';
 import * as configModule from '$lib/server/queue/config';

+// Alias used by existing assertions; default backend is ytdlp so the new
+// instagram-extractor mock is what the processor actually invokes.
+const extractTextAndThumbnail = extractFromYtDlp;
+
 // Import processor AFTER mocks - it will auto-start (imported for side effects)
 import '$lib/server/queue/QueueProcessor';

@@ -78,8 +91,13 @@ describe('QueueProcessor Integration Tests', () => {
 		// Reset mocks and their implementations
 		vi.resetAllMocks();

-		// Set default mock implementations
-		vi.mocked(extractTextAndThumbnail).mockResolvedValue({
+		// Set default mock implementations on BOTH backend modules so the test
+		// behavior is invariant to EXTRACTOR_BACKEND.
+		vi.mocked(extractFromExtraction).mockResolvedValue({
+			bodyText: 'Default recipe text',
+			thumbnail: null
+		});
+		vi.mocked(extractFromYtDlp).mockResolvedValue({
 			bodyText: 'Default recipe text',
 			thumbnail: null
 		});