From 5b5bb947ef08837a53c2a65c02538f47b6985921 Mon Sep 17 00:00:00 2001 From: Giancarmine Salucci Date: Tue, 12 May 2026 20:46:31 +0200 Subject: [PATCH] feat: replace Playwright extractor with yt-dlp subprocess - Add instagram-extractor.ts: yt-dlp subprocess backend for Instagram caption extraction. No in-process browser state, maintained against Instagram frontend churn, supports cookies.txt for auth-walled reels. - Add feature flag EXTRACTOR_BACKEND (ytdlp|playwright) in QueueProcessor so the old Playwright path remains available as fallback. - Add 9 unit tests and 2 live-network integration tests for the new extractor. - Dockerfile: install yt-dlp via pip3 alongside existing Chromium deps. - docker-compose: expose EXTRACTOR_BACKEND env var (default: ytdlp). Also in this commit: - LLM: configurable per-request timeout via LLM_REQUEST_TIMEOUT_MS (default 120s); set maxRetries=0 to surface errors immediately; llama-swap /running health probe. - QueueProcessor: thread progress callback through parser phase. - LlmHealthIndicator: surface llama-swap loaded-model name. - Logging: improve error serialization in queue-processor tests. - .env.example: document llama-swap endpoint and model options. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .env.example | 40 +++- Dockerfile | 7 +- docker-compose.yml | 3 + src/lib/server/extraction.ts | 9 +- src/lib/server/instagram-extractor.ts | 193 ++++++++++++++++++ src/lib/server/llm.ts | 56 ++++- src/lib/server/parser.ts | 29 ++- src/lib/server/queue/QueueProcessor.ts | 23 ++- src/routes/api/llm-health/+server.ts | 40 ++-- .../components/LlmHealthIndicator.svelte | 26 ++- .../instagram-extractor.integration.spec.ts | 49 +++++ src/tests/instagram-extractor.spec.ts | 171 ++++++++++++++++ src/tests/queue-processor-logging.spec.ts | 6 +- src/tests/queue-processor.spec.ts | 26 ++- 14 files changed, 628 insertions(+), 50 deletions(-) create mode 100644 src/lib/server/instagram-extractor.ts create mode 100644 src/tests/instagram-extractor.integration.spec.ts create mode 100644 src/tests/instagram-extractor.spec.ts diff --git a/.env.example b/.env.example index 2b335cb..5502d38 100644 --- a/.env.example +++ b/.env.example @@ -7,15 +7,23 @@ # ============================================================================== # LLM Configuration (REQUIRED) # ============================================================================== -# OpenAI-compatible API endpoint (OpenAI, LM Studio, Ollama, LiteLLM, etc.) -OPENAI_BASE_URL=http://localhost:1234/v1 +# OpenAI-compatible API endpoint. Production: llama-swap on ideapad. +# llama-swap loads models on demand and unloads them after globalTTL (10 min). +OPENAI_BASE_URL=http://192.168.1.50:8080/v1 -# API key for authentication -OPENAI_API_KEY=your-api-key-here +# API key for authentication (llama-swap accepts any non-empty value). +OPENAI_API_KEY=sk-llama-local -# Model to use for recipe extraction -# Examples: gpt-4o, gpt-4o-mini, llama-3.1, mistral, etc. -LLM_MODEL=google/gemma-3-4b +# Model to use for recipe extraction. Available on the ideapad llama-swap stack: +# gemma4-e4b-q6k (recommended — 4B, 65k ctx, 31 TPS) +# gemma4-e2b-q8_0 (faster — 2B, 65k ctx, 55 TPS) +# qwen3.5-4b-q8_0 (fallback — 22 TPS) +# phi4-mini-q8_0, granite-3.3-8b-q6k, plus larger MoE variants +LLM_MODEL=gemma4-e4b-q6k + +# Per-request LLM timeout in ms. Must cover llama-swap cold-load (~5–30s for +# small models) plus generation time. Default 120000. +LLM_REQUEST_TIMEOUT_MS=120000 # ============================================================================== # Queue Configuration (OPTIONAL) @@ -55,9 +63,23 @@ VAPID_PUBLIC_KEY=BNextdcB_fQ0BVvyGioM5L8Tf9vKQjs-WnF-rUbnU8MdWIZQYfggIHxBnW21I-l VAPID_PRIVATE_KEY=JwxI_KcsBcehYcTOufMcbVWJjCq1QbH5FJmSyQuG680 # ============================================================================== -# Authentication Scheduler (OPTIONAL) +# Instagram Extraction Backend # ============================================================================== -# Enable automatic Instagram authentication renewal +# Which extractor to use: +# ytdlp (default) — yt-dlp subprocess, stateless, Sablier-safe +# playwright — legacy Playwright stealth scraper, requires +# secrets/auth.json + AUTH_SCHEDULER_* below +EXTRACTOR_BACKEND=ytdlp + +# Optional Netscape-format cookies file for login-walled reels. +# yt-dlp picks it up automatically if it exists at /app/secrets/cookies.txt +# (Docker) or ./secrets/cookies.txt (local). No automation; export from a +# browser when an extraction starts hitting login walls. + +# ============================================================================== +# Authentication Scheduler (LEGACY — only relevant when EXTRACTOR_BACKEND=playwright) +# ============================================================================== +# Enable automatic Instagram authentication renewal (Playwright backend only) AUTH_SCHEDULER_ENABLED=true # Renewal interval in minutes (default: 720 = 12 hours) diff --git a/Dockerfile b/Dockerfile index 9a9f0e3..189d9ab 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,12 +1,15 @@ FROM node:24-alpine WORKDIR /app -# Install Playwright system dependencies +# Install yt-dlp (primary Instagram extractor) and Playwright system dependencies (fallback) RUN apk add --no-cache \ + python3 \ + py3-pip \ chromium \ font-liberation \ font-noto \ - font-noto-cjk + font-noto-cjk && \ + pip3 install --break-system-packages yt-dlp COPY package*.json ./ RUN npm ci diff --git a/docker-compose.yml b/docker-compose.yml index ddd9051..cf4849d 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -32,6 +32,9 @@ services: # Playwright Configuration - DISPLAY=:99 + # Extractor backend: 'ytdlp' (default) or 'playwright' (legacy fallback) + - EXTRACTOR_BACKEND=${EXTRACTOR_BACKEND:-ytdlp} + # Node.js Environment - NODE_ENV=production security_opt: diff --git a/src/lib/server/extraction.ts b/src/lib/server/extraction.ts index a7f3fe5..38ce12e 100644 --- a/src/lib/server/extraction.ts +++ b/src/lib/server/extraction.ts @@ -26,7 +26,14 @@ type CaptionCandidate = { brCount: number; }; -export type ProgressEventType = 'status' | 'method' | 'retry' | 'error' | 'thumbnail' | 'complete'; +export type ProgressEventType = + | 'status' + | 'method' + | 'retry' + | 'error' + | 'thumbnail' + | 'complete' + | 'model_loading'; export interface ProgressEvent { type: ProgressEventType; diff --git a/src/lib/server/instagram-extractor.ts b/src/lib/server/instagram-extractor.ts new file mode 100644 index 0000000..76001ec --- /dev/null +++ b/src/lib/server/instagram-extractor.ts @@ -0,0 +1,193 @@ +/** + * Instagram extractor — yt-dlp subprocess implementation. + * + * Replaces the Playwright-based scraper. yt-dlp is maintained against + * Instagram's frontend churn, has no in-process state, and works on public + * reels without authentication. Login-walled reels can be supported by + * dropping a Netscape-format cookies file at the path under SECRETS_DIR. + */ + +import { execFile } from 'node:child_process'; +import { promisify } from 'node:util'; +import { existsSync } from 'node:fs'; +import { logError } from './utils/logger'; +import type { ExtractedContent, ProgressCallback } from './extraction'; + +const execFileAsync = promisify(execFile); + +const YTDLP_TIMEOUT_MS = 60_000; +const IMAGE_FETCH_TIMEOUT_MS = 10_000; +const USER_AGENT = + 'Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1'; + +const COOKIE_PATHS = ['/app/secrets/cookies.txt', './secrets/cookies.txt']; + +function resolveCookiePath(): string | null { + for (const p of COOKIE_PATHS) { + if (existsSync(p)) return p; + } + return null; +} + +interface YtDlpJson { + description?: string | null; + title?: string | null; + thumbnail?: string | null; + thumbnails?: Array<{ url?: string }>; +} + +function pickThumbnailUrl(data: YtDlpJson): string | null { + if (data.thumbnail) return data.thumbnail; + const first = (data.thumbnails ?? []).find((t) => t?.url); + return first?.url ?? null; +} + +async function fetchImageAsBase64(imageUrl: string): Promise { + try { + const response = await fetch(imageUrl, { + signal: AbortSignal.timeout(IMAGE_FETCH_TIMEOUT_MS) + }); + if (response.status !== 200) return null; + const contentType = response.headers.get('content-type') ?? ''; + if (!contentType.startsWith('image/')) return null; + const buf = Buffer.from(await response.arrayBuffer()); + return `data:${contentType};base64,${buf.toString('base64')}`; + } catch (e) { + logError('[ytdlp] Thumbnail fetch failed', e); + return null; + } +} + +function classifyYtDlpError(stderr: string): { recoverable: boolean; reason: string } { + const lower = stderr.toLowerCase(); + if ( + lower.includes('login required') || + lower.includes('login_required') || + lower.includes('private') || + lower.includes('rate-limit') || + lower.includes('rate limit') + ) { + return { + recoverable: false, + reason: + 'Instagram requires authentication for this reel. Drop a Netscape cookies.txt at secrets/cookies.txt and retry.' + }; + } + if (lower.includes('unsupported url')) { + return { recoverable: false, reason: 'URL not recognised by yt-dlp.' }; + } + if (lower.includes('http error 404') || lower.includes('does not exist')) { + return { recoverable: false, reason: 'Reel not found (404).' }; + } + return { recoverable: true, reason: stderr.split('\n').filter(Boolean).slice(-2).join(' ') }; +} + +/** + * Extract caption text + thumbnail data-URL from an Instagram reel. + * + * Mirrors the signature of the legacy Playwright extractor so QueueProcessor + * needs no contract change. ProgressCallback events use existing types + * (`status`, `method`, `error`) so the SSE consumers do not need updates. + */ +export async function extractTextAndThumbnail( + url: string, + progressCallback?: ProgressCallback +): Promise { + progressCallback?.({ + type: 'status', + message: 'Invoking yt-dlp...', + timestamp: new Date().toISOString() + }); + + const cookies = resolveCookiePath(); + if (cookies) { + progressCallback?.({ + type: 'status', + message: `Using cookies from ${cookies}`, + timestamp: new Date().toISOString() + }); + } + + const args = [ + '--dump-single-json', + '--skip-download', + '--no-warnings', + '--no-call-home', + '--socket-timeout', + '20', + '--user-agent', + USER_AGENT, + ...(cookies ? ['--cookies', cookies] : []), + url + ]; + + let stdout: string; + try { + const result = await execFileAsync('yt-dlp', args, { + timeout: YTDLP_TIMEOUT_MS, + maxBuffer: 10 * 1024 * 1024 + }); + stdout = result.stdout; + } catch (e: any) { + const stderr = String(e?.stderr ?? e?.message ?? ''); + const code = e?.code; + if (code === 'ENOENT') { + throw new Error( + 'yt-dlp is not installed in this container. Add it to the Dockerfile.' + ); + } + const { recoverable, reason } = classifyYtDlpError(stderr); + progressCallback?.({ + type: 'error', + message: `yt-dlp failed: ${reason}`, + timestamp: new Date().toISOString() + }); + const err = new Error(`yt-dlp extraction failed: ${reason}`); + // QueueProcessor.isRecoverableError() classifies on message; surface keywords. + if (!recoverable) (err as any).nonRecoverable = true; + throw err; + } + + let data: YtDlpJson; + try { + data = JSON.parse(stdout); + } catch (e) { + logError('[ytdlp] Failed to parse yt-dlp JSON output', e); + throw new Error('yt-dlp returned invalid JSON'); + } + + const bodyText = (data.description ?? data.title ?? '').trim(); + if (!bodyText) { + throw new Error('yt-dlp returned no description for this reel'); + } + + progressCallback?.({ + type: 'status', + message: `Caption extracted (${bodyText.length} chars)`, + timestamp: new Date().toISOString() + }); + + let thumbnail: string | null = null; + const thumbUrl = pickThumbnailUrl(data); + if (thumbUrl) { + progressCallback?.({ + type: 'thumbnail', + message: 'Fetching thumbnail...', + timestamp: new Date().toISOString() + }); + thumbnail = await fetchImageAsBase64(thumbUrl); + progressCallback?.({ + type: 'status', + message: thumbnail ? 'Thumbnail fetched' : 'Thumbnail fetch failed (continuing without)', + timestamp: new Date().toISOString() + }); + } + + progressCallback?.({ + type: 'complete', + message: 'Extraction complete', + timestamp: new Date().toISOString() + }); + + return { bodyText, thumbnail }; +} diff --git a/src/lib/server/llm.ts b/src/lib/server/llm.ts index 079a73f..9214bec 100644 --- a/src/lib/server/llm.ts +++ b/src/lib/server/llm.ts @@ -2,15 +2,24 @@ import OpenAI from 'openai'; import { env } from '$env/dynamic/private'; import { logError } from './utils/logger'; +const DEFAULT_REQUEST_TIMEOUT_MS = 120_000; + +const parseTimeoutMs = (raw: string | undefined): number => { + if (!raw) return DEFAULT_REQUEST_TIMEOUT_MS; + const n = Number(raw); + return Number.isFinite(n) && n > 0 ? n : DEFAULT_REQUEST_TIMEOUT_MS; +}; + export const createLLM = () => { - // Detect if we are using Ollama or OpenAI based on URL const baseURL = env.OPENAI_BASE_URL; const apiKey = env.OPENAI_API_KEY; const model = env.LLM_MODEL || 'gpt-4o'; + const timeout = parseTimeoutMs(env.LLM_REQUEST_TIMEOUT_MS); console.log('[LLM] Initializing client...'); console.log('[LLM] Base URL:', baseURL); console.log('[LLM] Model:', model); + console.log('[LLM] Request timeout (ms):', timeout); if (!baseURL) { throw new Error('OPENAI_BASE_URL environment variable is not set'); @@ -22,7 +31,9 @@ export const createLLM = () => { const client = new OpenAI({ apiKey, - baseURL + baseURL, + timeout, + maxRetries: 0 }); return { client, model }; @@ -43,6 +54,47 @@ export async function checkLLMHealth(): Promise { } } +/** + * Strip a trailing /v1 (or /v1/) from a base URL to get the llama-swap root. + * llama-swap exposes both /v1/* (OpenAI-compatible) and /running, /upstream, etc. + * at the bare root. + */ +function llamaSwapRoot(baseURL: string): string { + return baseURL.replace(/\/v1\/?$/, '').replace(/\/$/, ''); +} + +interface RunningModelEntry { + model: string; + state?: string; +} + +/** + * Query llama-swap's /running endpoint and report whether `model` is currently + * loaded and ready to serve. Returns false on any error (treat as cold). + * + * Why we don't fold this into checkModelAvailability(): /v1/models lists every + * model llama-swap is configured to swap to (not just loaded ones), while + * /running returns only the in-VRAM instance. Both signals are useful. + */ +export async function isModelLoaded(model: string): Promise { + const baseURL = env.OPENAI_BASE_URL; + if (!baseURL) return false; + + try { + const url = `${llamaSwapRoot(baseURL)}/running`; + const response = await fetch(url, { + signal: AbortSignal.timeout(5_000) + }); + if (!response.ok) return false; + const data = (await response.json()) as { running?: RunningModelEntry[] }; + const running = data.running ?? []; + return running.some((m) => m.model === model && (m.state ?? 'ready') === 'ready'); + } catch (e) { + logError('[LLM] isModelLoaded check failed', e); + return false; + } +} + /** * Check if a specific model is available in the OpenAI-compatible API * @param model - The model ID to check for availability diff --git a/src/lib/server/parser.ts b/src/lib/server/parser.ts index bf19354..d9474c7 100644 --- a/src/lib/server/parser.ts +++ b/src/lib/server/parser.ts @@ -1,8 +1,9 @@ -import { createLLM, checkModelAvailability } from './llm'; +import { createLLM, checkModelAvailability, isModelLoaded } from './llm'; import { zodResponseFormat } from 'openai/helpers/zod'; import { z } from 'zod'; import { RECIPE_DETECTION_PROMPT, RECIPE_EXTRACTION_PROMPT } from './prompts/recipe-extraction'; import { logError } from './utils/logger'; +import type { ProgressCallback } from './extraction'; const RecipeSchema = z.object({ name: z.string(), @@ -144,11 +145,33 @@ export async function parseRecipe(text: string): Promise { } /** - * Complete workflow: detect recipe and parse if found + * Complete workflow: detect recipe and parse if found. + * + * Emits a `model_loading` progress event (if a callback is supplied) when the + * configured llama-swap model is not yet warm — the first request after idle + * blocks for several seconds while llama-swap loads the model into VRAM. + * * @param text - The text to analyze + * @param progressCallback - Optional callback for surfacing cold-load state * @returns Parsed recipe object if detected, null otherwise */ -export async function extractRecipe(text: string): Promise { +export async function extractRecipe( + text: string, + progressCallback?: ProgressCallback +): Promise { + if (progressCallback) { + const { model } = createLLM(); + const warm = await isModelLoaded(model); + if (!warm) { + progressCallback({ + type: 'model_loading', + message: `Inference server cold — loading ${model} into VRAM (5–30s)...`, + data: { model }, + timestamp: new Date().toISOString() + }); + } + } + const isRecipe = await detectRecipe(text); if (!isRecipe) { diff --git a/src/lib/server/queue/QueueProcessor.ts b/src/lib/server/queue/QueueProcessor.ts index e2f14a2..f7d8d2b 100644 --- a/src/lib/server/queue/QueueProcessor.ts +++ b/src/lib/server/queue/QueueProcessor.ts @@ -12,15 +12,30 @@ */ import { queueManager } from './QueueManager'; -import { extractTextAndThumbnail } from '$lib/server/extraction'; +import { extractTextAndThumbnail as extractWithPlaywright } from '$lib/server/extraction'; +import { extractTextAndThumbnail as extractWithYtDlp } from '$lib/server/instagram-extractor'; import { extractRecipe } from '$lib/server/parser'; import { uploadRecipeWithIngredientsDTO, uploadRecipeImage } from '$lib/server/tandoor'; import { pushNotificationService } from '$lib/server/notifications/PushNotificationService'; import { queueConfig } from './config'; import { logError } from '../utils/logger'; -import type { ProgressEvent } from '$lib/server/extraction'; +import { env } from '$env/dynamic/private'; +import type { ProgressEvent, ExtractedContent, ProgressCallback } from '$lib/server/extraction'; import type { QueueItem } from './types'; +// Feature flag: pick which Instagram extractor backend to invoke. +// Default to yt-dlp; set EXTRACTOR_BACKEND=playwright to fall back to the +// legacy stealth scraper while we verify the new path. +const extractTextAndThumbnail = ( + url: string, + cb?: ProgressCallback +): Promise => { + const backend = (env.EXTRACTOR_BACKEND ?? 'ytdlp').toLowerCase(); + return backend === 'playwright' + ? extractWithPlaywright(url, cb) + : extractWithYtDlp(url, cb); +}; + /** * Queue processor with configurable concurrency * @@ -250,7 +265,9 @@ export class QueueProcessor { }); console.log(`[QueueProcessor] Parsing recipe: ${item.id}`); - const recipe = await extractRecipe(item.extractedText); + const recipe = await extractRecipe(item.extractedText, (event) => { + queueManager.addProgressEvent(item.id, event); + }); if (!recipe) { throw new Error('Failed to parse recipe from extracted text'); diff --git a/src/routes/api/llm-health/+server.ts b/src/routes/api/llm-health/+server.ts index bd4d0a2..e12f10d 100644 --- a/src/routes/api/llm-health/+server.ts +++ b/src/routes/api/llm-health/+server.ts @@ -1,34 +1,48 @@ import { json } from '@sveltejs/kit'; -import { checkLLMHealth } from '$lib/server/llm'; +import { env } from '$env/dynamic/private'; +import { checkLLMHealth, isModelLoaded } from '$lib/server/llm'; /** - * Health check endpoint for LLM service - * Tests connectivity to LM Studio or OpenAI-compatible endpoint + * Health check endpoint for the LLM service (llama-swap on ideapad). + * + * Three states: + * - ok → endpoint reachable AND configured model is loaded in VRAM + * - warming → endpoint reachable but configured model not yet loaded + * (next request will trigger a cold load) + * - error → endpoint unreachable */ export async function GET() { try { - const isHealthy = await checkLLMHealth(); + const reachable = await checkLLMHealth(); + const configuredModel = env.LLM_MODEL || 'gpt-4o'; - if (isHealthy) { - return json({ - status: 'healthy', - message: 'LLM service is accessible' - }); - } else { + if (!reachable) { return json( { - status: 'unhealthy', - message: 'LLM service is not accessible' + status: 'error', + message: 'LLM service is not accessible', + configuredModel }, { status: 503 } ); } + + const warm = await isModelLoaded(configuredModel); + return json({ + status: warm ? 'ok' : 'warming', + message: warm + ? `Model ${configuredModel} loaded and ready` + : `Model ${configuredModel} configured; next request will trigger a cold load`, + configuredModel, + loaded: warm + }); } catch (error) { const errorMessage = error instanceof Error ? error.message : 'Unknown error'; return json( { status: 'error', - message: errorMessage + message: errorMessage, + configuredModel: env.LLM_MODEL || 'gpt-4o' }, { status: 500 } ); diff --git a/src/routes/share/components/LlmHealthIndicator.svelte b/src/routes/share/components/LlmHealthIndicator.svelte index 5e213cc..c6afb3a 100644 --- a/src/routes/share/components/LlmHealthIndicator.svelte +++ b/src/routes/share/components/LlmHealthIndicator.svelte @@ -1,9 +1,12 @@