feat: replace Playwright extractor with yt-dlp subprocess
- Add instagram-extractor.ts: yt-dlp subprocess backend for Instagram caption extraction. No in-process browser state, maintained against Instagram frontend churn, supports cookies.txt for auth-walled reels. - Add feature flag EXTRACTOR_BACKEND (ytdlp|playwright) in QueueProcessor so the old Playwright path remains available as fallback. - Add 9 unit tests and 2 live-network integration tests for the new extractor. - Dockerfile: install yt-dlp via pip3 alongside existing Chromium deps. - docker-compose: expose EXTRACTOR_BACKEND env var (default: ytdlp). Also in this commit: - LLM: configurable per-request timeout via LLM_REQUEST_TIMEOUT_MS (default 120s); set maxRetries=0 to surface errors immediately; llama-swap /running health probe. - QueueProcessor: thread progress callback through parser phase. - LlmHealthIndicator: surface llama-swap loaded-model name. - Logging: improve error serialization in queue-processor tests. - .env.example: document llama-swap endpoint and model options. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -26,7 +26,14 @@ type CaptionCandidate = {
|
||||
brCount: number;
|
||||
};
|
||||
|
||||
export type ProgressEventType = 'status' | 'method' | 'retry' | 'error' | 'thumbnail' | 'complete';
|
||||
export type ProgressEventType =
|
||||
| 'status'
|
||||
| 'method'
|
||||
| 'retry'
|
||||
| 'error'
|
||||
| 'thumbnail'
|
||||
| 'complete'
|
||||
| 'model_loading';
|
||||
|
||||
export interface ProgressEvent {
|
||||
type: ProgressEventType;
|
||||
|
||||
193
src/lib/server/instagram-extractor.ts
Normal file
193
src/lib/server/instagram-extractor.ts
Normal file
@@ -0,0 +1,193 @@
|
||||
/**
|
||||
* Instagram extractor — yt-dlp subprocess implementation.
|
||||
*
|
||||
* Replaces the Playwright-based scraper. yt-dlp is maintained against
|
||||
* Instagram's frontend churn, has no in-process state, and works on public
|
||||
* reels without authentication. Login-walled reels can be supported by
|
||||
* dropping a Netscape-format cookies file at the path under SECRETS_DIR.
|
||||
*/
|
||||
|
||||
import { execFile } from 'node:child_process';
|
||||
import { promisify } from 'node:util';
|
||||
import { existsSync } from 'node:fs';
|
||||
import { logError } from './utils/logger';
|
||||
import type { ExtractedContent, ProgressCallback } from './extraction';
|
||||
|
||||
const execFileAsync = promisify(execFile);
|
||||
|
||||
const YTDLP_TIMEOUT_MS = 60_000;
|
||||
const IMAGE_FETCH_TIMEOUT_MS = 10_000;
|
||||
const USER_AGENT =
|
||||
'Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1';
|
||||
|
||||
const COOKIE_PATHS = ['/app/secrets/cookies.txt', './secrets/cookies.txt'];
|
||||
|
||||
function resolveCookiePath(): string | null {
|
||||
for (const p of COOKIE_PATHS) {
|
||||
if (existsSync(p)) return p;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
interface YtDlpJson {
|
||||
description?: string | null;
|
||||
title?: string | null;
|
||||
thumbnail?: string | null;
|
||||
thumbnails?: Array<{ url?: string }>;
|
||||
}
|
||||
|
||||
function pickThumbnailUrl(data: YtDlpJson): string | null {
|
||||
if (data.thumbnail) return data.thumbnail;
|
||||
const first = (data.thumbnails ?? []).find((t) => t?.url);
|
||||
return first?.url ?? null;
|
||||
}
|
||||
|
||||
async function fetchImageAsBase64(imageUrl: string): Promise<string | null> {
|
||||
try {
|
||||
const response = await fetch(imageUrl, {
|
||||
signal: AbortSignal.timeout(IMAGE_FETCH_TIMEOUT_MS)
|
||||
});
|
||||
if (response.status !== 200) return null;
|
||||
const contentType = response.headers.get('content-type') ?? '';
|
||||
if (!contentType.startsWith('image/')) return null;
|
||||
const buf = Buffer.from(await response.arrayBuffer());
|
||||
return `data:${contentType};base64,${buf.toString('base64')}`;
|
||||
} catch (e) {
|
||||
logError('[ytdlp] Thumbnail fetch failed', e);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
function classifyYtDlpError(stderr: string): { recoverable: boolean; reason: string } {
|
||||
const lower = stderr.toLowerCase();
|
||||
if (
|
||||
lower.includes('login required') ||
|
||||
lower.includes('login_required') ||
|
||||
lower.includes('private') ||
|
||||
lower.includes('rate-limit') ||
|
||||
lower.includes('rate limit')
|
||||
) {
|
||||
return {
|
||||
recoverable: false,
|
||||
reason:
|
||||
'Instagram requires authentication for this reel. Drop a Netscape cookies.txt at secrets/cookies.txt and retry.'
|
||||
};
|
||||
}
|
||||
if (lower.includes('unsupported url')) {
|
||||
return { recoverable: false, reason: 'URL not recognised by yt-dlp.' };
|
||||
}
|
||||
if (lower.includes('http error 404') || lower.includes('does not exist')) {
|
||||
return { recoverable: false, reason: 'Reel not found (404).' };
|
||||
}
|
||||
return { recoverable: true, reason: stderr.split('\n').filter(Boolean).slice(-2).join(' ') };
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract caption text + thumbnail data-URL from an Instagram reel.
|
||||
*
|
||||
* Mirrors the signature of the legacy Playwright extractor so QueueProcessor
|
||||
* needs no contract change. ProgressCallback events use existing types
|
||||
* (`status`, `method`, `error`) so the SSE consumers do not need updates.
|
||||
*/
|
||||
export async function extractTextAndThumbnail(
|
||||
url: string,
|
||||
progressCallback?: ProgressCallback
|
||||
): Promise<ExtractedContent> {
|
||||
progressCallback?.({
|
||||
type: 'status',
|
||||
message: 'Invoking yt-dlp...',
|
||||
timestamp: new Date().toISOString()
|
||||
});
|
||||
|
||||
const cookies = resolveCookiePath();
|
||||
if (cookies) {
|
||||
progressCallback?.({
|
||||
type: 'status',
|
||||
message: `Using cookies from ${cookies}`,
|
||||
timestamp: new Date().toISOString()
|
||||
});
|
||||
}
|
||||
|
||||
const args = [
|
||||
'--dump-single-json',
|
||||
'--skip-download',
|
||||
'--no-warnings',
|
||||
'--no-call-home',
|
||||
'--socket-timeout',
|
||||
'20',
|
||||
'--user-agent',
|
||||
USER_AGENT,
|
||||
...(cookies ? ['--cookies', cookies] : []),
|
||||
url
|
||||
];
|
||||
|
||||
let stdout: string;
|
||||
try {
|
||||
const result = await execFileAsync('yt-dlp', args, {
|
||||
timeout: YTDLP_TIMEOUT_MS,
|
||||
maxBuffer: 10 * 1024 * 1024
|
||||
});
|
||||
stdout = result.stdout;
|
||||
} catch (e: any) {
|
||||
const stderr = String(e?.stderr ?? e?.message ?? '');
|
||||
const code = e?.code;
|
||||
if (code === 'ENOENT') {
|
||||
throw new Error(
|
||||
'yt-dlp is not installed in this container. Add it to the Dockerfile.'
|
||||
);
|
||||
}
|
||||
const { recoverable, reason } = classifyYtDlpError(stderr);
|
||||
progressCallback?.({
|
||||
type: 'error',
|
||||
message: `yt-dlp failed: ${reason}`,
|
||||
timestamp: new Date().toISOString()
|
||||
});
|
||||
const err = new Error(`yt-dlp extraction failed: ${reason}`);
|
||||
// QueueProcessor.isRecoverableError() classifies on message; surface keywords.
|
||||
if (!recoverable) (err as any).nonRecoverable = true;
|
||||
throw err;
|
||||
}
|
||||
|
||||
let data: YtDlpJson;
|
||||
try {
|
||||
data = JSON.parse(stdout);
|
||||
} catch (e) {
|
||||
logError('[ytdlp] Failed to parse yt-dlp JSON output', e);
|
||||
throw new Error('yt-dlp returned invalid JSON');
|
||||
}
|
||||
|
||||
const bodyText = (data.description ?? data.title ?? '').trim();
|
||||
if (!bodyText) {
|
||||
throw new Error('yt-dlp returned no description for this reel');
|
||||
}
|
||||
|
||||
progressCallback?.({
|
||||
type: 'status',
|
||||
message: `Caption extracted (${bodyText.length} chars)`,
|
||||
timestamp: new Date().toISOString()
|
||||
});
|
||||
|
||||
let thumbnail: string | null = null;
|
||||
const thumbUrl = pickThumbnailUrl(data);
|
||||
if (thumbUrl) {
|
||||
progressCallback?.({
|
||||
type: 'thumbnail',
|
||||
message: 'Fetching thumbnail...',
|
||||
timestamp: new Date().toISOString()
|
||||
});
|
||||
thumbnail = await fetchImageAsBase64(thumbUrl);
|
||||
progressCallback?.({
|
||||
type: 'status',
|
||||
message: thumbnail ? 'Thumbnail fetched' : 'Thumbnail fetch failed (continuing without)',
|
||||
timestamp: new Date().toISOString()
|
||||
});
|
||||
}
|
||||
|
||||
progressCallback?.({
|
||||
type: 'complete',
|
||||
message: 'Extraction complete',
|
||||
timestamp: new Date().toISOString()
|
||||
});
|
||||
|
||||
return { bodyText, thumbnail };
|
||||
}
|
||||
@@ -2,15 +2,24 @@ import OpenAI from 'openai';
|
||||
import { env } from '$env/dynamic/private';
|
||||
import { logError } from './utils/logger';
|
||||
|
||||
const DEFAULT_REQUEST_TIMEOUT_MS = 120_000;
|
||||
|
||||
const parseTimeoutMs = (raw: string | undefined): number => {
|
||||
if (!raw) return DEFAULT_REQUEST_TIMEOUT_MS;
|
||||
const n = Number(raw);
|
||||
return Number.isFinite(n) && n > 0 ? n : DEFAULT_REQUEST_TIMEOUT_MS;
|
||||
};
|
||||
|
||||
export const createLLM = () => {
|
||||
// Detect if we are using Ollama or OpenAI based on URL
|
||||
const baseURL = env.OPENAI_BASE_URL;
|
||||
const apiKey = env.OPENAI_API_KEY;
|
||||
const model = env.LLM_MODEL || 'gpt-4o';
|
||||
const timeout = parseTimeoutMs(env.LLM_REQUEST_TIMEOUT_MS);
|
||||
|
||||
console.log('[LLM] Initializing client...');
|
||||
console.log('[LLM] Base URL:', baseURL);
|
||||
console.log('[LLM] Model:', model);
|
||||
console.log('[LLM] Request timeout (ms):', timeout);
|
||||
|
||||
if (!baseURL) {
|
||||
throw new Error('OPENAI_BASE_URL environment variable is not set');
|
||||
@@ -22,7 +31,9 @@ export const createLLM = () => {
|
||||
|
||||
const client = new OpenAI({
|
||||
apiKey,
|
||||
baseURL
|
||||
baseURL,
|
||||
timeout,
|
||||
maxRetries: 0
|
||||
});
|
||||
|
||||
return { client, model };
|
||||
@@ -43,6 +54,47 @@ export async function checkLLMHealth(): Promise<boolean> {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Strip a trailing /v1 (or /v1/) from a base URL to get the llama-swap root.
|
||||
* llama-swap exposes both /v1/* (OpenAI-compatible) and /running, /upstream, etc.
|
||||
* at the bare root.
|
||||
*/
|
||||
function llamaSwapRoot(baseURL: string): string {
|
||||
return baseURL.replace(/\/v1\/?$/, '').replace(/\/$/, '');
|
||||
}
|
||||
|
||||
interface RunningModelEntry {
|
||||
model: string;
|
||||
state?: string;
|
||||
}
|
||||
|
||||
/**
|
||||
* Query llama-swap's /running endpoint and report whether `model` is currently
|
||||
* loaded and ready to serve. Returns false on any error (treat as cold).
|
||||
*
|
||||
* Why we don't fold this into checkModelAvailability(): /v1/models lists every
|
||||
* model llama-swap is configured to swap to (not just loaded ones), while
|
||||
* /running returns only the in-VRAM instance. Both signals are useful.
|
||||
*/
|
||||
export async function isModelLoaded(model: string): Promise<boolean> {
|
||||
const baseURL = env.OPENAI_BASE_URL;
|
||||
if (!baseURL) return false;
|
||||
|
||||
try {
|
||||
const url = `${llamaSwapRoot(baseURL)}/running`;
|
||||
const response = await fetch(url, {
|
||||
signal: AbortSignal.timeout(5_000)
|
||||
});
|
||||
if (!response.ok) return false;
|
||||
const data = (await response.json()) as { running?: RunningModelEntry[] };
|
||||
const running = data.running ?? [];
|
||||
return running.some((m) => m.model === model && (m.state ?? 'ready') === 'ready');
|
||||
} catch (e) {
|
||||
logError('[LLM] isModelLoaded check failed', e);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if a specific model is available in the OpenAI-compatible API
|
||||
* @param model - The model ID to check for availability
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
import { createLLM, checkModelAvailability } from './llm';
|
||||
import { createLLM, checkModelAvailability, isModelLoaded } from './llm';
|
||||
import { zodResponseFormat } from 'openai/helpers/zod';
|
||||
import { z } from 'zod';
|
||||
import { RECIPE_DETECTION_PROMPT, RECIPE_EXTRACTION_PROMPT } from './prompts/recipe-extraction';
|
||||
import { logError } from './utils/logger';
|
||||
import type { ProgressCallback } from './extraction';
|
||||
|
||||
const RecipeSchema = z.object({
|
||||
name: z.string(),
|
||||
@@ -144,11 +145,33 @@ export async function parseRecipe(text: string): Promise<Recipe> {
|
||||
}
|
||||
|
||||
/**
|
||||
* Complete workflow: detect recipe and parse if found
|
||||
* Complete workflow: detect recipe and parse if found.
|
||||
*
|
||||
* Emits a `model_loading` progress event (if a callback is supplied) when the
|
||||
* configured llama-swap model is not yet warm — the first request after idle
|
||||
* blocks for several seconds while llama-swap loads the model into VRAM.
|
||||
*
|
||||
* @param text - The text to analyze
|
||||
* @param progressCallback - Optional callback for surfacing cold-load state
|
||||
* @returns Parsed recipe object if detected, null otherwise
|
||||
*/
|
||||
export async function extractRecipe(text: string): Promise<Recipe | null> {
|
||||
export async function extractRecipe(
|
||||
text: string,
|
||||
progressCallback?: ProgressCallback
|
||||
): Promise<Recipe | null> {
|
||||
if (progressCallback) {
|
||||
const { model } = createLLM();
|
||||
const warm = await isModelLoaded(model);
|
||||
if (!warm) {
|
||||
progressCallback({
|
||||
type: 'model_loading',
|
||||
message: `Inference server cold — loading ${model} into VRAM (5–30s)...`,
|
||||
data: { model },
|
||||
timestamp: new Date().toISOString()
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
const isRecipe = await detectRecipe(text);
|
||||
|
||||
if (!isRecipe) {
|
||||
|
||||
@@ -12,15 +12,30 @@
|
||||
*/
|
||||
|
||||
import { queueManager } from './QueueManager';
|
||||
import { extractTextAndThumbnail } from '$lib/server/extraction';
|
||||
import { extractTextAndThumbnail as extractWithPlaywright } from '$lib/server/extraction';
|
||||
import { extractTextAndThumbnail as extractWithYtDlp } from '$lib/server/instagram-extractor';
|
||||
import { extractRecipe } from '$lib/server/parser';
|
||||
import { uploadRecipeWithIngredientsDTO, uploadRecipeImage } from '$lib/server/tandoor';
|
||||
import { pushNotificationService } from '$lib/server/notifications/PushNotificationService';
|
||||
import { queueConfig } from './config';
|
||||
import { logError } from '../utils/logger';
|
||||
import type { ProgressEvent } from '$lib/server/extraction';
|
||||
import { env } from '$env/dynamic/private';
|
||||
import type { ProgressEvent, ExtractedContent, ProgressCallback } from '$lib/server/extraction';
|
||||
import type { QueueItem } from './types';
|
||||
|
||||
// Feature flag: pick which Instagram extractor backend to invoke.
|
||||
// Default to yt-dlp; set EXTRACTOR_BACKEND=playwright to fall back to the
|
||||
// legacy stealth scraper while we verify the new path.
|
||||
const extractTextAndThumbnail = (
|
||||
url: string,
|
||||
cb?: ProgressCallback
|
||||
): Promise<ExtractedContent> => {
|
||||
const backend = (env.EXTRACTOR_BACKEND ?? 'ytdlp').toLowerCase();
|
||||
return backend === 'playwright'
|
||||
? extractWithPlaywright(url, cb)
|
||||
: extractWithYtDlp(url, cb);
|
||||
};
|
||||
|
||||
/**
|
||||
* Queue processor with configurable concurrency
|
||||
*
|
||||
@@ -250,7 +265,9 @@ export class QueueProcessor {
|
||||
});
|
||||
|
||||
console.log(`[QueueProcessor] Parsing recipe: ${item.id}`);
|
||||
const recipe = await extractRecipe(item.extractedText);
|
||||
const recipe = await extractRecipe(item.extractedText, (event) => {
|
||||
queueManager.addProgressEvent(item.id, event);
|
||||
});
|
||||
|
||||
if (!recipe) {
|
||||
throw new Error('Failed to parse recipe from extracted text');
|
||||
|
||||
Reference in New Issue
Block a user