// Reconstructed from a whitespace-collapsed diff of
// src/lib/server/queue/QueueProcessor.ts (hunk starting at line 23).
import { env } from '$env/dynamic/private';
import type { ProgressEvent, ExtractedContent, ProgressCallback } from '$lib/server/extraction';
import type { QueueItem } from './types';

/**
 * Instagram caption truncation marker: yt-dlp's GraphQL path returns a
 * caption ending with "…." (U+2026 + ".") when Instagram cut it short.
 * When we detect this, fall back to the Playwright extractor which intercepts
 * the full caption from live network traffic (requires a valid auth.json).
 */
const TRUNCATION_SUFFIX = '\u2026.';

/**
 * Extracts the caption text and thumbnail for a URL, choosing the extractor
 * backend from `env.EXTRACTOR_BACKEND` (compared case-insensitively, defaulting
 * to `'ytdlp'` when unset).
 *
 * - `'playwright'`: delegates straight to `extractWithPlaywright`.
 * - anything else: runs `extractWithYtDlp` first. If its `bodyText` ends with
 *   {@link TRUNCATION_SUFFIX}, `extractWithPlaywright` is tried as a fallback
 *   and its caption is used only when it is non-empty and not itself
 *   truncated. The yt-dlp thumbnail is kept when the Playwright result has a
 *   null/undefined thumbnail.
 *
 * Every step of the fallback is reported through `cb` as a `'status'` event.
 *
 * @param url - URL handed unchanged to the extractor(s).
 * @param cb - Optional progress callback; forwarded to the extractors and used
 *   for the fallback status messages.
 * @returns The Playwright-completed content when the fallback succeeds,
 *   otherwise the yt-dlp result (possibly with a truncated caption).
 * @throws Whatever the primary extractor throws. Errors raised by the
 *   Playwright fallback are caught, reported via `cb`, and do not propagate.
 */
async function extractTextAndThumbnail(
  url: string,
  cb?: ProgressCallback
): Promise<ExtractedContent> {
  // Emits a 'status' progress event stamped with the current time.
  // Does nothing when no callback was supplied.
  const notify = (message: string): void => {
    cb?.({ type: 'status', message, timestamp: new Date().toISOString() });
  };

  const backend = (env.EXTRACTOR_BACKEND ?? 'ytdlp').toLowerCase();
  if (backend === 'playwright') {
    return extractWithPlaywright(url, cb);
  }

  // Primary path: yt-dlp (fast, no browser)
  const result = await extractWithYtDlp(url, cb);
  if (!result.bodyText.endsWith(TRUNCATION_SUFFIX)) {
    return result;
  }

  // Truncation detected → fall back to Playwright for the full caption
  notify('Caption truncated by Instagram API — retrying with browser extraction...');
  try {
    const playwrightResult = await extractWithPlaywright(url, cb);
    if (playwrightResult.bodyText && !playwrightResult.bodyText.endsWith(TRUNCATION_SUFFIX)) {
      notify(`Full caption retrieved via browser (${playwrightResult.bodyText.length} chars)`);
      // Prefer yt-dlp thumbnail if Playwright didn't get one
      return {
        bodyText: playwrightResult.bodyText,
        thumbnail: playwrightResult.thumbnail ?? result.thumbnail
      };
    }
    // The fallback ran but produced nothing better; say so instead of
    // leaving the "retrying" status as the last word.
    notify('Browser extraction returned no complete caption, using partial caption');
  } catch (e: unknown) {
    // Playwright fallback failed — surface the original yt-dlp result
    notify(
      `Browser fallback failed (${e instanceof Error ? e.message : String(e)}), using partial caption`
    );
  }

  return result;
}

/**
 * Queue processor with configurable concurrency