// Extractor selection for src/lib/server/queue/QueueProcessor.ts.
// `env` comes from '$env/dynamic/private'; the `ExtractedContent` and
// `ProgressCallback` types come from '$lib/server/extraction'.
// `extractWithYtDlp` and `extractWithPlaywright` are not defined in this
// section — presumably imported earlier in the file.

/**
 * Reports whether a caption returned by yt-dlp looks like it was truncated
 * server-side by Instagram.
 *
 * Trailing whitespace is ignored. A caption counts as truncated when it ends
 * with the Unicode ellipsis character ("…"), with that character followed by
 * a period ("…."), or with three ASCII dots ("...").
 *
 * @param text - Caption text. A missing (non-string) caption is never
 *   considered truncated, so callers do not crash on absent body text.
 * @returns `true` when the caption ends with one of the truncation markers.
 */
function isCaptionTruncated(text: string | null | undefined): boolean {
  if (typeof text !== 'string') {
    return false;
  }
  const trimmed = text.trimEnd();
  return trimmed.endsWith('…') || trimmed.endsWith('….') || trimmed.endsWith('...');
}

/**
 * Feature flag: pick which Instagram extractor backend to invoke.
 *
 * Reads `EXTRACTOR_BACKEND` (case-insensitive, default `ytdlp`):
 * - `playwright` always uses the browser scraper.
 * - anything else uses yt-dlp first. When the yt-dlp caption looks truncated
 *   (see `isCaptionTruncated`), a status event is emitted and the browser
 *   scraper is tried as a fallback.
 *
 * The fallback result is used only when its body text is strictly longer than
 * the yt-dlp one. If the fallback throws, the error is logged, a status event
 * is emitted and the yt-dlp result is returned unchanged.
 *
 * @param url - URL of the post to extract.
 * @param cb - Optional progress callback; also forwarded to the extractors.
 * @returns The extracted content from whichever backend produced the best text.
 * @throws Rejects with whatever the primary extractor rejects with; only the
 *   fallback attempt is guarded.
 */
const extractTextAndThumbnail = async (
  url: string,
  cb?: ProgressCallback
): Promise<ExtractedContent> => {
  const backend = (env.EXTRACTOR_BACKEND ?? 'ytdlp').toLowerCase();
  if (backend === 'playwright') {
    return extractWithPlaywright(url, cb);
  }

  // yt-dlp primary path
  const result = await extractWithYtDlp(url, cb);
  if (!isCaptionTruncated(result.bodyText)) {
    return result;
  }

  cb?.({
    type: 'status',
    message: 'Caption truncated by Instagram — retrying with browser to get full text…',
    timestamp: new Date().toISOString()
  });

  try {
    const full = await extractWithPlaywright(url, cb);
    // Only trust the browser result when it actually yields more text.
    if (full.bodyText.length > result.bodyText.length) {
      return full;
    }
  } catch (err: unknown) {
    // Best-effort fallback: keep the truncated caption, but do not lose the cause.
    console.warn('Playwright fallback failed; keeping truncated yt-dlp caption', err);
    cb?.({
      type: 'status',
      message: 'Browser fallback failed — continuing with truncated caption',
      timestamp: new Date().toISOString()
    });
  }

  return result;
};