From 958353d15add11bd292db1e7e353085b5411c5b9 Mon Sep 17 00:00:00 2001
From: Giancarmine Salucci
Date: Wed, 13 May 2026 00:17:36 +0200
Subject: [PATCH] feat: Playwright fallback for truncated Instagram captions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When yt-dlp returns a caption ending with the truncation marker '….'
(GraphQL API caps the text), automatically retry with the Playwright
extractor, which intercepts the full caption from live GraphQL network
traffic. Falls back gracefully to the partial yt-dlp caption if
Playwright fails.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 src/lib/server/queue/QueueProcessor.ts | 60 ++++++++++++++++++++++----
 1 file changed, 51 insertions(+), 9 deletions(-)

diff --git a/src/lib/server/queue/QueueProcessor.ts b/src/lib/server/queue/QueueProcessor.ts
index f7d8d2b..c19dff3 100644
--- a/src/lib/server/queue/QueueProcessor.ts
+++ b/src/lib/server/queue/QueueProcessor.ts
@@ -23,18 +23,60 @@ import { env } from '$env/dynamic/private';
 import type { ProgressEvent, ExtractedContent, ProgressCallback } from '$lib/server/extraction';
 import type { QueueItem } from './types';
 
-// Feature flag: pick which Instagram extractor backend to invoke.
-// Default to yt-dlp; set EXTRACTOR_BACKEND=playwright to fall back to the
-// legacy stealth scraper while we verify the new path.
-const extractTextAndThumbnail = (
+/**
+ * Instagram caption truncation marker: yt-dlp's GraphQL path returns a
+ * caption ending with "…." (U+2026 + ".") when Instagram cut it short.
+ * When we detect this, fall back to the Playwright extractor which intercepts
+ * the full caption from live network traffic (requires a valid auth.json).
+ */
+const TRUNCATION_SUFFIX = '\u2026.';
+
+async function extractTextAndThumbnail(
 	url: string,
 	cb?: ProgressCallback
-): Promise<ExtractedContent> => {
+): Promise<ExtractedContent> {
 	const backend = (env.EXTRACTOR_BACKEND ?? 'ytdlp').toLowerCase();
-	return backend === 'playwright'
-		? extractWithPlaywright(url, cb)
-		: extractWithYtDlp(url, cb);
-};
+
+	if (backend === 'playwright') {
+		return extractWithPlaywright(url, cb);
+	}
+
+	// Primary path: yt-dlp (fast, no browser)
+	const result = await extractWithYtDlp(url, cb);
+
+	// Truncation detected → fall back to Playwright for the full caption
+	if (result.bodyText.endsWith(TRUNCATION_SUFFIX)) {
+		cb?.({
+			type: 'status',
+			message: 'Caption truncated by Instagram API — retrying with browser extraction...',
+			timestamp: new Date().toISOString()
+		});
+		try {
+			const playwrightResult = await extractWithPlaywright(url, cb);
+			if (playwrightResult.bodyText && !playwrightResult.bodyText.endsWith(TRUNCATION_SUFFIX)) {
+				cb?.({
+					type: 'status',
+					message: `Full caption retrieved via browser (${playwrightResult.bodyText.length} chars)`,
+					timestamp: new Date().toISOString()
+				});
+				// Prefer yt-dlp thumbnail if Playwright didn't get one
+				return {
+					bodyText: playwrightResult.bodyText,
+					thumbnail: playwrightResult.thumbnail ?? result.thumbnail
+				};
+			}
+		} catch (e) {
+			// Playwright fallback failed — surface the original yt-dlp result
+			cb?.({
+				type: 'status',
+				message: `Browser fallback failed (${e instanceof Error ? e.message : String(e)}), using partial caption`,
+				timestamp: new Date().toISOString()
+			});
+		}
+	}
+
+	return result;
+}
 
 /**
  * Queue processor with configurable concurrency