feat: auto Playwright fallback when yt-dlp caption is truncated

Instagram truncates long captions server-side (the text ends with '…'). When yt-dlp returns a truncated caption, automatically fall back to the Playwright extractor, which runs JS in a real browser and can click the 'more' button to expand the full caption. The fallback degrades gracefully: if Playwright fails, the truncated text is still used rather than failing the whole extraction.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-05-12 23:46:24 +02:00
parent 22280d5536
commit 8c25bce400
1 changed files with 41 additions and 6 deletions
--- a/src/lib/server/queue/QueueProcessor.ts
+++ b/src/lib/server/queue/QueueProcessor.ts
@@ -23,17 +23,52 @@ import { env } from '$env/dynamic/private';
 import type { ProgressEvent, ExtractedContent, ProgressCallback } from '$lib/server/extraction';
 import type { QueueItem } from './types';

+// Returns true when a caption looks truncated by Instagram server-side: after
+// trimming trailing whitespace it ends with the Unicode ellipsis ("…"), the
+// ellipsis followed by a period ("….") or three ASCII dots ("...").
+function isCaptionTruncated(text: string): boolean {
+	const t = text.trimEnd(); // trailing whitespace must not hide the marker
+	return t.endsWith('…') || t.endsWith('….') || t.endsWith('...'); // NOTE(review): a caption the author ended with "..." also matches
+}
+
 // Feature flag: pick which Instagram extractor backend to invoke.
-// Default to yt-dlp; set EXTRACTOR_BACKEND=playwright to fall back to the
-// legacy stealth scraper while we verify the new path.
-const extractTextAndThumbnail = (
+// Default to yt-dlp (fast, no browser); set EXTRACTOR_BACKEND=playwright to
+// always use the stealth browser scraper.
+// When yt-dlp is the primary backend, a truncated caption (ending with "…")
+// automatically triggers a Playwright fallback to get the full text.
+const extractTextAndThumbnail = async (
 	url: string,
 	cb?: ProgressCallback
 ): Promise<ExtractedContent> => {
 	const backend = (env.EXTRACTOR_BACKEND ?? 'ytdlp').toLowerCase();
-	return backend === 'playwright'
-		? extractWithPlaywright(url, cb)
-		: extractWithYtDlp(url, cb);
+	if (backend === 'playwright') { // explicit opt-in: yt-dlp is never invoked on this path
+		return extractWithPlaywright(url, cb);
+	}
+
+	// yt-dlp primary path (any backend value other than 'playwright' lands here)
+	const result = await extractWithYtDlp(url, cb);
+
+	if (isCaptionTruncated(result.bodyText)) { // only a truncated-looking caption triggers the browser retry
+		cb?.({
+			type: 'status',
+			message: 'Caption truncated by Instagram — retrying with browser to get full text…',
+			timestamp: new Date().toISOString()
+		});
+		try {
+			const full = await extractWithPlaywright(url, cb); // the same progress callback is passed through
+			if (full.bodyText.length > result.bodyText.length) { // accept the browser result only when its text is strictly longer
+				return full;
+			}
+		} catch (e) { // NOTE(review): e is neither logged nor reported — confirm dropping it is intentional
+			cb?.({
+				type: 'status',
+				message: 'Browser fallback failed — continuing with truncated caption',
+				timestamp: new Date().toISOString()
+			});
+		}
+	}
+
+	return result; // yt-dlp result: not truncated, fallback threw, or fallback text was not longer
 };

 /**