feat: auto Playwright fallback when yt-dlp caption is truncated
All checks were successful
Build & Push Docker Image / test-and-build (push) Successful in 1m2s

Instagram truncates long captions server-side; the truncated text ends with '…'.
When yt-dlp returns a truncated caption, automatically fall back to
the Playwright extractor which runs JS in a real browser and can
click the 'more' button to expand the full caption.

Falls back gracefully: if Playwright fails, the truncated text is
still used rather than failing the whole extraction.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
Giancarmine Salucci
2026-05-12 23:46:24 +02:00
parent 22280d5536
commit 8c25bce400

View File

@@ -23,17 +23,52 @@ import { env } from '$env/dynamic/private';
import type { ProgressEvent, ExtractedContent, ProgressCallback } from '$lib/server/extraction';
import type { QueueItem } from './types';
// Returns true when yt-dlp returns a caption that Instagram truncated server-side.
// Truncated captions end with the Unicode ellipsis character (…), optionally
// followed by a period: "…." or just "…".
function isCaptionTruncated(text: string): boolean {
const t = text.trimEnd();
return t.endsWith('…') || t.endsWith('….') || t.endsWith('...');
}
// Feature flag: pick which Instagram extractor backend to invoke.
// Default to yt-dlp (fast, no browser); set EXTRACTOR_BACKEND=playwright to
// always use the stealth browser scraper. Any other value selects yt-dlp.
// When yt-dlp is the primary backend, a caption that isCaptionTruncated()
// flags as truncated automatically triggers a Playwright fallback to get the
// full text.
//
// url: address of the post to extract.
// cb:  optional progress callback; receives 'status' events when the fallback
//      starts and when it fails.
//
// Resolves with the Playwright result only when its bodyText is strictly
// longer than the yt-dlp one; otherwise (including when Playwright throws)
// resolves with the yt-dlp result. Errors from yt-dlp itself, or from
// Playwright when it is the configured backend, propagate to the caller.
const extractTextAndThumbnail = async (
	url: string,
	cb?: ProgressCallback
): Promise<ExtractedContent> => {
	const backend = (env.EXTRACTOR_BACKEND ?? 'ytdlp').toLowerCase();
	if (backend === 'playwright') {
		return extractWithPlaywright(url, cb);
	}

	// yt-dlp primary path
	const result = await extractWithYtDlp(url, cb);
	if (!isCaptionTruncated(result.bodyText)) {
		return result;
	}

	cb?.({
		type: 'status',
		message: 'Caption truncated by Instagram — retrying with browser to get full text…',
		timestamp: new Date().toISOString()
	});

	try {
		const full = await extractWithPlaywright(url, cb);
		// Only switch to the browser result when it actually recovered more text.
		if (full.bodyText.length > result.bodyText.length) {
			return full;
		}
	} catch (e: unknown) {
		// Best-effort fallback: keep the truncated caption, but record why the
		// browser path failed so the failure is not silent when no callback is set.
		console.warn(
			'Playwright fallback failed; using truncated caption:',
			e instanceof Error ? e.message : e
		);
		cb?.({
			type: 'status',
			message: 'Browser fallback failed — continuing with truncated caption',
			timestamp: new Date().toISOString()
		});
	}

	return result;
};
/**