feat: Playwright fallback for truncated Instagram captions
All checks were successful
Build & Push Docker Image / test-and-build (push) Successful in 1m1s

When yt-dlp returns a caption ending with the truncation marker '….'
(GraphQL API caps the text), automatically retry with the Playwright
extractor, which intercepts the full caption from live GraphQL network
traffic.

Falls back gracefully to the partial yt-dlp caption if Playwright fails,
or if the caption it returns is empty or still truncated.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
Giancarmine Salucci
2026-05-13 00:17:36 +02:00
parent 10c4f78ace
commit 958353d15a

View File

@@ -23,18 +23,60 @@ import { env } from '$env/dynamic/private';
import type { ProgressEvent, ExtractedContent, ProgressCallback } from '$lib/server/extraction';
import type { QueueItem } from './types';
// Feature flag: pick which Instagram extractor backend to invoke.
// Default to yt-dlp; set EXTRACTOR_BACKEND=playwright to fall back to the
// legacy stealth scraper while we verify the new path.
const extractTextAndThumbnail = (
/**
* Instagram caption truncation marker: yt-dlp's GraphQL path returns a
* caption ending with "…." (U+2026 HORIZONTAL ELLIPSIS followed by ".")
* when Instagram cut it short.
* When we detect this, fall back to the Playwright extractor which intercepts
* the full caption from live network traffic (requires a valid auth.json).
*/
const TRUNCATION_SUFFIX = '\u2026.';
/**
* Extract the caption text and thumbnail for an Instagram URL.
*
* Backend selection is driven by `env.EXTRACTOR_BACKEND` (compared
* case-insensitively, defaulting to 'ytdlp' when unset):
* - 'playwright': delegate straight to `extractWithPlaywright`.
* - anything else: run `extractWithYtDlp` first; if its `bodyText` ends with
*   TRUNCATION_SUFFIX, retry once with `extractWithPlaywright`.
*
* The Playwright result is used only when its `bodyText` is non-empty and does
* not itself end with TRUNCATION_SUFFIX. In every other case (including a
* thrown error during the retry) the original yt-dlp result is returned.
* Errors thrown by `extractWithYtDlp` itself are not caught here.
*
* NOTE(review): this listing appears to be a diff view with indentation
* stripped; the arrow-function header, the `=> {` return-type line and the
* ternary `return` look like the removed pre-change lines — confirm against
* the checked-out file.
*
* @param url - URL of the post to extract.
* @param cb - Optional progress callback; receives 'status' events describing
*   the fallback attempt and its outcome.
* @returns The extracted content; on a successful fallback, an object holding
*   only `bodyText` and `thumbnail`.
*/
async function extractTextAndThumbnail(
url: string,
cb?: ProgressCallback
): Promise<ExtractedContent> => {
): Promise<ExtractedContent> {
const backend = (env.EXTRACTOR_BACKEND ?? 'ytdlp').toLowerCase();
return backend === 'playwright'
? extractWithPlaywright(url, cb)
: extractWithYtDlp(url, cb);
// Explicit opt-in: skip yt-dlp entirely and use the browser extractor.
if (backend === 'playwright') {
return extractWithPlaywright(url, cb);
}
// Primary path: yt-dlp (fast, no browser)
const result = await extractWithYtDlp(url, cb);
// Truncation detected → fall back to Playwright for the full caption
if (result.bodyText.endsWith(TRUNCATION_SUFFIX)) {
cb?.({
type: 'status',
message: 'Caption truncated by Instagram API — retrying with browser extraction...',
timestamp: new Date().toISOString()
});
try {
const playwrightResult = await extractWithPlaywright(url, cb);
// Accept the browser result only if it produced a non-empty caption that is
// not itself truncated; otherwise fall through to the yt-dlp result below
// (no status event is emitted for that case).
if (playwrightResult.bodyText && !playwrightResult.bodyText.endsWith(TRUNCATION_SUFFIX)) {
cb?.({
type: 'status',
message: `Full caption retrieved via browser (${playwrightResult.bodyText.length} chars)`,
timestamp: new Date().toISOString()
});
// Use Playwright's thumbnail; fall back to the yt-dlp one only when
// Playwright's is null/undefined.
return {
bodyText: playwrightResult.bodyText,
thumbnail: playwrightResult.thumbnail ?? result.thumbnail
};
}
} catch (e) {
// Playwright fallback failed — report it via the callback and surface the
// original (partial) yt-dlp result via the final return below.
cb?.({
type: 'status',
message: `Browser fallback failed (${e instanceof Error ? e.message : String(e)}), using partial caption`,
timestamp: new Date().toISOString()
});
}
}
// Reached when the caption was not truncated, or when the fallback did not
// yield a better caption.
return result;
}
/**
* Queue processor with configurable concurrency