feat: auto Playwright fallback when yt-dlp caption is truncated
All checks were successful
Build & Push Docker Image / test-and-build (push) Successful in 1m2s
All checks were successful
Build & Push Docker Image / test-and-build (push) Successful in 1m2s
Instagram truncates long captions server-side (ends with '…'). When yt-dlp returns a truncated caption, automatically fall back to the Playwright extractor which runs JS in a real browser and can click the 'more' button to expand the full caption. Falls back gracefully: if Playwright fails, the truncated text is still used rather than failing the whole extraction. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -23,17 +23,52 @@ import { env } from '$env/dynamic/private';
|
||||
import type { ProgressEvent, ExtractedContent, ProgressCallback } from '$lib/server/extraction';
|
||||
import type { QueueItem } from './types';
|
||||
|
||||
/**
 * Reports whether a caption looks like it was cut short by Instagram
 * (as returned by yt-dlp).
 *
 * Trailing whitespace is ignored. The caption counts as truncated when the
 * remaining text ends with any of:
 *   - the Unicode ellipsis character ("…"),
 *   - the Unicode ellipsis followed by a period ("…."),
 *   - three ASCII dots ("...").
 *
 * @param text - caption text to inspect
 * @returns true when the caption ends with one of the truncation markers
 */
function isCaptionTruncated(text: string): boolean {
	const truncationSuffixes = ['…', '….', '...'];
	const caption = text.trimEnd();
	return truncationSuffixes.some((suffix) => caption.endsWith(suffix));
}
|
||||
|
||||
/**
 * Runs the Instagram extractor backend selected by the EXTRACTOR_BACKEND
 * environment variable.
 *
 * - `playwright` (case-insensitive): always use the stealth browser scraper
 *   via extractWithPlaywright.
 * - anything else, including unset (defaults to `ytdlp`): use
 *   extractWithYtDlp (fast, no browser). If the caption it returns looks
 *   truncated (see isCaptionTruncated), retry once with
 *   extractWithPlaywright and keep that result only when its body text is
 *   strictly longer than the yt-dlp one.
 *
 * The browser fallback is best-effort: if it throws, a status event is sent
 * to `cb` and the truncated yt-dlp result is returned instead of failing the
 * whole extraction.
 *
 * @param url - URL of the post to extract
 * @param cb - optional progress callback; forwarded to the backends and also
 *   sent `status` events describing the fallback
 * @returns the extracted content from whichever backend produced the result
 *   that was kept
 */
const extractTextAndThumbnail = async (
	url: string,
	cb?: ProgressCallback
): Promise<ExtractedContent> => {
	const backend = (env.EXTRACTOR_BACKEND ?? 'ytdlp').toLowerCase();

	if (backend === 'playwright') {
		return extractWithPlaywright(url, cb);
	}

	// yt-dlp primary path
	const result = await extractWithYtDlp(url, cb);

	// Complete caption: nothing to recover, skip the expensive browser run.
	if (!isCaptionTruncated(result.bodyText)) {
		return result;
	}

	cb?.({
		type: 'status',
		message: 'Caption truncated by Instagram — retrying with browser to get full text…',
		timestamp: new Date().toISOString()
	});

	try {
		const full = await extractWithPlaywright(url, cb);
		// Only prefer the browser result when it actually recovered more text.
		if (full.bodyText.length > result.bodyText.length) {
			return full;
		}
	} catch {
		// Deliberate best-effort: a failed fallback must not fail the
		// extraction, so report it and fall through to the truncated result.
		cb?.({
			type: 'status',
			message: 'Browser fallback failed — continuing with truncated caption',
			timestamp: new Date().toISOString()
		});
	}

	return result;
};
|
||||
|
||||
/**
|
||||
|
||||
Reference in New Issue
Block a user