feat: Playwright fallback for truncated Instagram captions
All checks were successful
Build & Push Docker Image / test-and-build (push) Successful in 1m1s
All checks were successful
Build & Push Docker Image / test-and-build (push) Successful in 1m1s
When yt-dlp returns a caption ending with the truncation marker '….' (U+2026 followed by a period; Instagram's GraphQL API caps the text), automatically retry with the Playwright extractor, which intercepts the full caption from live GraphQL network traffic. If Playwright fails, fall back gracefully to the partial yt-dlp caption.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -23,18 +23,60 @@ import { env } from '$env/dynamic/private';
|
||||
import type { ProgressEvent, ExtractedContent, ProgressCallback } from '$lib/server/extraction';
|
||||
import type { QueueItem } from './types';
|
||||
|
||||
// Feature flag: pick which Instagram extractor backend to invoke.
|
||||
// Default to yt-dlp; set EXTRACTOR_BACKEND=playwright to always use the
|
||||
// legacy stealth scraper while we verify the new path.
|
||||
const extractTextAndThumbnail = (
|
||||
/**
|
||||
* Instagram caption truncation marker: yt-dlp's GraphQL path returns a
|
||||
* caption ending with "…." (U+2026 + ".") when Instagram cut it short.
|
||||
* When we detect this, fall back to the Playwright extractor which intercepts
|
||||
* the full caption from live network traffic (requires a valid auth.json).
|
||||
*/
|
||||
const TRUNCATION_SUFFIX = '\u2026.';
|
||||
|
||||
async function extractTextAndThumbnail(
|
||||
url: string,
|
||||
cb?: ProgressCallback
|
||||
): Promise<ExtractedContent> => {
|
||||
): Promise<ExtractedContent> {
|
||||
const backend = (env.EXTRACTOR_BACKEND ?? 'ytdlp').toLowerCase();
|
||||
return backend === 'playwright'
|
||||
? extractWithPlaywright(url, cb)
|
||||
: extractWithYtDlp(url, cb);
|
||||
};
|
||||
|
||||
if (backend === 'playwright') {
|
||||
return extractWithPlaywright(url, cb);
|
||||
}
|
||||
|
||||
// Primary path: yt-dlp (fast, no browser)
|
||||
const result = await extractWithYtDlp(url, cb);
|
||||
|
||||
// Truncation detected → fall back to Playwright for the full caption
|
||||
if (result.bodyText.endsWith(TRUNCATION_SUFFIX)) {
|
||||
cb?.({
|
||||
type: 'status',
|
||||
message: 'Caption truncated by Instagram API — retrying with browser extraction...',
|
||||
timestamp: new Date().toISOString()
|
||||
});
|
||||
try {
|
||||
const playwrightResult = await extractWithPlaywright(url, cb);
|
||||
if (playwrightResult.bodyText && !playwrightResult.bodyText.endsWith(TRUNCATION_SUFFIX)) {
|
||||
cb?.({
|
||||
type: 'status',
|
||||
message: `Full caption retrieved via browser (${playwrightResult.bodyText.length} chars)`,
|
||||
timestamp: new Date().toISOString()
|
||||
});
|
||||
// Prefer yt-dlp thumbnail if Playwright didn't get one
|
||||
return {
|
||||
bodyText: playwrightResult.bodyText,
|
||||
thumbnail: playwrightResult.thumbnail ?? result.thumbnail
|
||||
};
|
||||
}
|
||||
} catch (e) {
|
||||
// Playwright fallback failed — surface the original yt-dlp result
|
||||
cb?.({
|
||||
type: 'status',
|
||||
message: `Browser fallback failed (${e instanceof Error ? e.message : String(e)}), using partial caption`,
|
||||
timestamp: new Date().toISOString()
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Queue processor with configurable concurrency
|
||||
|
||||
Reference in New Issue
Block a user