feat: use Playwright for caption, yt-dlp for thumbnail only
Some checks failed
Build & Push Docker Image / test-and-build (push) Failing after 33s

Always extract the full caption via Playwright (browser sees the
untruncated text). yt-dlp runs in parallel only to get the thumbnail
CDN URL quickly; its result for the description is discarded.

This eliminates the truncation problem at the source without needing
a fallback heuristic.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
Giancarmine Salucci
2026-05-13 01:31:33 +02:00
parent 958353d15a
commit c9f5300272
2 changed files with 49 additions and 44 deletions

34
playwright.config.ts.bak Normal file
View File

@@ -0,0 +1,34 @@
import { defineConfig, devices } from '@playwright/test';
// True when the CI environment variable is set to a non-empty value.
const isCI = Boolean(process.env.CI);

// Single source for the address used by both `use.baseURL` and `webServer.url`.
const devServerUrl = 'http://localhost:5173';

/**
 * End-to-end test configuration for Playwright.
 *
 * Picks up every `*.e2e.spec.ts` file under `./src/tests` and runs it in the
 * `chromium` project. On CI, `.only` is forbidden, retries are set to 2 and
 * workers to 1, and an existing server is not reused.
 *
 * See https://playwright.dev/docs/test-configuration
 */
export default defineConfig({
  testDir: './src/tests',
  testMatch: '**/*.e2e.spec.ts',
  fullyParallel: true,
  forbidOnly: isCI,
  retries: isCI ? 2 : 0,
  workers: isCI ? 1 : undefined,
  reporter: 'list',
  use: {
    baseURL: devServerUrl,
    trace: 'on-first-retry'
  },
  projects: [
    {
      name: 'chromium',
      use: { ...devices['Desktop Chrome'] }
    }
  ],
  webServer: {
    command: 'npm run dev',
    url: devServerUrl,
    reuseExistingServer: !isCI,
    timeout: 120000
  }
});

View File

@@ -19,63 +19,34 @@ import { uploadRecipeWithIngredientsDTO, uploadRecipeImage } from '$lib/server/t
import { pushNotificationService } from '$lib/server/notifications/PushNotificationService';
import { queueConfig } from './config';
import { logError } from '../utils/logger';
import { env } from '$env/dynamic/private';
import type { ProgressEvent, ExtractedContent, ProgressCallback } from '$lib/server/extraction';
import type { QueueItem } from './types';
/**
 * Extract the caption via Playwright and the thumbnail via yt-dlp.
 *
 * Both extractors are started concurrently and both are awaited. The caption
 * always comes from the Playwright result; the yt-dlp result contributes only
 * its thumbnail, and any other field it returns is ignored.
 *
 * @param url - Address of the post to extract from.
 * @param cb - Optional progress callback; forwarded to the Playwright
 *   extractor only.
 * @returns The Playwright caption as `bodyText`, plus the yt-dlp thumbnail
 *   when yt-dlp succeeded and returned one, otherwise the Playwright thumbnail.
 * @throws Re-throws the Playwright rejection reason. A yt-dlp failure is
 *   non-fatal and never propagates.
 */
async function extractTextAndThumbnail(
  url: string,
  cb?: ProgressCallback
): Promise<ExtractedContent> {
  // allSettled (not all) so a yt-dlp failure cannot reject the caption path.
  const [ytdlpResult, playwrightResult] = await Promise.allSettled([
    extractWithYtDlp(url),
    extractWithPlaywright(url, cb)
  ]);

  // The caption is mandatory: without a Playwright result there is nothing to return.
  if (playwrightResult.status === 'rejected') {
    throw playwrightResult.reason;
  }

  // Prefer yt-dlp thumbnail; fall back to whatever Playwright captured.
  const thumbnail =
    ytdlpResult.status === 'fulfilled' && ytdlpResult.value.thumbnail
      ? ytdlpResult.value.thumbnail
      : playwrightResult.value.thumbnail;

  return { bodyText: playwrightResult.value.bodyText, thumbnail };
}
/**