From c9f5300272c73cdfb635924dfb0b4758c19c3540 Mon Sep 17 00:00:00 2001 From: Giancarmine Salucci Date: Wed, 13 May 2026 01:31:33 +0200 Subject: [PATCH] feat: use Playwright for caption, yt-dlp for thumbnail only Always extract the full caption via Playwright (browser sees the untruncated text). yt-dlp runs in parallel only to get the thumbnail CDN URL quickly; its result for the description is discarded. This eliminates the truncation problem at the source without needing a fallback heuristic. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- playwright.config.ts.bak | 34 +++++++++++++++ src/lib/server/queue/QueueProcessor.ts | 59 +++++++------------------- 2 files changed, 49 insertions(+), 44 deletions(-) create mode 100644 playwright.config.ts.bak diff --git a/playwright.config.ts.bak b/playwright.config.ts.bak new file mode 100644 index 0000000..f72915b --- /dev/null +++ b/playwright.config.ts.bak @@ -0,0 +1,34 @@ +import { defineConfig, devices } from '@playwright/test'; + +/** + * Playwright configuration for E2E tests + * + * See https://playwright.dev/docs/test-configuration + */ +export default defineConfig({ + testDir: './src/tests', + testMatch: '**/*.e2e.spec.ts', + fullyParallel: true, + forbidOnly: !!process.env.CI, + retries: process.env.CI ? 2 : 0, + workers: process.env.CI ? 
1 : undefined, + reporter: 'list', + use: { + baseURL: 'http://localhost:5173', + trace: 'on-first-retry' + }, + + projects: [ + { + name: 'chromium', + use: { ...devices['Desktop Chrome'] } + } + ], + + webServer: { + command: 'npm run dev', + url: 'http://localhost:5173', + reuseExistingServer: !process.env.CI, + timeout: 120000 + } +}); diff --git a/src/lib/server/queue/QueueProcessor.ts b/src/lib/server/queue/QueueProcessor.ts index c19dff3..e025538 100644 --- a/src/lib/server/queue/QueueProcessor.ts +++ b/src/lib/server/queue/QueueProcessor.ts @@ -19,63 +19,34 @@ import { uploadRecipeWithIngredientsDTO, uploadRecipeImage } from '$lib/server/t import { pushNotificationService } from '$lib/server/notifications/PushNotificationService'; import { queueConfig } from './config'; import { logError } from '../utils/logger'; -import { env } from '$env/dynamic/private'; import type { ProgressEvent, ExtractedContent, ProgressCallback } from '$lib/server/extraction'; import type { QueueItem } from './types'; /** - * Instagram caption truncation marker: yt-dlp's GraphQL path returns a - * caption ending with "…." (U+2026 + ".") when Instagram cut it short. - * When we detect this, fall back to the Playwright extractor which intercepts - * the full caption from live network traffic (requires a valid auth.json). + * Extract caption via Playwright (full, untruncated) and thumbnail via yt-dlp + * (fast, reliable CDN URL). Both run in parallel; yt-dlp failure is non-fatal. */ -const TRUNCATION_SUFFIX = '\u2026.'; - async function extractTextAndThumbnail( url: string, cb?: ProgressCallback ): Promise<ExtractedContent> { - const backend = (env.EXTRACTOR_BACKEND ?? 
'ytdlp').toLowerCase(); + // Run Playwright (caption) and yt-dlp (thumbnail) concurrently + const [ytdlpResult, playwrightResult] = await Promise.allSettled([ + extractWithYtDlp(url), + extractWithPlaywright(url, cb) + ]); - if (backend === 'playwright') { - return extractWithPlaywright(url, cb); + if (playwrightResult.status === 'rejected') { + throw playwrightResult.reason; } - // Primary path: yt-dlp (fast, no browser) - const result = await extractWithYtDlp(url, cb); + // Prefer yt-dlp thumbnail; fall back to whatever Playwright captured + const thumbnail = + ytdlpResult.status === 'fulfilled' && ytdlpResult.value.thumbnail + ? ytdlpResult.value.thumbnail + : playwrightResult.value.thumbnail; - // Truncation detected → fall back to Playwright for the full caption - if (result.bodyText.endsWith(TRUNCATION_SUFFIX)) { - cb?.({ - type: 'status', - message: 'Caption truncated by Instagram API — retrying with browser extraction...', - timestamp: new Date().toISOString() - }); - try { - const playwrightResult = await extractWithPlaywright(url, cb); - if (playwrightResult.bodyText && !playwrightResult.bodyText.endsWith(TRUNCATION_SUFFIX)) { - cb?.({ - type: 'status', - message: `Full caption retrieved via browser (${playwrightResult.bodyText.length} chars)`, - timestamp: new Date().toISOString() - }); - // Prefer yt-dlp thumbnail if Playwright didn't get one - return { - bodyText: playwrightResult.bodyText, - thumbnail: playwrightResult.thumbnail ?? result.thumbnail - }; - } - } catch (e) { - // Playwright fallback failed — surface the original yt-dlp result - cb?.({ - type: 'status', - message: `Browser fallback failed (${e instanceof Error ? e.message : String(e)}), using partial caption`, - timestamp: new Date().toISOString() - }); - } - } - - return result; + return { bodyText: playwrightResult.value.bodyText, thumbnail }; } /**