feat: use Playwright for caption, yt-dlp for thumbnail only
Some checks failed
Build & Push Docker Image / test-and-build (push) Failing after 33s
Some checks failed
Build & Push Docker Image / test-and-build (push) Failing after 33s
Always extract the full caption via Playwright (the browser sees the untruncated text). yt-dlp runs in parallel only to get the thumbnail CDN URL quickly; its result for the description is discarded. This eliminates the truncation problem at the source without needing a fallback heuristic.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
34
playwright.config.ts.bak
Normal file
34
playwright.config.ts.bak
Normal file
@@ -0,0 +1,34 @@
|
||||
import { defineConfig, devices } from '@playwright/test';
|
||||
|
||||
/**
|
||||
* Playwright configuration for E2E tests
|
||||
*
|
||||
* See https://playwright.dev/docs/test-configuration
|
||||
*/
|
||||
export default defineConfig({
|
||||
testDir: './src/tests',
|
||||
testMatch: '**/*.e2e.spec.ts',
|
||||
fullyParallel: true,
|
||||
forbidOnly: !!process.env.CI,
|
||||
retries: process.env.CI ? 2 : 0,
|
||||
workers: process.env.CI ? 1 : undefined,
|
||||
reporter: 'list',
|
||||
use: {
|
||||
baseURL: 'http://localhost:5173',
|
||||
trace: 'on-first-retry'
|
||||
},
|
||||
|
||||
projects: [
|
||||
{
|
||||
name: 'chromium',
|
||||
use: { ...devices['Desktop Chrome'] }
|
||||
}
|
||||
],
|
||||
|
||||
webServer: {
|
||||
command: 'npm run dev',
|
||||
url: 'http://localhost:5173',
|
||||
reuseExistingServer: !process.env.CI,
|
||||
timeout: 120000
|
||||
}
|
||||
});
|
||||
@@ -19,63 +19,34 @@ import { uploadRecipeWithIngredientsDTO, uploadRecipeImage } from '$lib/server/t
|
||||
import { pushNotificationService } from '$lib/server/notifications/PushNotificationService';
|
||||
import { queueConfig } from './config';
|
||||
import { logError } from '../utils/logger';
|
||||
import { env } from '$env/dynamic/private';
|
||||
import type { ProgressEvent, ExtractedContent, ProgressCallback } from '$lib/server/extraction';
|
||||
import type { QueueItem } from './types';
|
||||
|
||||
/**
|
||||
* Instagram caption truncation marker: yt-dlp's GraphQL path returns a
|
||||
* caption ending with "…." (U+2026 + ".") when Instagram cut it short.
|
||||
* When we detect this, fall back to the Playwright extractor which intercepts
|
||||
* the full caption from live network traffic (requires a valid auth.json).
|
||||
* Extract caption via Playwright (full, untruncated) and thumbnail via yt-dlp
|
||||
* (fast, reliable CDN URL). Both run in parallel; yt-dlp failure is non-fatal.
|
||||
*/
|
||||
const TRUNCATION_SUFFIX = '\u2026.';
|
||||
|
||||
async function extractTextAndThumbnail(
|
||||
url: string,
|
||||
cb?: ProgressCallback
|
||||
): Promise<ExtractedContent> {
|
||||
const backend = (env.EXTRACTOR_BACKEND ?? 'ytdlp').toLowerCase();
|
||||
// Run Playwright (caption) and yt-dlp (thumbnail) concurrently
|
||||
const [ytdlpResult, playwrightResult] = await Promise.allSettled([
|
||||
extractWithYtDlp(url),
|
||||
extractWithPlaywright(url, cb)
|
||||
]);
|
||||
|
||||
if (backend === 'playwright') {
|
||||
return extractWithPlaywright(url, cb);
|
||||
if (playwrightResult.status === 'rejected') {
|
||||
throw playwrightResult.reason;
|
||||
}
|
||||
|
||||
// Primary path: yt-dlp (fast, no browser)
|
||||
const result = await extractWithYtDlp(url, cb);
|
||||
// Prefer yt-dlp thumbnail; fall back to whatever Playwright captured
|
||||
const thumbnail =
|
||||
ytdlpResult.status === 'fulfilled' && ytdlpResult.value.thumbnail
|
||||
? ytdlpResult.value.thumbnail
|
||||
: playwrightResult.value.thumbnail;
|
||||
|
||||
// Truncation detected → fall back to Playwright for the full caption
|
||||
if (result.bodyText.endsWith(TRUNCATION_SUFFIX)) {
|
||||
cb?.({
|
||||
type: 'status',
|
||||
message: 'Caption truncated by Instagram API — retrying with browser extraction...',
|
||||
timestamp: new Date().toISOString()
|
||||
});
|
||||
try {
|
||||
const playwrightResult = await extractWithPlaywright(url, cb);
|
||||
if (playwrightResult.bodyText && !playwrightResult.bodyText.endsWith(TRUNCATION_SUFFIX)) {
|
||||
cb?.({
|
||||
type: 'status',
|
||||
message: `Full caption retrieved via browser (${playwrightResult.bodyText.length} chars)`,
|
||||
timestamp: new Date().toISOString()
|
||||
});
|
||||
// Prefer yt-dlp thumbnail if Playwright didn't get one
|
||||
return {
|
||||
bodyText: playwrightResult.bodyText,
|
||||
thumbnail: playwrightResult.thumbnail ?? result.thumbnail
|
||||
};
|
||||
}
|
||||
} catch (e) {
|
||||
// Playwright fallback failed — surface the original yt-dlp result
|
||||
cb?.({
|
||||
type: 'status',
|
||||
message: `Browser fallback failed (${e instanceof Error ? e.message : String(e)}), using partial caption`,
|
||||
timestamp: new Date().toISOString()
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
return { bodyText: playwrightResult.value.bodyText, thumbnail };
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
Reference in New Issue
Block a user