feat: use Playwright for caption, yt-dlp for thumbnail only
Some checks failed
Build & Push Docker Image / test-and-build (push) Failing after 33s
Some checks failed
Build & Push Docker Image / test-and-build (push) Failing after 33s
Always extract the full caption via Playwright (the browser sees the untruncated text). yt-dlp runs in parallel only to get the thumbnail CDN URL quickly; its result for the description is discarded. This eliminates the truncation problem at the source without needing a fallback heuristic.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
34
playwright.config.ts.bak
Normal file
34
playwright.config.ts.bak
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
import { defineConfig, devices } from '@playwright/test';
|
||||||
|
|
||||||
|
/**
 * End-to-end test configuration for Playwright.
 *
 * Reference: https://playwright.dev/docs/test-configuration
 */

// True when the CI environment variable is set to a non-empty value.
const isCI = !!process.env.CI;

// Address used both as the test base URL and as the web server readiness URL.
const devServerUrl = 'http://localhost:5173';

export default defineConfig({
	testDir: './src/tests',
	testMatch: '**/*.e2e.spec.ts',
	fullyParallel: true,
	forbidOnly: isCI,
	retries: isCI ? 2 : 0,
	workers: isCI ? 1 : undefined,
	reporter: 'list',

	use: {
		baseURL: devServerUrl,
		trace: 'on-first-retry'
	},

	projects: [
		{
			name: 'chromium',
			use: { ...devices['Desktop Chrome'] }
		}
	],

	webServer: {
		command: 'npm run dev',
		url: devServerUrl,
		reuseExistingServer: !isCI,
		timeout: 120000
	}
});
|
||||||
@@ -19,63 +19,34 @@ import { uploadRecipeWithIngredientsDTO, uploadRecipeImage } from '$lib/server/t
|
|||||||
import { pushNotificationService } from '$lib/server/notifications/PushNotificationService';
|
import { pushNotificationService } from '$lib/server/notifications/PushNotificationService';
|
||||||
import { queueConfig } from './config';
|
import { queueConfig } from './config';
|
||||||
import { logError } from '../utils/logger';
|
import { logError } from '../utils/logger';
|
||||||
import { env } from '$env/dynamic/private';
|
|
||||||
import type { ProgressEvent, ExtractedContent, ProgressCallback } from '$lib/server/extraction';
|
import type { ProgressEvent, ExtractedContent, ProgressCallback } from '$lib/server/extraction';
|
||||||
import type { QueueItem } from './types';
|
import type { QueueItem } from './types';
|
||||||
|
|
||||||
/**
 * Extract the caption and thumbnail for a post URL.
 *
 * The caption always comes from the Playwright extractor. yt-dlp runs
 * concurrently and contributes only its thumbnail; its caption is discarded.
 * A yt-dlp failure is non-fatal: it is reported through `cb` as a status
 * event and the thumbnail falls back to whatever Playwright captured.
 *
 * @param url - URL of the post to extract.
 * @param cb - Optional progress callback. Forwarded to the Playwright
 *   extractor and also used to report a yt-dlp failure.
 * @returns The Playwright caption together with the preferred thumbnail.
 * @throws Rethrows the Playwright rejection reason when browser extraction fails.
 */
async function extractTextAndThumbnail(
	url: string,
	cb?: ProgressCallback
): Promise<ExtractedContent> {
	// Run yt-dlp (thumbnail) and Playwright (caption) concurrently; allSettled
	// keeps a yt-dlp rejection from aborting the Playwright extraction.
	const [ytdlpResult, playwrightResult] = await Promise.allSettled([
		extractWithYtDlp(url),
		extractWithPlaywright(url, cb)
	]);

	// Without the browser result there is no caption at all, so this is fatal.
	if (playwrightResult.status === 'rejected') {
		throw playwrightResult.reason;
	}

	// Surface the yt-dlp failure instead of swallowing it silently, so the
	// caller can tell that the thumbnail came from the fallback source.
	if (ytdlpResult.status === 'rejected') {
		const reason: unknown = ytdlpResult.reason;
		cb?.({
			type: 'status',
			message: `yt-dlp thumbnail lookup failed (${reason instanceof Error ? reason.message : String(reason)}), using browser thumbnail`,
			timestamp: new Date().toISOString()
		});
	}

	const { bodyText, thumbnail: browserThumbnail } = playwrightResult.value;

	// Prefer the yt-dlp thumbnail; fall back to whatever Playwright captured.
	const thumbnail =
		ytdlpResult.status === 'fulfilled' && ytdlpResult.value.thumbnail
			? ytdlpResult.value.thumbnail
			: browserThumbnail;

	return { bodyText, thumbnail };
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
Reference in New Issue
Block a user