fix

2025-12-21 02:03:05 +01:00
parent 167cd1f4bb
commit 9357bd483a
36 changed files with 6251 additions and 1547 deletions
--- a/src/lib/server/extraction.ts
+++ b/src/lib/server/extraction.ts
@@ -0,0 +1,115 @@
+import { createBrowserContext } from './browser';
+import fs from 'fs';
+import path from 'path';
+import type { Page } from 'playwright';
+
+export interface ExtractedContent { // Value resolved by extractTextAndThumbnail for a single URL
+	bodyText: string; // Page text as returned by extractCleanText
+	thumbnail: string | null; // JPEG data URI (base64) built by extractThumbnail; the type also allows null
+}
+
+/**
+ * Locate the authentication storage file, if one exists.
+ * Candidates are probed in priority order: the Docker path first, then the local path.
+ * @returns The first candidate path that exists, or undefined when neither does
+ */
+function resolveAuthPath(): string | undefined {
+	const candidatePaths = ['/app/secrets/auth.json', './secrets/auth.json'];
+
+	// find() stops at the first hit, so the local path is only checked when the Docker one is absent
+	return candidatePaths.find((candidate) => fs.existsSync(candidate));
+}
+
+/**
+ * Extract text content and thumbnail from a URL using Playwright browser
+ *
+ * The page and the browser context are released on every path, including
+ * failures during page setup. The cleaned text is also dumped to
+ * `debug_page.txt` in the current working directory; that dump is
+ * best-effort and never fails the extraction.
+ *
+ * @param url - The URL to extract from
+ * @returns Extracted text and thumbnail
+ * @throws Error with message 'Failed to scrape URL' when page setup, navigation or extraction fails
+ */
+export async function extractTextAndThumbnail(
+	url: string
+): Promise<ExtractedContent> {
+	const authPath = resolveAuthPath();
+	const context = await createBrowserContext(authPath);
+	let page: Page | undefined;
+
+	try {
+		// Page setup happens inside the try so the context is still closed if it throws
+		page = await context.newPage();
+
+		// Set a fixed viewport size (Instagram feed width)
+		await page.setViewportSize({ width: 1080, height: 1920 });
+
+		await page.goto(url, { waitUntil: 'domcontentloaded' });
+
+		// Extract and clean text content
+		const bodyText = await extractCleanText(page);
+
+		// Save debug content; a failed write (e.g. read-only working directory)
+		// must not discard an otherwise successful scrape
+		try {
+			fs.writeFileSync(path.resolve('debug_page.txt'), bodyText);
+		} catch (debugError) {
+			console.warn('Could not write debug_page.txt:', debugError);
+		}
+
+		// Extract thumbnail from video element
+		const thumbnail = await extractThumbnail(page);
+
+		return { bodyText, thumbnail };
+	} catch (e) {
+		console.error('Scraping error:', e);
+		throw new Error('Failed to scrape URL');
+	} finally {
+		// Close the context even when the page was never created or closing it throws
+		try {
+			await page?.close();
+		} finally {
+			await context.close();
+		}
+	}
+}
+
+/**
+ * Extract and clean text from page body
+ *
+ * Drops the first 6 lines, cuts everything from "More posts from" onwards,
+ * then strips @mentions and #hashtags and trims the result.
+ *
+ * @param page - Page whose `document.body.innerText` is read
+ * @returns The cleaned text without leading or trailing whitespace
+ */
+async function extractCleanText(page: Page): Promise<string> {
+	const rawText = await page.evaluate(() => document.body.innerText);
+
+	// Remove first 6 lines (no-op when the text has fewer than 6 line breaks)
+	const withoutHeader = rawText.replace(/^(?:.*\n){6}/, '');
+
+	// Cut at "More posts from"; slicing avoids indexing into a split() result
+	const cutIndex = withoutHeader.indexOf('More posts from');
+	const postText = cutIndex === -1 ? withoutHeader : withoutHeader.slice(0, cutIndex);
+
+	return (
+		postText
+			// Mentions may contain dots between word characters (@john.doe); a
+			// trailing sentence period is deliberately left in place
+			.replace(/@\w+(?:\.\w+)*/g, '')
+			// Hashtags may use non-ASCII letters, marks and digits (#café, #日本語)
+			.replace(/#[\p{L}\p{M}\p{N}_]+/gu, '')
+			// Trim last so whitespace exposed by the removals above is dropped too
+			.trim()
+	);
+}
+
+/**
+ * Extract thumbnail from the visible part of the first video element, or
+ * fall back to a screenshot of the current viewport.
+ *
+ * The fallback is used when no video element exists, when it has no size, or
+ * when it lies entirely outside the viewport. `fullPage` is not set, so the
+ * fallback captures the viewport only.
+ *
+ * @param page - Page to capture
+ * @returns The screenshot as a base64 JPEG data URI
+ */
+async function extractThumbnail(page: Page): Promise<string | null> {
+	const videoBounds = await page.evaluate(() => {
+		const video = document.querySelector('video');
+		if (!video) return null;
+		const rect = video.getBoundingClientRect();
+
+		// Intersect the video rectangle with the viewport. Clamping only the
+		// origin would keep the full width/height for a partly scrolled-off
+		// video and would produce an out-of-image clip for an off-screen one.
+		const left = Math.max(0, rect.left);
+		const top = Math.max(0, rect.top);
+		const right = Math.min(rect.right, window.innerWidth);
+		const bottom = Math.min(rect.bottom, window.innerHeight);
+
+		return {
+			x: left,
+			y: top,
+			width: right - left,
+			height: bottom - top
+		};
+	});
+
+	let screenshotBuffer: Buffer;
+
+	// A non-positive width or height means nothing of the video is visible
+	if (videoBounds && videoBounds.width > 0 && videoBounds.height > 0) {
+		screenshotBuffer = await page.screenshot({
+			type: 'jpeg',
+			quality: 85,
+			clip: videoBounds
+		});
+	} else {
+		console.warn('Video element not found or has no size, taking full page screenshot');
+		screenshotBuffer = await page.screenshot({ type: 'jpeg', quality: 85 });
+	}
+
+	return `data:image/jpeg;base64,${screenshotBuffer.toString('base64')}`;
+}