fix
This commit is contained in:
115
src/lib/server/extraction.ts
Normal file
115
src/lib/server/extraction.ts
Normal file
@@ -0,0 +1,115 @@
|
||||
import { createBrowserContext } from './browser';
|
||||
import fs from 'fs';
|
||||
import path from 'path';
|
||||
import type { Page } from 'playwright';
|
||||
|
||||
/**
 * Result of scraping a single URL.
 */
export interface ExtractedContent {
  /** Cleaned text taken from the page body. */
  bodyText: string;
  /**
   * Thumbnail image encoded as a `data:image/jpeg;base64,...` URL,
   * or `null` when no thumbnail is available.
   */
  thumbnail: string | null;
}
|
||||
|
||||
/**
 * Resolve the authentication storage path.
 *
 * Candidates are probed in priority order: the Docker location first, then
 * the local location (relative to the current working directory).
 *
 * @returns The first candidate that exists on disk, or `undefined` when
 *          neither file is present.
 */
function resolveAuthPath(): string | undefined {
  // Order matters: the Docker path wins when both files exist.
  const candidates = ['/app/secrets/auth.json', './secrets/auth.json'];

  return candidates.find((candidate) => fs.existsSync(candidate));
}
|
||||
|
||||
/**
 * Extract text content and thumbnail from a URL using Playwright browser.
 *
 * A browser context is created (with stored authentication when an auth file
 * is found), the URL is loaded, the cleaned body text is extracted and then a
 * thumbnail is captured. The page and the context are always closed, whether
 * the scrape succeeds or fails.
 *
 * @param url - The URL to extract from
 * @returns Extracted text and thumbnail
 * @throws Error with message `Failed to scrape URL` when viewport setup,
 *         navigation or extraction fails; the underlying error is logged
 *         to the console.
 */
export async function extractTextAndThumbnail(
  url: string
): Promise<ExtractedContent> {
  const authPath = resolveAuthPath();
  const context = await createBrowserContext(authPath);

  // Everything after context creation is wrapped so the context is released
  // even when opening the page itself fails.
  try {
    const page = await context.newPage();

    try {
      // Set a fixed viewport size (Instagram feed width)
      await page.setViewportSize({ width: 1080, height: 1920 });

      await page.goto(url, { waitUntil: 'domcontentloaded' });

      // Extract and clean text content
      const bodyText = await extractCleanText(page);

      // Save debug content (best-effort, never fails the scrape)
      writeDebugText(bodyText);

      // Extract thumbnail from video element
      const thumbnail = await extractThumbnail(page);

      return { bodyText, thumbnail };
    } catch (e: unknown) {
      console.error('Scraping error:', e);
      throw new Error('Failed to scrape URL');
    } finally {
      await page.close();
    }
  } finally {
    // Runs even if page.close() throws, so the context is never leaked.
    await context.close();
  }
}

/**
 * Write the extracted text to `debug_page.txt` in the current working
 * directory. Failures (e.g. a read-only filesystem) are logged and ignored
 * because the dump is a debugging aid only.
 */
function writeDebugText(text: string): void {
  try {
    fs.writeFileSync(path.resolve('debug_page.txt'), text);
  } catch (debugError: unknown) {
    console.warn('Could not write debug_page.txt:', debugError);
  }
}
|
||||
|
||||
/**
 * Extract and clean text from page body.
 *
 * Steps, in order:
 *  1. Read `document.body.innerText` from the page.
 *  2. Drop the first 6 lines (only when the text has at least 6
 *     newline-terminated lines; shorter text is left untouched).
 *  3. Discard everything from the first "More posts from" marker onwards.
 *  4. Remove mentions (including dotted handles such as `@john.doe`) and
 *     hashtags (including non-ASCII ones such as `#café`).
 *  5. Trim surrounding whitespace left behind by the removals.
 *
 * @param page - Page whose body text should be read
 * @returns The cleaned text
 */
async function extractCleanText(page: Page): Promise<string> {
  const rawText: string = await page.evaluate(() => document.body.innerText);

  // Remove first 6 lines
  const withoutHeader = rawText.replace(/^(?:.*\n){6}/, '');

  // Cut at "More posts from"
  const markerIndex = withoutHeader.indexOf('More posts from');
  const mainText =
    markerIndex === -1 ? withoutHeader : withoutHeader.slice(0, markerIndex);

  return (
    mainText
      .trim()
      // Mentions: word characters with optional dot-separated segments, so a
      // sentence-ending period after a handle is preserved.
      .replace(/@\w+(?:\.\w+)*/g, '')
      // Hashtags: any Unicode letter, mark, digit or underscore.
      .replace(/#[\p{L}\p{M}\p{N}_]+/gu, '')
      .trim()
  );
}
|
||||
|
||||
/**
 * Extract thumbnail from video element, or fall back to a plain screenshot.
 *
 * The first `<video>` element's bounding box is intersected with the viewport
 * and used as the screenshot clip. When there is no video, or no part of it
 * lies inside the viewport, a screenshot is taken without a clip (no
 * `fullPage` option is passed, so this is presumably the current viewport
 * rather than the whole scrollable page — see Playwright's `page.screenshot`).
 *
 * @param page - Page to capture
 * @returns A `data:image/jpeg;base64,...` URL of the captured image
 */
async function extractThumbnail(page: Page): Promise<string | null> {
  const clip = await page.evaluate(() => {
    const video = document.querySelector('video');
    if (!video) return null;

    const rect = video.getBoundingClientRect();

    // Clamp each edge to the viewport, then derive the size from the clamped
    // edges so the clip is the true visible intersection.
    const left = Math.max(0, rect.left);
    const top = Math.max(0, rect.top);
    const right = Math.min(rect.right, window.innerWidth);
    const bottom = Math.min(rect.bottom, window.innerHeight);

    return { x: left, y: top, width: right - left, height: bottom - top };
  });

  const jpegOptions = { type: 'jpeg', quality: 85 } as const;
  let screenshotBuffer: Buffer;

  // A non-positive size means the video is entirely outside the viewport.
  if (clip != null && clip.width > 0 && clip.height > 0) {
    screenshotBuffer = await page.screenshot({ ...jpegOptions, clip });
  } else {
    console.warn('Video element not found or has no size, taking full page screenshot');
    screenshotBuffer = await page.screenshot(jpegOptions);
  }

  return `data:image/jpeg;base64,${screenshotBuffer.toString('base64')}`;
}
|
||||
Reference in New Issue
Block a user