This commit is contained in:
Giancarmine Salucci
2025-12-21 02:03:05 +01:00
parent 167cd1f4bb
commit 9357bd483a
36 changed files with 6251 additions and 1547 deletions

View File

@@ -0,0 +1,115 @@
import { createBrowserContext } from './browser';
import fs from 'fs';
import path from 'path';
import type { Page } from 'playwright';
/**
 * Result of scraping a single URL with `extractTextAndThumbnail`.
 */
export interface ExtractedContent {
/** Visible body text of the page after cleanup (leading lines, trailing section, mentions and hashtags removed). */
bodyText: string;
/** JPEG screenshot encoded as a `data:image/jpeg;base64,...` URL, or `null` when no thumbnail is available. */
thumbnail: string | null;
}
/**
 * Locate the authentication storage file.
 *
 * Candidate locations are probed in priority order: the Docker path first,
 * then the path relative to the current working directory.
 *
 * @returns The first candidate that exists on disk, or `undefined` when none does
 */
function resolveAuthPath(): string | undefined {
  const candidatePaths = ['/app/secrets/auth.json', './secrets/auth.json'];
  return candidatePaths.find((candidate) => fs.existsSync(candidate));
}
/**
 * Extract text content and thumbnail from a URL using a Playwright browser.
 *
 * A new browser context and page are created for every call. The resolved
 * auth path (if any) is handed to `createBrowserContext`. Both the page and
 * the context are closed before this function returns or throws, including
 * when opening or configuring the page fails.
 *
 * @param url - The URL to extract from
 * @returns Extracted text and thumbnail
 * @throws Error with message `Failed to scrape URL` when navigation or
 *   extraction fails; errors from creating or configuring the page propagate
 *   unchanged
 */
export async function extractTextAndThumbnail(
  url: string
): Promise<ExtractedContent> {
  const authPath = resolveAuthPath();
  const context = await createBrowserContext(authPath);

  // Nested try/finally: the context is released even if the page cannot be
  // opened, configured, or closed.
  try {
    const page = await context.newPage();
    try {
      // Set a fixed viewport size (Instagram feed width)
      await page.setViewportSize({ width: 1080, height: 1920 });
      return await scrapePageContent(page, url);
    } finally {
      await page.close();
    }
  } finally {
    await context.close();
  }
}

/**
 * Navigate an already-configured page to `url` and extract its text and thumbnail.
 * Any failure is logged and rethrown as a generic `Failed to scrape URL` error.
 */
async function scrapePageContent(
  page: Page,
  url: string
): Promise<ExtractedContent> {
  try {
    await page.goto(url, { waitUntil: 'domcontentloaded' });

    // Extract and clean text content
    const bodyText = await extractCleanText(page);

    // Save debug content. This is a diagnostic artifact only, so a failed
    // write (e.g. unwritable working directory) must not fail the scrape.
    try {
      fs.writeFileSync(path.resolve('debug_page.txt'), bodyText);
    } catch (writeError) {
      console.warn('Could not write debug_page.txt:', writeError);
    }

    // Extract thumbnail from video element
    const thumbnail = await extractThumbnail(page);

    return { bodyText, thumbnail };
  } catch (e) {
    console.error('Scraping error:', e);
    throw new Error('Failed to scrape URL');
  }
}
/**
 * Extract and clean text from the page body.
 *
 * Steps, in order:
 * 1. read `document.body.innerText`;
 * 2. drop the first six lines;
 * 3. discard everything from the first "More posts from" marker onward;
 * 4. trim surrounding whitespace;
 * 5. strip `@mentions` and `#hashtags`.
 *
 * @param page - Page whose body text is read
 * @returns The cleaned text. Whitespace left behind by removed mentions and
 *   hashtags is not collapsed.
 */
async function extractCleanText(page: Page): Promise<string> {
  const rawText = await page.evaluate(() => document.body.innerText);

  // Remove first 6 lines. The pattern only matches when six newline-terminated
  // lines exist, so shorter text passes through unchanged.
  const withoutLeadingLines = rawText.replace(/^(?:.*\n){6}/, '');

  // Cut at "More posts from" (keep the whole text when the marker is absent)
  const markerIndex = withoutLeadingLines.indexOf('More posts from');
  const mainText = (
    markerIndex === -1
      ? withoutLeadingLines
      : withoutLeadingLines.slice(0, markerIndex)
  ).trim();

  // Remove mentions and hashtags.
  // - Mentions: word characters plus interior periods, so "@john.doe" is
  //   removed whole while a sentence-ending period after "@john" is kept.
  // - Hashtags: any Unicode letter, mark, digit or underscore, so accented tags
  //   such as "#caffè" are removed whole instead of leaving a stray "è".
  return mainText
    .replace(/@\w+(?:\.\w+)*/g, '')
    .replace(/#[\p{L}\p{M}\p{N}_]+/gu, '');
}
/**
 * Capture a JPEG thumbnail of the page as a base64 data URL.
 *
 * When the page contains a `<video>` element, the screenshot is clipped to the
 * part of that element that lies inside the viewport. If there is no video, or
 * none of it is inside the viewport (including a zero-size video), an
 * unclipped screenshot is taken instead. The fallback does not set `fullPage`.
 *
 * @param page - Page to capture
 * @returns A `data:image/jpeg;base64,...` URL
 */
async function extractThumbnail(page: Page): Promise<string | null> {
  const videoBounds = await page.evaluate(() => {
    const video = document.querySelector('video');
    if (!video) return null;

    const rect = video.getBoundingClientRect();

    // Intersect the video rectangle with the viewport. Clamping only the
    // origin would keep the full width/height and let the clip cover content
    // beside or below the video when it starts off-screen.
    const left = Math.max(0, rect.left);
    const top = Math.max(0, rect.top);
    const right = Math.min(rect.right, window.innerWidth);
    const bottom = Math.min(rect.bottom, window.innerHeight);

    // Width/height are zero or negative when the video is entirely outside
    // the viewport; the caller treats that as "nothing to clip".
    return { x: left, y: top, width: right - left, height: bottom - top };
  });

  let screenshotBuffer: Buffer;
  if (videoBounds && videoBounds.width > 0 && videoBounds.height > 0) {
    screenshotBuffer = await page.screenshot({
      type: 'jpeg',
      quality: 85,
      clip: videoBounds
    });
  } else {
    console.warn('Video element not found or has no size, taking full page screenshot');
    screenshotBuffer = await page.screenshot({ type: 'jpeg', quality: 85 });
  }

  return `data:image/jpeg;base64,${screenshotBuffer.toString('base64')}`;
}