import { createBrowserContext } from './browser';
import fs from 'fs';
import path from 'path';
import type { Page, BrowserContext } from 'playwright';

/** Result of a successful extraction. */
export interface ExtractedContent {
  /** Cleaned caption / body text of the post. */
  bodyText: string;
  /** Thumbnail as a base64 data URI, or null when none could be produced. */
  thumbnail: string | null;
}

export type ExtractionMethod = 'embedded-json' | 'dom-selector' | 'graphql-api' | 'legacy';

export type ProgressEventType = 'status' | 'method' | 'retry' | 'error' | 'thumbnail' | 'complete';

export interface ProgressEvent {
  type: ProgressEventType;
  message: string;
  method?: ExtractionMethod;
  attemptNumber?: number;
  maxAttempts?: number;
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- payload shape depends on the event type
  data?: any;
  timestamp?: string;
}

export type ProgressCallback = (event: ProgressEvent) => void;

interface ExtractionResult {
  success: boolean;
  method?: ExtractionMethod;
  data?: ExtractedContent;
  error?: string;
}

interface InstagramEmbeddedData {
  entry_data?: {
    PostPage?: Array<{
      graphql?: {
        shortcode_media?: {
          edge_media_to_caption?: {
            edges?: Array<{ node: { text: string } }>;
          };
          display_url?: string;
          video_url?: string;
          owner?: {
            username: string;
            profile_pic_url: string;
          };
        };
      };
    }>;
  };
}

interface RetryConfig {
  maxAttempts: number;
  initialDelayMs: number;
  maxDelayMs: number;
  backoffMultiplier: number;
}

const DEFAULT_RETRY_CONFIG: RetryConfig = {
  maxAttempts: 3,
  initialDelayMs: 1000,
  maxDelayMs: 10000,
  backoffMultiplier: 2
};

/** Auth storage locations, in lookup order (Docker path first, then local path). */
const AUTH_PATH_CANDIDATES = ['/app/secrets/auth.json', './secrets/auth.json'];

/** Lower-case fragments of error messages that must never trigger a retry. */
const NON_RETRIABLE_MESSAGE_FRAGMENTS = ['authentication', 'login required', 'invalid url'];

const METHOD_DISPLAY_NAMES: Record<ExtractionMethod, string> = {
  'embedded-json': 'Embedded JSON',
  'dom-selector': 'DOM Selector',
  'graphql-api': 'GraphQL API',
  legacy: 'Legacy Parser'
};

/** Lines of Instagram UI chrome that are not part of a caption (tested against trimmed lines). */
const UI_LINE_PATTERNS: readonly RegExp[] = [
  /^More posts from.+$/i,
  /^View all \d+ comments$/i,
  /^Add a comment\.\.\.$/i,
  /^Liked by.+$/i
];

/** Script-tag patterns whose first capture group is an embedded JSON payload. */
const EMBEDDED_DATA_PATTERNS: ReadonlyArray<{ label: string; regex: RegExp }> = [
  { label: '_sharedData', regex: /window\._sharedData\s*=\s*(\{.+?\});/s },
  {
    label: '__additionalDataLoaded',
    regex: /window\.__additionalDataLoaded\([^,]+,\s*(\{.+?\})\);/s
  }
];

/** Meta tags that may carry a preview image URL, in lookup order. */
const META_IMAGE_SOURCES: ReadonlyArray<{ selector: string; label: string }> = [
  { selector: 'meta[property="og:image"]', label: 'og:image' },
  { selector: 'meta[name="twitter:image"]', label: 'twitter:image' }
];

/** Upper bound for fetching a thumbnail URL, headers and body included. */
const THUMBNAIL_FETCH_TIMEOUT_MS = 10000;

/** Current time as an ISO-8601 string, used to stamp progress events. */
function isoNow(): string {
  return new Date().toISOString();
}

/**
 * Close a page/context without letting a close failure mask the real outcome.
 * Failures are logged rather than thrown.
 */
async function closeQuietly(resource: { close(): Promise<void> }, label: string): Promise<void> {
  try {
    await resource.close();
  } catch (error) {
    console.warn(`[Extractor] Failed to close ${label}:`, error);
  }
}

/**
 * Resolve authentication storage path
 * Checks Docker path first, then local path
 * @returns The first existing candidate path, or undefined when none exists
 */
function resolveAuthPath(): string | undefined {
  return AUTH_PATH_CANDIDATES.find((candidate) => fs.existsSync(candidate));
}

/**
 * Sleep utility for retry logic
 */
async function sleep(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Check if error should not be retried
 *
 * Authentication failures and invalid URLs cannot be fixed by retrying.
 * Matching is case-insensitive so that messages such as
 * "Cannot navigate to invalid URL" are recognised.
 */
function isNonRetriableError(error: unknown): boolean {
  if (!(error instanceof Error)) {
    return false;
  }
  const message = error.message.toLowerCase();
  return NON_RETRIABLE_MESSAGE_FRAGMENTS.some((fragment) => message.includes(fragment));
}

/**
 * Get human-readable display name for extraction method
 */
function getMethodDisplayName(method: ExtractionMethod): string {
  return METHOD_DISPLAY_NAMES[method];
}

/**
 * Retry wrapper with exponential backoff
 *
 * @param fn - Operation to run; invoked up to `config.maxAttempts` times
 * @param config - Attempt count and backoff settings
 * @param onProgress - Optional callback that receives 'retry' and 'error' events
 * @returns The first successful result of `fn`
 * @throws The original error immediately when it is non-retriable, otherwise the
 *         error of the last attempt once all attempts are used up
 */
async function withRetry<T>(
  fn: () => Promise<T>,
  config: RetryConfig = DEFAULT_RETRY_CONFIG,
  onProgress?: ProgressCallback
): Promise<T> {
  let lastError: unknown = null;
  let delay = config.initialDelayMs;

  for (let attempt = 1; attempt <= config.maxAttempts; attempt++) {
    try {
      return await fn();
    } catch (error) {
      lastError = error;

      // Don't retry on certain errors (only Error instances can be non-retriable)
      if (error instanceof Error && isNonRetriableError(error)) {
        onProgress?.({
          type: 'error',
          message: `Non-retriable error: ${error.message}`,
          timestamp: isoNow()
        });
        throw error;
      }

      if (attempt < config.maxAttempts) {
        const message = `Attempt ${attempt}/${config.maxAttempts} failed. Retrying in ${delay}ms...`;
        console.warn(`[Retry] ${message}`, error);
        onProgress?.({
          type: 'retry',
          message,
          attemptNumber: attempt,
          maxAttempts: config.maxAttempts,
          timestamp: isoNow()
        });
        await sleep(delay);
        delay = Math.min(delay * config.backoffMultiplier, config.maxDelayMs);
      }
    }
  }

  // Also reached when maxAttempts < 1, in which case fn was never called.
  throw lastError || new Error('Max retry attempts exceeded');
}

/**
 * Extract shortcode from Instagram URL
 * @returns The shortcode of a /p/, /reel/ or /tv/ URL, or null when the URL has none
 */
function extractShortcode(url: string): string | null {
  const match = url.match(/\/(p|reel|tv)\/([A-Za-z0-9_-]+)/);
  return match?.[2] ?? null;
}

/**
 * Clean extracted text
 *
 * Drops lines that are Instagram UI chrome, then collapses all whitespace runs
 * (including newlines) to single spaces. The UI lines have to be removed before
 * whitespace is collapsed: once the newlines are gone, line-anchored patterns can
 * only match at the very start of the text and would then swallow all of it.
 */
function cleanText(text: string): string {
  const contentLines = text.split(/\r?\n/).filter((line) => {
    const trimmed = line.trim();
    return !UI_LINE_PATTERNS.some((pattern) => pattern.test(trimmed));
  });

  return contentLines.join(' ').replace(/\s+/g, ' ').trim();
}

/**
 * Strategy 1: Extract from embedded JSON data in script tags
 *
 * Scans every `script[type="text/javascript"]` tag for the known payload patterns
 * and returns the first payload that yields a caption.
 * @returns Caption and thumbnail, or null when no payload could be used
 */
async function extractFromEmbeddedJSON(
  page: Page,
  progressCallback?: ProgressCallback
): Promise<ExtractedContent | null> {
  try {
    // Extract all script tag contents
    const scriptContents = await page.evaluate(() => {
      const scripts = Array.from(document.querySelectorAll('script[type="text/javascript"]'));
      return scripts.map((script) => script.textContent || '');
    });

    for (const content of scriptContents) {
      for (const { label, regex } of EMBEDDED_DATA_PATTERNS) {
        const json = content.match(regex)?.[1];
        if (!json) {
          continue;
        }

        try {
          const data: InstagramEmbeddedData = JSON.parse(json);
          const result = parseInstagramData(data);
          if (result) {
            const thumbnail = await extractThumbnailStealth(page, progressCallback);
            return { ...result, thumbnail };
          }
        } catch (e) {
          // The lazy regex can cut the payload short; treat that as "not found".
          console.warn(`Failed to parse ${label}:`, e);
        }
      }
    }

    return null;
  } catch (error) {
    console.warn('Failed to extract from embedded JSON:', error);
    return null;
  }
}

/**
 * Parse Instagram data structure
 *
 * Reads the caption from `entry_data.PostPage[0].graphql.shortcode_media`, falling
 * back to `items` / `data.shortcode_media` when that path is absent.
 * @returns The cleaned caption, or null when none is present
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- untyped Instagram payload
function parseInstagramData(data: any): Omit<ExtractedContent, 'thumbnail'> | null {
  try {
    // Navigate the nested structure
    const media = data?.entry_data?.PostPage?.[0]?.graphql?.shortcode_media;

    if (!media) {
      // Try alternative structures
      const items = data?.items || data?.data?.shortcode_media;
      return items ? extractFromAlternativeStructure(items) : null;
    }

    // Extract caption
    const captionEdges = media.edge_media_to_caption?.edges || [];
    // eslint-disable-next-line @typescript-eslint/no-explicit-any -- untyped Instagram payload
    const bodyText = captionEdges.map((edge: any) => edge.node.text).join('\n');

    if (!bodyText) {
      return null;
    }

    return { bodyText: cleanText(bodyText) };
  } catch (error) {
    console.warn('Failed to parse Instagram data structure:', error);
    return null;
  }
}

/**
 * Parse alternative Instagram data structures
 *
 * Accepts either a single media object or an array (only the first element is used)
 * and reads `caption.text` or the first `edge_media_to_caption` edge.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- untyped Instagram payload
function extractFromAlternativeStructure(items: any): Omit<ExtractedContent, 'thumbnail'> | null {
  try {
    const item = Array.isArray(items) ? items[0] : items;
    const caption = item?.caption?.text || item?.edge_media_to_caption?.edges?.[0]?.node?.text;

    return caption ? { bodyText: cleanText(caption) } : null;
  } catch (error) {
    console.warn('Failed to parse alternative structure:', error);
    return null;
  }
}

/**
 * Strategy 2: Extract from DOM using specific selectors
 * @returns Caption and thumbnail, or null when no selector produced text
 */
async function extractFromDOM(
  page: Page,
  progressCallback?: ProgressCallback
): Promise<ExtractedContent | null> {
  try {
    const captionText = await page.evaluate(() => {
      // Try h1[dir="auto"] (most reliable for captions)
      const h1 = document.querySelector('h1[dir="auto"]');
      if (h1?.textContent) {
        return h1.textContent.trim();
      }

      // Try article caption div
      // NOTE(review): `article span` is very broad and may match non-caption text — confirm.
      const captionDiv = document.querySelector('article div._a9zs, article span');
      if (captionDiv?.textContent) {
        return captionDiv.textContent.trim();
      }

      // Try meta tag
      const metaDesc = document.querySelector('meta[property="og:description"]');
      if (metaDesc) {
        return metaDesc.getAttribute('content') || '';
      }

      return null;
    });

    if (!captionText) {
      return null;
    }

    const thumbnail = await extractThumbnailStealth(page, progressCallback);

    return { bodyText: cleanText(captionText), thumbnail };
  } catch (error) {
    console.warn('Failed to extract from DOM:', error);
    return null;
  }
}

/**
 * Strategy 3: Extract via GraphQL API
 *
 * Opens a temporary page in `context` to issue the request. The page is always
 * closed again, including when the request or the response parsing fails.
 * @returns The caption with a null thumbnail, or null when the request fails or
 *          the response carries no caption
 */
async function extractViaGraphQL(
  url: string,
  context: BrowserContext
): Promise<ExtractedContent | null> {
  const shortcode = extractShortcode(url);
  if (!shortcode) {
    console.warn('Could not extract shortcode from URL:', url);
    return null;
  }

  let page: Page | undefined;
  try {
    page = await context.newPage();

    // Make GraphQL request
    const response = await page.request.post('https://www.instagram.com/graphql/query/', {
      form: {
        variables: JSON.stringify({ shortcode }),
        doc_id: '7950326061742207' // May need periodic updates
      }
    });

    if (!response.ok()) {
      console.warn(`GraphQL request failed: ${response.status()}`);
      return null;
    }

    const data = await response.json();

    // Parse GraphQL response
    const media = data?.data?.shortcode_media;
    if (!media) {
      return null;
    }

    const bodyText = media.edge_media_to_caption?.edges?.[0]?.node?.text || '';
    if (!bodyText) {
      return null;
    }

    return {
      bodyText: cleanText(bodyText),
      thumbnail: null // GraphQL doesn't easily provide thumbnail, would need page context
    };
  } catch (error) {
    console.error('GraphQL extraction failed:', error);
    return null;
  } finally {
    if (page) {
      await closeQuietly(page, 'GraphQL page');
    }
  }
}

/**
 * Strategy 4: Legacy extraction method (fallback)
 *
 * Takes the visible page text, drops the first 6 lines, cuts everything from
 * "More posts from" onwards and strips @mentions and #hashtags.
 */
async function extractCleanTextLegacy(page: Page): Promise<string> {
  const rawText = await page.evaluate(() => document.body.innerText);

  const [beforeSuggestions = ''] = rawText
    .replace(/^(?:.*\n){6}/, '') // Remove first 6 lines
    .split('More posts from'); // Cut at "More posts from"

  // Remove mentions and hashtags
  return beforeSuggestions.trim().replace(/@\w+/g, '').replace(/#\w+/g, '');
}

/**
 * Orchestrate extraction strategies
 *
 * Runs the strategies in order (embedded JSON, DOM, GraphQL, legacy) and returns
 * the first one that yields non-empty body text. A strategy that throws is logged
 * and skipped.
 */
async function extractWithStrategies(
  url: string,
  page: Page,
  context: BrowserContext,
  onProgress?: ProgressCallback
): Promise<ExtractionResult> {
  const strategies: Array<{
    name: ExtractionMethod;
    fn: () => Promise<ExtractedContent | null>;
  }> = [
    { name: 'embedded-json', fn: () => extractFromEmbeddedJSON(page, onProgress) },
    { name: 'dom-selector', fn: () => extractFromDOM(page, onProgress) },
    { name: 'graphql-api', fn: () => extractViaGraphQL(url, context) },
    {
      name: 'legacy',
      fn: async () => {
        const text = await extractCleanTextLegacy(page);
        if (!text) {
          // An empty result is discarded below, so don't spend time on a thumbnail.
          return null;
        }
        const thumbnail = await extractThumbnailStealth(page, onProgress);
        return { bodyText: text, thumbnail };
      }
    }
  ];

  for (const strategy of strategies) {
    try {
      const methodMessage = `Trying extraction method: ${getMethodDisplayName(strategy.name)}`;
      console.log(`[Extractor] ${methodMessage}`);
      onProgress?.({
        type: 'method',
        message: methodMessage,
        method: strategy.name,
        timestamp: isoNow()
      });

      const result = await strategy.fn();

      if (result && result.bodyText) {
        const successMessage = `✓ Success with method: ${getMethodDisplayName(strategy.name)}`;
        console.log(`[Extractor] ${successMessage}`);
        onProgress?.({
          type: 'status',
          message: successMessage,
          method: strategy.name,
          timestamp: isoNow()
        });
        return { success: true, method: strategy.name, data: result };
      }
    } catch (error) {
      console.warn(`[Extractor] Method ${strategy.name} failed:`, error);
      // Continue to next strategy
    }
  }

  return { success: false, error: 'All extraction methods failed' };
}

/**
 * Extract text content and thumbnail from a URL using Playwright browser
 * Uses multiple extraction strategies with fallback
 *
 * Each attempt runs in a fresh browser context that is closed again afterwards,
 * whether the attempt succeeds or fails.
 *
 * @param url - The URL to extract from
 * @param onProgress - Optional callback to receive progress updates
 * @returns Extracted text and thumbnail
 * @throws When every extraction strategy fails on every retry attempt, or on a
 *         non-retriable error
 */
export async function extractTextAndThumbnail(
  url: string,
  onProgress?: ProgressCallback
): Promise<ExtractedContent> {
  onProgress?.({
    type: 'status',
    message: 'Starting extraction...',
    timestamp: isoNow()
  });

  return withRetry(
    async () => {
      const authPath = resolveAuthPath();
      const context = await createBrowserContext(authPath);

      try {
        const page = await context.newPage();

        try {
          // Set timeout
          page.setDefaultTimeout(30000);

          onProgress?.({
            type: 'status',
            message: 'Loading Instagram page...',
            timestamp: isoNow()
          });

          await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30000 });

          // Add small human-like delay
          await page.waitForTimeout(1000 + Math.random() * 2000);

          const result = await extractWithStrategies(url, page, context, onProgress);

          if (!result.success || !result.data) {
            throw new Error(result.error || 'Extraction failed');
          }

          // Save debug content. This is a diagnostic aid only, so a failed write
          // (e.g. read-only filesystem) must not fail an otherwise good extraction.
          try {
            fs.writeFileSync(
              path.resolve('debug_page.txt'),
              `Method: ${result.method}\n\n${result.data.bodyText}`
            );
          } catch (writeError) {
            console.warn('[Extractor] Failed to write debug_page.txt:', writeError);
          }

          onProgress?.({
            type: 'complete',
            message: 'Extraction completed successfully',
            method: result.method,
            timestamp: isoNow()
          });

          return result.data;
        } finally {
          await closeQuietly(page, 'page');
        }
      } finally {
        // Runs even when newPage() or closing the page failed, so the context never leaks.
        await closeQuietly(context, 'browser context');
      }
    },
    DEFAULT_RETRY_CONFIG,
    onProgress
  );
}

/**
 * Screenshot-based thumbnail extraction (fallback method)
 * Takes a screenshot of the video element or full page if video not found
 * @returns JPEG screenshot as a base64 data URI
 */
async function extractThumbnailScreenshot(page: Page): Promise<string> {
  const videoBounds = await page.evaluate(() => {
    const video = document.querySelector('video');
    if (!video) return null;

    const rect = video.getBoundingClientRect();
    return {
      x: Math.max(0, rect.left),
      y: Math.max(0, rect.top),
      width: Math.min(rect.width, window.innerWidth),
      height: Math.min(rect.height, window.innerHeight)
    };
  });

  let screenshotBuffer: Buffer;

  if (videoBounds && videoBounds.width > 0 && videoBounds.height > 0) {
    screenshotBuffer = await page.screenshot({ type: 'jpeg', quality: 85, clip: videoBounds });
  } else {
    console.warn('[Thumbnail] Video element not found or has no size, taking full page screenshot');
    screenshotBuffer = await page.screenshot({ type: 'jpeg', quality: 85 });
  }

  return `data:image/jpeg;base64,${screenshotBuffer.toString('base64')}`;
}

/**
 * Helper: Fetch image from URL and convert to base64 data URI
 *
 * **Validation Criteria:**
 * - HTTP status must be exactly 200 (not 2xx, only 200)
 * - Content-Type must start with 'image/' (e.g., image/jpeg, image/png, image/webp)
 * - Request, including the body download, must complete within 10 seconds
 *
 * **Failure Scenarios:**
 * - Non-200 status → Returns null, reports status code via progress callback
 * - Invalid content-type → Returns null, reports content-type via progress callback
 * - Timeout → Returns null, reports timeout via progress callback
 * - Network error → Returns null, reports error message via progress callback
 *
 * **Usage in Fallback Chain:**
 * This function is used by `extractThumbnailStealth()` which tries multiple URL sources:
 * 1. Meta tags (og:image, twitter:image)
 * 2. Video poster attribute
 * 3. Instagram data structures (display_url, thumbnail_src)
 * 4. Screenshot fallback
 *
 * When this function returns null, extraction continues to the next method.
 *
 * @param imageUrl - The image URL to fetch (expected to be an absolute URL; anything
 *                   `fetch` rejects is reported as a network error)
 * @param progressCallback - Optional callback for progress reporting
 * @returns Base64 data URI (data:image/*;base64,...) or null if validation fails
 *
 * @example
 * ```typescript
 * const thumbnail = await fetchImageAsBase64(
 *   'https://instagram.com/image.jpg',
 *   (event) => console.log(event.message)
 * );
 *
 * if (thumbnail) {
 *   // thumbnail is a valid base64 data URI
 *   console.log(thumbnail.substring(0, 50)); // "data:image/jpeg;base64,/9j/4AAQSkZJRg..."
 * } else {
 *   // URL validation failed, try next method
 * }
 * ```
 */
async function fetchImageAsBase64(
  imageUrl: string,
  progressCallback?: ProgressCallback
): Promise<string | null> {
  // Create abort controller for timeout
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), THUMBNAIL_FETCH_TIMEOUT_MS);

  try {
    console.log(`[Thumbnail] Validating URL: ${imageUrl}`);
    const response = await fetch(imageUrl, { signal: controller.signal });

    // Strict status validation: must be exactly 200
    if (response.status !== 200) {
      console.warn(`[Thumbnail] URL validation failed: HTTP ${response.status} for ${imageUrl}`);
      progressCallback?.({
        type: 'status',
        message: `Thumbnail URL returned HTTP ${response.status}, trying next method...`,
        timestamp: isoNow()
      });
      return null;
    }

    // Validate content-type
    const contentType = response.headers.get('content-type') || '';
    if (!contentType.startsWith('image/')) {
      console.warn(
        `[Thumbnail] URL validation failed: Invalid content-type '${contentType}' for ${imageUrl}`
      );
      progressCallback?.({
        type: 'status',
        message: `Thumbnail URL returned non-image content (${contentType}), trying next method...`,
        timestamp: isoNow()
      });
      return null;
    }

    console.log(`[Thumbnail] URL validation successful: ${imageUrl} (${contentType})`);

    // The timer is still armed here, so a stalled body download is aborted too.
    const arrayBuffer = await response.arrayBuffer();
    const buffer = Buffer.from(arrayBuffer);
    const base64Data = `data:${contentType};base64,${buffer.toString('base64')}`;

    progressCallback?.({
      type: 'status',
      message: 'Thumbnail fetched and validated from URL',
      timestamp: isoNow()
    });

    return base64Data;
  } catch (e) {
    if (e instanceof Error) {
      if (e.name === 'AbortError') {
        console.error(`[Thumbnail] URL fetch timeout: ${imageUrl}`);
        progressCallback?.({
          type: 'status',
          message: 'Thumbnail URL fetch timeout, trying next method...',
          timestamp: isoNow()
        });
      } else {
        console.error(`[Thumbnail] Failed to fetch image from ${imageUrl}:`, e.message);
        progressCallback?.({
          type: 'status',
          message: `Thumbnail URL fetch failed (${e.message}), trying next method...`,
          timestamp: isoNow()
        });
      }
    } else {
      console.error('[Thumbnail] Failed to fetch image:', e);
    }
    return null;
  } finally {
    // Clear on every path so no timer stays pending after the request settles.
    clearTimeout(timeoutId);
  }
}

/**
 * Read an attribute of the first element matching `selector` without waiting.
 *
 * `page.getAttribute()` waits for the element to appear (up to the page's default
 * timeout) and throws when it never does, which makes probing for optional
 * elements very slow. Evaluating in the page returns immediately instead.
 * @returns The attribute value, or null when the element or attribute is absent
 */
function readAttribute(page: Page, selector: string, attribute: string): Promise<string | null> {
  return page.evaluate(
    ({ selector: sel, attribute: attr }) =>
      document.querySelector(sel)?.getAttribute(attr) ?? null,
    { selector, attribute }
  );
}

/**
 * Fetch `imageUrl` as a data URI and, on success, emit a 'thumbnail' progress event.
 * @returns The data URI, or null when the URL failed validation
 */
async function fetchAndReportThumbnail(
  imageUrl: string,
  successMessage: string,
  progressCallback?: ProgressCallback
): Promise<string | null> {
  const thumbnail = await fetchImageAsBase64(imageUrl, progressCallback);
  if (!thumbnail) {
    return null;
  }

  progressCallback?.({
    type: 'thumbnail',
    message: successMessage,
    data: { thumbnail },
    timestamp: isoNow()
  });
  return thumbnail;
}

/**
 * Extract thumbnail from Instagram post using stealth techniques
 *
 * Tries multiple methods in order of stealth:
 * 1. Meta tags (og:image, twitter:image)
 * 2. Video poster attribute
 * 3. Instagram window data structures
 * 4. Screenshot fallback
 *
 * Methods 1-3 only locate an image URL; the image is then downloaded and validated
 * by `fetchImageAsBase64()`. A method whose URL fails validation is skipped.
 *
 * **Thumbnail Format Guide:**
 * Every method returns a base64 data URI (`data:image/...;base64,...`), never a
 * plain HTTPS URL. Methods 1-3 keep the content type reported by the server;
 * method 4 is always `image/jpeg`.
 *
 * @param page - Playwright page instance
 * @param progressCallback - Optional progress callback for SSE updates
 * @returns Base64 data URI of the thumbnail, or null if all methods fail
 */
async function extractThumbnailStealth(
  page: Page,
  progressCallback?: ProgressCallback
): Promise<string | null> {
  console.log('[Thumbnail] Starting stealth extraction');

  // Method 1: Try meta tags (most stealthy). Each tag is probed independently so
  // that a problem with og:image does not prevent trying twitter:image.
  for (const { selector, label } of META_IMAGE_SOURCES) {
    try {
      const metaImageUrl = await readAttribute(page, selector, 'content');
      if (!metaImageUrl) {
        continue;
      }

      console.log(`[Thumbnail] Found ${label} meta tag`);
      const thumbnail = await fetchAndReportThumbnail(
        metaImageUrl,
        'Thumbnail extracted from meta tags',
        progressCallback
      );
      if (thumbnail) {
        return thumbnail;
      }
    } catch (e) {
      console.log('[Thumbnail] Meta tag method failed:', e);
    }
  }

  // Method 2: Try video poster attribute
  try {
    const poster = await readAttribute(page, 'video', 'poster');
    if (poster) {
      console.log('[Thumbnail] Found video poster attribute');
      const thumbnail = await fetchAndReportThumbnail(
        poster,
        'Thumbnail extracted from video poster',
        progressCallback
      );
      if (thumbnail) {
        return thumbnail;
      }
    }
  } catch (e) {
    console.log('[Thumbnail] Video poster method failed:', e);
  }

  // Method 3: Try Instagram window data structures
  try {
    const thumbnailUrl = await page.evaluate(() => {
      // Check for Instagram's internal data structures
      // eslint-disable-next-line @typescript-eslint/no-explicit-any -- untyped page global
      const data = (window as any).__additionalDataLoaded;
      if (data) {
        // Navigate through Instagram's data structure
        for (const key in data) {
          const item = data[key];
          if (item?.graphql?.shortcode_media?.display_url) {
            return item.graphql.shortcode_media.display_url;
          }
          if (item?.graphql?.shortcode_media?.thumbnail_src) {
            return item.graphql.shortcode_media.thumbnail_src;
          }
        }
      }
      return null;
    });

    if (thumbnailUrl) {
      console.log('[Thumbnail] Found thumbnail in Instagram data structures');
      const thumbnail = await fetchAndReportThumbnail(
        thumbnailUrl,
        'Thumbnail extracted from Instagram data',
        progressCallback
      );
      if (thumbnail) {
        return thumbnail;
      }
    }
  } catch (e) {
    console.log('[Thumbnail] Instagram data method failed:', e);
  }

  // Method 4: Screenshot fallback. A failure here yields null rather than throwing,
  // so a missing thumbnail never discards text that was already extracted.
  console.log('[Thumbnail] Falling back to screenshot method');
  let screenshotThumbnail: string;
  try {
    screenshotThumbnail = await extractThumbnailScreenshot(page);
  } catch (e) {
    console.error('[Thumbnail] Screenshot method failed:', e);
    return null;
  }

  progressCallback?.({
    type: 'thumbnail',
    message: 'Thumbnail extracted via screenshot',
    data: { thumbnail: screenshotThumbnail },
    timestamp: isoNow()
  });

  return screenshotThumbnail;
}