import { createBrowserContext } from './browser';
import { logError } from './utils/logger';
import fs from 'fs';
import path from 'path';
import type { Page, BrowserContext, Response as PlaywrightResponse } from 'playwright';

/** Caption text plus an optional thumbnail extracted from a post. */
export interface ExtractedContent {
  bodyText: string;
  thumbnail: string | null;
}

/** Identifier of the strategy that produced (or is attempting) an extraction. */
export type ExtractionMethod =
  | 'embedded-json'
  | 'internal-state'
  | 'html-section'
  | 'dom-selector'
  | 'graphql-api'
  | 'graphql-intercept'
  | 'legacy';

/** Best-scoring caption `<span>` found while scanning the page (browser side). */
type CaptionCandidate = {
  element: Element;
  text: string;
  score: number;
  innerHTML: string;
  brCount: number;
};

export type ProgressEventType =
  | 'status'
  | 'method'
  | 'retry'
  | 'error'
  | 'thumbnail'
  | 'complete'
  | 'model_loading';

/** Progress notification delivered to a {@link ProgressCallback}. */
export interface ProgressEvent {
  type: ProgressEventType;
  message: string;
  method?: ExtractionMethod;
  attemptNumber?: number;
  maxAttempts?: number;
  data?: any;
  timestamp?: string;
}

export type ProgressCallback = (event: ProgressEvent) => void;

/** Outcome of running the strategy chain in `extractWithStrategies`. */
interface ExtractionResult {
  success: boolean;
  method?: ExtractionMethod;
  data?: ExtractedContent;
  error?: string;
}

/** Shape of the `window._sharedData` payload as read by `parseInstagramData`. */
interface InstagramEmbeddedData {
  entry_data?: {
    PostPage?: Array<{
      graphql?: {
        shortcode_media?: {
          edge_media_to_caption?: {
            edges?: Array<{ node: { text: string } }>;
          };
          display_url?: string;
          video_url?: string;
          owner?: {
            username: string;
            profile_pic_url: string;
          };
        };
      };
    }>;
  };
}

/** Exponential-backoff settings consumed by `withRetry`. */
interface RetryConfig {
  maxAttempts: number;
  initialDelayMs: number;
  maxDelayMs: number;
  backoffMultiplier: number;
}

const DEFAULT_RETRY_CONFIG: RetryConfig = {
  maxAttempts: 3,
  initialDelayMs: 1000,
  maxDelayMs: 10000,
  backoffMultiplier: 2
};

/** Minimum caption length (in characters) for a caption to be treated as complete. */
const MIN_FULL_CAPTION_LENGTH = 130;

/**
 * Resolve authentication storage path.
 * Checks the Docker path first, then the local path.
 *
 * @returns The first existing auth file path, or `undefined` when neither exists.
 */
function resolveAuthPath(): string | undefined {
  const candidates = ['/app/secrets/auth.json', './secrets/auth.json'];
  return candidates.find((candidate) => fs.existsSync(candidate));
}

/**
 * Sleep utility for retry logic.
 *
 * @param ms - Delay in milliseconds.
 */
async function sleep(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Check if an error should not be retried (authentication failures, invalid URLs).
 * Matching is done on the error message, case-sensitively.
 */
function isNonRetriableError(error: unknown): boolean {
  if (!(error instanceof Error)) {
    return false;
  }
  const nonRetriableMarkers = ['authentication', 'login required', 'invalid url'];
  return nonRetriableMarkers.some((marker) => error.message.includes(marker));
}

/**
 * Get human-readable display name for an extraction method.
 */
function getMethodDisplayName(method: ExtractionMethod): string {
  const names: Record<ExtractionMethod, string> = {
    'embedded-json': 'Embedded JSON',
    'internal-state': 'Internal State',
    'html-section': 'HTML Section',
    'dom-selector': 'DOM Selector',
    'graphql-api': 'GraphQL API',
    'graphql-intercept': 'GraphQL Intercept',
    legacy: 'Legacy Parser'
  };
  return names[method];
}

/**
 * Retry wrapper with exponential backoff.
 *
 * @param fn - Operation to run; it is re-invoked from scratch on each attempt.
 * @param config - Attempt count and backoff settings.
 * @param onProgress - Optional sink for `retry` / `error` progress events.
 * @returns The value of the first successful attempt.
 * @throws The non-retriable error immediately, or the last error once all attempts failed.
 */
async function withRetry<T>(
  fn: () => Promise<T>,
  config: RetryConfig = DEFAULT_RETRY_CONFIG,
  onProgress?: ProgressCallback
): Promise<T> {
  let lastError: Error | null = null;
  let delay = config.initialDelayMs;

  for (let attempt = 1; attempt <= config.maxAttempts; attempt++) {
    try {
      return await fn();
    } catch (error) {
      lastError = error as Error;

      // Don't retry on certain errors
      if (isNonRetriableError(error)) {
        onProgress?.({
          type: 'error',
          message: `Non-retriable error: ${lastError.message}`,
          timestamp: new Date().toISOString()
        });
        throw error;
      }

      if (attempt < config.maxAttempts) {
        const message = `Attempt ${attempt}/${config.maxAttempts} failed. Retrying in ${delay}ms...`;
        logError(`[Retry] ${message}`, error);
        onProgress?.({
          type: 'retry',
          message,
          attemptNumber: attempt,
          maxAttempts: config.maxAttempts,
          timestamp: new Date().toISOString()
        });
        await sleep(delay);
        delay = Math.min(delay * config.backoffMultiplier, config.maxDelayMs);
      }
    }
  }

  // lastError is only null when maxAttempts < 1 (the loop never ran).
  throw lastError ?? new Error('Max retry attempts exceeded');
}

/**
 * Extract shortcode from an Instagram URL (/p/, /reel/, /reels/, /tv/ paths).
 */
function extractShortcode(url: string): string | undefined {
  const match = url.match(/\/(p|reel|reels|tv)\/([A-Za-z0-9_-]+)/);
  return match ? match[2] : undefined;
}

/**
 * Recipe keywords used for caption scoring
 */
const RECIPE_KEYWORDS = [
  'ingredienti',
  'procedimento',
  'preparazione',
  'ricetta',
  'recipe',
  'instructions'
];

/**
 * Timeout configuration constants (in milliseconds)
 */
const TIMEOUTS = {
  CONTENT_LOAD: 1500,
  MORE_BUTTON_VISIBILITY: 1000,
  CAPTION_EXPANSION: 3000,
  MORE_BUTTON_VISIBILITY_DOM: 500,
  MORE_BUTTON_CLICK: 800,
  PAGE_LOAD: 10000,
  NETWORK_SETTLE: 2000,
  ARTICLE_SELECTOR: 5000,
  GRAPHQL_WAIT: 1000,
  PAGE_NAVIGATION: 30000,
  ANTI_DETECTION_MIN: 1000,
  ANTI_DETECTION_MAX: 3000
} as const;

/**
 * Try to expand a truncated caption by clicking the "more" button (HTML section method).
 * Best effort: every failure is logged and swallowed so extraction can continue.
 */
async function tryExpandCaptionInHTMLSection(page: Page): Promise<void> {
  console.log('[Extractor] Looking for "more" button in primary post container...');
  try {
    await page.waitForTimeout(TIMEOUTS.CONTENT_LOAD);

    const mainContainer = page.locator('article, main, [role="main"]').first();
    const containerExists = (await mainContainer.count()) > 0;
    if (!containerExists) {
      console.log('[Extractor] No main container found');
      return;
    }
    console.log('[Extractor] Found main post container, searching for "more" button...');

    const morePatterns = [
      {
        locator: mainContainer.locator('span').filter({ hasText: /\.\.\.\s*more/i }),
        desc: "span with '...more'"
      },
      {
        locator: mainContainer.locator('span').filter({ hasText: /…\s*more/i }),
        desc: "span with '… more'"
      },
      {
        locator: mainContainer.locator('div[role="button"]').filter({ hasText: /more/i }),
        desc: "button with 'more'"
      },
      {
        locator: mainContainer.locator('span[role="button"]').filter({ hasText: /more/i }),
        desc: "span button with 'more'"
      }
    ];

    for (const pattern of morePatterns) {
      const count = await pattern.locator.count();
      console.log(`[Extractor] Checking ${pattern.desc}: found ${count}`);
      if (count === 0) continue;

      const firstMore = pattern.locator.first();
      try {
        if (await firstMore.isVisible({ timeout: TIMEOUTS.MORE_BUTTON_VISIBILITY })) {
          const text = await firstMore.textContent();
          console.log(`[Extractor] Found visible "more": "${text}"`);
          await firstMore.click();
          console.log('[Extractor] Clicked "more" - waiting for expansion...');
          await page.waitForTimeout(TIMEOUTS.CAPTION_EXPANSION);
          console.log('[Extractor] Caption expansion complete');
          break;
        }
      } catch (e) {
        console.log(`[Extractor] ${pattern.desc} not clickable: ${e}`);
      }
    }
    console.log('[Extractor] Finished "more" button expansion attempt');
  } catch (e) {
    console.log(`[Extractor] Error while trying to expand caption: ${e}`);
  }
}

/**
 * Try to expand a truncated caption by clicking the "more" button (DOM method).
 * Clicks at most three times; stops as soon as no selector yields a visible button.
 */
async function tryExpandCaptionInDOM(page: Page): Promise<void> {
  const moreButtonSelectors = [
    'article button:has-text("more")',
    'article button:has-text("More")',
    'article button:has-text("… more")',
    'article span[role="button"]:has-text("more")',
    'article [role="button"]:has-text("more")',
    'article div[role="button"]:has-text("more")',
    'xpath=//article//span[contains(text(), "more")]/..',
    'xpath=//article//button[contains(., "more")]'
  ];
  const maxExpandAttempts = 3;

  for (let expandAttempts = 0; expandAttempts < maxExpandAttempts; expandAttempts++) {
    let clicked = false;
    for (const selector of moreButtonSelectors) {
      try {
        const button = page.locator(selector).first();
        if (await button.isVisible({ timeout: TIMEOUTS.MORE_BUTTON_VISIBILITY_DOM })) {
          await button.click();
          await page.waitForTimeout(TIMEOUTS.MORE_BUTTON_CLICK);
          console.log(`[Extractor] Clicked "more" button with selector: ${selector}`);
          clicked = true;
          break;
        }
      } catch (e) {
        // Selector failed (not present / not clickable) - try the next one
      }
    }
    if (!clicked) break;
  }
}

/**
 * Clean up extracted text - removes HTML tags, decodes entities, cleans whitespace.
 *
 * @param text - Raw caption text or innerHTML.
 * @returns Plain text with `<br>` turned into newlines, known UI boilerplate removed,
 *          whitespace normalized and a trailing run of hashtags stripped.
 */
export function cleanText(text: string): string {
  let cleaned = text;

  // First, convert <br> tags to newlines to preserve line breaks
  cleaned = cleaned.replace(/<br\s*\/?>/gi, '\n');

  // Strip all other HTML tags while keeping the text content
  cleaned = cleaned.replace(/<[^>]+>/g, '');

  // Decode HTML entities. `&amp;` goes last so that an escaped entity such as
  // `&amp;lt;` decodes to the literal text `&lt;` instead of being unescaped twice.
  cleaned = cleaned
    .replace(/&lt;/g, '<')
    .replace(/&gt;/g, '>')
    .replace(/&quot;/g, '"')
    .replace(/&#0*39;|&#x0*27;|&apos;/gi, "'")
    .replace(/&nbsp;/g, ' ')
    .replace(/&amp;/g, '&');

  // Remove common UI text patterns
  const uiPatterns = [
    /More posts from.+/gi,
    /View all \d+ comments/gi,
    /Add a comment\.\.\./gi,
    /Liked by.+?(?=\n|$)/gi
  ];
  for (const pattern of uiPatterns) {
    cleaned = cleaned.replace(pattern, '');
  }

  // Clean up whitespace while preserving intentional line breaks
  cleaned = cleaned.replace(/[ \t]+$/gm, ''); // trailing spaces on each line
  cleaned = cleaned.replace(/^[ \t]+/gm, ''); // leading spaces on each line

  // Replace multiple consecutive blank lines with max 2 newlines
  cleaned = cleaned.replace(/\n\s*\n\s*\n+/g, '\n\n');

  // Remove spaces around newlines
  cleaned = cleaned.replace(/ *\n */g, '\n');

  // Normalize multiple spaces to single space within lines
  cleaned = cleaned.replace(/ {2,}/g, ' ');

  // Remove hashtags from end of text
  // Pattern: #word #multiple_words (supports international characters)
  cleaned = cleaned.replace(/(#[\w\u00C0-\u024F\u1E00-\u1EFF\u0400-\u04FF]+\s*)+$/gi, '');

  return cleaned.trim();
}

/**
 * Decode the body of a JSON string literal (text between the quotes).
 * Uses `JSON.parse` so every escape is handled; falls back to a manual
 * replacement chain when the fragment is not a valid JSON string body.
 */
function decodeJsonStringBody(raw: string): string {
  try {
    const decoded: unknown = JSON.parse(`"${raw}"`);
    if (typeof decoded === 'string') {
      return decoded;
    }
  } catch {
    // Not a valid JSON string body (e.g. raw control characters) - decode manually
  }
  return raw
    .replace(/\\n/g, '\n')
    .replace(/\\"/g, '"')
    .replace(/\\u([0-9a-fA-F]{4})/g, (_, code) => String.fromCharCode(parseInt(code, 16)))
    .replace(/\\\\/g, '\\');
}

/**
 * Extract from embedded JSON data in script tags (first strategy in the chain).
 *
 * Looks, per script tag, for `window._sharedData`, `window.__additionalDataLoaded(...)`,
 * and finally any large script mentioning `"caption"` / `"text"`.
 *
 * @returns Extracted content, or `null` when nothing usable is found or an error occurs.
 */
async function extractFromEmbeddedJSON(
  page: Page,
  progressCallback?: ProgressCallback
): Promise<ExtractedContent | null> {
  try {
    // Extract all script tag contents
    const scriptInfo = await page.evaluate(() => {
      const scripts = Array.from(document.querySelectorAll('script'));
      const scriptData = scripts.map((script) => ({
        type: script.getAttribute('type') || 'no-type',
        hasContent: !!script.textContent,
        length: script.textContent?.length || 0,
        preview: script.textContent?.substring(0, 100) || ''
      }));
      console.log(`[Extractor] Found ${scripts.length} script tags`);
      return {
        contents: scripts.map((script) => script.textContent || ''),
        info: scriptData
      };
    });

    console.log(`[Extractor] Script tags summary:`, scriptInfo.info);

    // Look for embedded data patterns
    for (const [i, content] of scriptInfo.contents.entries()) {
      // Try window._sharedData pattern
      const sharedDataMatch = content.match(/window\._sharedData\s*=\s*(\{.+?\});/s);
      if (sharedDataMatch) {
        console.log(`[Extractor] Found _sharedData in script ${i}`);
        try {
          const data: InstagramEmbeddedData = JSON.parse(sharedDataMatch[1]);
          const result = parseInstagramData(data);
          if (result) {
            const thumbnail = await extractThumbnailStealth(page, progressCallback);
            return { ...result, thumbnail };
          }
        } catch (e) {
          logError('[Extractor] Failed to parse _sharedData', e);
        }
      }

      // Try __additionalDataLoaded pattern: window.__additionalDataLoaded('<path>', {...});
      const additionalDataMatch = content.match(
        /window\.__additionalDataLoaded\([^,]+,\s*(\{.+?\})\);/s
      );
      if (additionalDataMatch) {
        console.log(`[Extractor] Found __additionalDataLoaded in script ${i}`);
        try {
          const data = JSON.parse(additionalDataMatch[1]);
          const result = parseInstagramData(data);
          if (result) {
            const thumbnail = await extractThumbnailStealth(page, progressCallback);
            return { ...result, thumbnail };
          }
        } catch (e) {
          logError('[Extractor] Failed to parse __additionalDataLoaded', e);
        }
      }

      // Try to find any large JSON with caption data (new Instagram format)
      const mentionsCaption = content.includes('"caption"') || content.includes('"text"');
      if (!mentionsCaption || content.length <= 10000) {
        continue;
      }

      console.log(
        `[Extractor] Attempting to extract from large JSON in script ${i} (length: ${content.length})`
      );
      try {
        // Try to parse as direct JSON
        const jsonData = JSON.parse(content);

        // Try deep search first
        const deepResult = deepSearchForCaption(jsonData);
        if (deepResult && deepResult.bodyText.length > MIN_FULL_CAPTION_LENGTH) {
          console.log(
            `[Extractor] Deep search in JSON found caption: ${deepResult.bodyText.length} chars`
          );
          const thumbnail = await extractThumbnailStealth(page, progressCallback);
          return { ...deepResult, thumbnail };
        }

        // Try standard parsing
        const result = parseInstagramData(jsonData);
        if (result && result.bodyText.length > MIN_FULL_CAPTION_LENGTH) {
          console.log(
            `[Extractor] Successfully extracted from JSON, text length: ${result.bodyText.length}`
          );
          const thumbnail = await extractThumbnailStealth(page, progressCallback);
          return { ...result, thumbnail };
        }
      } catch (e) {
        // Not direct JSON or parsing failed, try to find caption fields with regex
        console.log(`[Extractor] JSON parse failed, trying regex extraction...`);

        // Try multiple patterns for different Instagram JSON structures
        const patterns = [
          /"caption"\s*:\s*\{\s*"text"\s*:\s*"([^"\\]*(\\.[^"\\]*)*)"/, // Escaped quotes
          /"text"\s*:\s*"([^"\\]*(\\.[^"\\]*)*)"\s*,?\s*"pk"/, // text field near pk
          /"edge_media_to_caption"\s*:\s*\{\s*"edges"\s*:\s*\[\s*\{\s*"node"\s*:\s*\{\s*"text"\s*:\s*"([^"\\]*(\\.[^"\\]*)*)"/
        ];

        for (const pattern of patterns) {
          const captionMatch = content.match(pattern);
          if (!captionMatch) continue;

          const captionText = decodeJsonStringBody(captionMatch[1] || '');
          if (captionText.length > MIN_FULL_CAPTION_LENGTH) {
            console.log(
              `[Extractor] Extracted caption from regex pattern, length: ${captionText.length}`
            );
            const thumbnail = await extractThumbnailStealth(page, progressCallback);
            return { bodyText: cleanText(captionText), thumbnail };
          }
        }
      }
    }

    return null;
  } catch (error) {
    logError('[Extractor] Failed to extract from embedded JSON', error);
    return null;
  }
}

/**
 * Parse Instagram data structure.
 *
 * Reads `entry_data.PostPage[0].graphql.shortcode_media`; when absent, falls back
 * to `items` / `data.shortcode_media` via `extractFromAlternativeStructure`.
 *
 * @returns Cleaned caption text, or `null` when no caption is present.
 */
function parseInstagramData(data: any): Omit<ExtractedContent, 'thumbnail'> | null {
  try {
    // Navigate the nested structure
    const media = data?.entry_data?.PostPage?.[0]?.graphql?.shortcode_media;
    if (!media) {
      // Try alternative structures
      const items = data?.items || data?.data?.shortcode_media;
      return items ? extractFromAlternativeStructure(items) : null;
    }

    // Extract caption
    const captionEdges = media.edge_media_to_caption?.edges || [];
    const bodyText = captionEdges.map((edge: any) => edge.node.text).join('\n');
    if (!bodyText) {
      return null;
    }

    return { bodyText: cleanText(bodyText) };
  } catch (error) {
    logError('[Extractor] Failed to parse Instagram data structure', error);
    return null;
  }
}

/**
 * Parse alternative Instagram data structures (`caption.text` or the first
 * `edge_media_to_caption` edge). An array input is reduced to its first element.
 */
function extractFromAlternativeStructure(items: any): Omit<ExtractedContent, 'thumbnail'> | null {
  try {
    const item = Array.isArray(items) ? items[0] : items;
    const caption = item?.caption?.text || item?.edge_media_to_caption?.edges?.[0]?.node?.text;
    return caption ? { bodyText: cleanText(caption) } : null;
  } catch (error) {
    logError('[Extractor] Failed to parse alternative structure', error);
    return null;
  }
}

/**
 * Extract caption by finding the span with recipe content characteristics.
 * Instagram uses obfuscated class names, but the caption span has identifiable patterns:
 * - Contains substantial text
 * - Has multiple <br> tags for formatting
 * - Contains <a> tags for mentions and hashtags
 * - Usually has a style attribute with line-height
 *
 * @param page - Page already navigated to the post.
 * @param progressCallback - Forwarded to thumbnail extraction.
 * @param targetUrl - When given, the page URL must carry the same shortcode.
 * @returns Extracted content, or `null` on shortcode mismatch, no candidate, or error.
 */
export async function extractFromHTMLSection(
  page: Page,
  progressCallback?: ProgressCallback,
  targetUrl?: string
): Promise<ExtractedContent | null> {
  try {
    console.log('[Extractor] Waiting for page content to load...');

    // Validate we're on the correct page
    const currentUrl = page.url();
    const targetShortcode = targetUrl ? extractShortcode(targetUrl) : null;
    const currentShortcode = extractShortcode(currentUrl);

    console.log(`[Extractor] Current page URL: ${currentUrl}`);
    console.log(
      `[Extractor] Target shortcode: ${targetShortcode}, Current shortcode: ${currentShortcode}`
    );

    if (targetShortcode && currentShortcode !== targetShortcode) {
      console.log(`[Extractor] URL mismatch: expected ${targetShortcode}, got ${currentShortcode}`);
      return null;
    }
    console.log(`[Extractor] Confirmed on correct post: ${currentShortcode}`);

    // Wait for the document to be parsed, then give the network time to settle
    await page.waitForLoadState('domcontentloaded', { timeout: TIMEOUTS.PAGE_LOAD });
    await page.waitForTimeout(TIMEOUTS.NETWORK_SETTLE);

    // Try to expand truncated caption by clicking "more" button.
    // Since we're already on the correct page (URL validated above),
    // the FIRST article/main post container should be our target post.
    await tryExpandCaptionInHTMLSection(page);

    console.log('[Extractor] Extracting caption using intelligent span detection...');

    // The callback runs in the browser and cannot close over module scope,
    // so the shortcode and keyword list are passed in as a serializable argument.
    const result = await page.evaluate(
      ({ shortcode, recipeKeywords }) => {
        // Instagram loads multiple posts, so prefer spans inside containers
        // that hold a link to the target shortcode.
        const postLinks = document.querySelectorAll(`a[href*="/${shortcode}"]`);
        console.log(`[Extractor] Found ${postLinks.length} links to target post ${shortcode}`);

        const searchRoots: Element[] = [];
        postLinks.forEach((link) => {
          // Get the article or section container for this post
          const container =
            link.closest('article') || link.closest('section') || link.closest('[role="main"]');
          if (container && !searchRoots.includes(container)) {
            searchRoots.push(container);
            console.log(`[Extractor] Found container for target post`);
          }
        });

        // If no specific containers found, search the whole document (fallback)
        if (searchRoots.length === 0) {
          console.log(`[Extractor] No specific container found, searching whole document`);
          searchRoots.push(document.body);
        }

        const spans: HTMLElement[] = [];
        for (const root of searchRoots) {
          root.querySelectorAll('span').forEach((span) => spans.push(span as HTMLElement));
        }
        console.log(`[Extractor] Searching ${spans.length} spans for recipe content`);

        let bestCandidate: CaptionCandidate | null = null;

        // Search all spans for the best caption candidate.
        // PRIMARY CRITERIA: most <br> tags (recipe formatting indicator)
        for (const span of spans) {
          const text = (span.textContent || '').toLowerCase();
          const innerHTML = span.innerHTML || '';

          // Skip empty or very short spans
          if (text.length < 30) continue;

          // Count <br> tags - the primary signal; no minimum is required
          const brCount = (innerHTML.match(/<br\s*\/?>/gi) || []).length;

          let score = brCount * 100; // Massive weight for line breaks

          // Recipe keywords are a strong indicator
          if (recipeKeywords.some((keyword) => text.includes(keyword))) {
            score += 500;
          }

          // Count <a> tags - captions have hashtags/mentions
          const linkCount = span.querySelectorAll('a').length;
          if (linkCount > 2) {
            score += linkCount * 10;
          }

          // Text length (longer is better for recipes), capped at 200 points
          score += Math.min(text.length / 5, 200);

          // Check for line-height style (caption formatting)
          const style = span.getAttribute('style') || '';
          if (style.includes('line-height')) {
            score += 30;
          }

          // Penalize UI elements
          if (text.match(/^(follow|following|liked by|view all|more posts|comments)/i)) {
            score -= 500;
          }

          // Penalize audio/music credits
          if (text.match(/·|papaoutai|afro soul/i) && text.length < 100) {
            score -= 200;
          }

          // Update best candidate
          if (score > 0 && (!bestCandidate || score > bestCandidate.score)) {
            console.log(
              `[Extractor] New best: score=${score}, len=${text.length}, br=${brCount}, links=${linkCount}, preview="${text.substring(0, 80)}..."`
            );
            bestCandidate = {
              element: span,
              text: span.textContent || '',
              score,
              innerHTML,
              brCount
            };
          }
        }

        if (!bestCandidate) {
          return { success: false, error: 'No suitable caption span found', text: '' };
        }

        console.log(
          `[Extractor] Final caption candidate: score=${bestCandidate.score}, length=${bestCandidate.text.length}`
        );

        // Return innerHTML to preserve <br> tags; cleanText converts them to newlines
        const captionText = bestCandidate.innerHTML;
        return {
          success: true,
          text: captionText,
          score: bestCandidate.score,
          length: captionText.length,
          htmlPreview: bestCandidate.innerHTML.substring(0, 500)
        };
      },
      { shortcode: currentShortcode, recipeKeywords: [...RECIPE_KEYWORDS] }
    );

    console.log(`[Extractor] HTML Section result:`, {
      success: result.success,
      textLength: result.length,
      score: result.score
    });

    if (result.htmlPreview) {
      console.log('[Extractor] HTML preview (first 500 chars):');
      console.log(result.htmlPreview);
    }

    if (!result.success) {
      console.log(`[Extractor] ${result.error}`);
      return null;
    }

    const captionText = result.text;
    if (!captionText || captionText.length === 0) {
      console.log('[Extractor] No text extracted from HTML section');
      return null;
    }

    const thumbnail = await extractThumbnailStealth(page, progressCallback);
    return { bodyText: cleanText(captionText), thumbnail };
  } catch (error) {
    logError('[Extractor] Failed to extract from HTML section', error);
    return null;
  }
}

/**
 * Extract from DOM using specific selectors.
 *
 * Briefly listens for GraphQL/API responses, then expands the caption and takes
 * the longest text found under `article`, in data attributes, or in hidden
 * elements, falling back to `og:description`.
 *
 * @returns Extracted content, or `null` when nothing is found or an error occurs.
 */
export async function extractFromDOM(
  page: Page,
  progressCallback?: ProgressCallback
): Promise<ExtractedContent | null> {
  try {
    // Give Instagram more time to load dynamic content
    console.log('[Extractor] Waiting for network idle...');
    await page.waitForLoadState('networkidle', { timeout: TIMEOUTS.PAGE_LOAD }).catch(() => {
      console.log('[Extractor] Network idle timeout, continuing anyway');
    });

    // Try to wait for article content
    await page.waitForSelector('article', { timeout: TIMEOUTS.ARTICLE_SELECTOR }).catch(() => {});

    // Additional wait for dynamic content
    await page.waitForTimeout(TIMEOUTS.NETWORK_SETTLE);

    // Try to intercept GraphQL responses. A holder object is used because the
    // value is written from the listener closure and read after the wait.
    const intercepted: { caption: string | null } = { caption: null };
    const onResponse = async (response: PlaywrightResponse): Promise<void> => {
      const url = response.url();
      if (!url.includes('graphql') && !url.includes('api/v1')) {
        return;
      }
      try {
        const json = await response.json();
        const captionData = extractCaptionFromGraphQL(json);
        if (captionData && captionData.length > MIN_FULL_CAPTION_LENGTH) {
          intercepted.caption = captionData;
          console.log(`[Extractor] Intercepted GraphQL response with ${captionData.length} chars`);
        }
      } catch (e) {
        // Not JSON or parsing failed
      }
    };

    page.on('response', onResponse);
    try {
      await page.waitForTimeout(TIMEOUTS.GRAPHQL_WAIT);
    } finally {
      // The result is only read right below, so stop listening to avoid a
      // listener that outlives this call on a page the caller keeps using.
      page.off('response', onResponse);
    }

    if (intercepted.caption) {
      const thumbnail = await extractThumbnailStealth(page, progressCallback);
      return { bodyText: cleanText(intercepted.caption), thumbnail };
    }

    // Try to expand truncated captions by clicking "more" button
    await tryExpandCaptionInDOM(page);

    const captionText = await page.evaluate(() => {
      // First check og:description for comparison
      const metaDesc = document.querySelector('meta[property="og:description"]');
      const ogContent = metaDesc?.getAttribute('content') || '';
      console.log(`[Extractor] og:description length: ${ogContent.length}`);
      if (ogContent.length > 200) {
        console.log(`[Extractor] og:description preview: ${ogContent.substring(0, 200)}...`);
      }

      const candidates = Array.from(
        document.querySelectorAll('article span, article div, article h1')
      );
      let longestText = '';
      let matchedElement: Element | null = null;

      // Strategy 1: Find elements with substantial text
      for (const element of candidates) {
        const text = element.textContent?.trim() || '';

        // Skip UI elements
        if (text.match(/^(follow|like|comment|share|view all|load more|add a comment)$/i)) {
          continue;
        }

        if (text.length > longestText.length) {
          longestText = text;
          matchedElement = element;
        }
      }

      // Strategy 2: Look in data attributes
      const elementsWithData = Array.from(
        document.querySelectorAll('[data-caption], [data-text], [data-content]')
      );
      for (const el of elementsWithData) {
        const dataCaption =
          el.getAttribute('data-caption') ||
          el.getAttribute('data-text') ||
          el.getAttribute('data-content');
        if (dataCaption && dataCaption.length > longestText.length) {
          longestText = dataCaption;
          console.log(`[Extractor] Found data attribute with ${dataCaption.length} chars`);
        }
      }

      // Strategy 3: Look for hidden/collapsed content
      const hiddenElements = Array.from(
        document.querySelectorAll(
          '[style*="display: none"], [style*="display:none"], .collapsed, [aria-hidden="true"]'
        )
      );
      for (const el of hiddenElements) {
        const text = el.textContent?.trim() || '';
        if (text.length > longestText.length && text.length > 200) {
          longestText = text;
          console.log(`[Extractor] Found hidden element with ${text.length} chars`);
        }
      }

      // Strategy 4: Text ending in "..." looks truncated - check parent and siblings
      if (matchedElement && longestText.endsWith('...')) {
        const parent = matchedElement.parentElement;
        if (parent) {
          const parentText = parent.textContent?.trim() || '';
          if (parentText.length > longestText.length) {
            longestText = parentText;
            console.log(
              `[Extractor] Found fuller text in parent element: ${parentText.length} chars`
            );
          }
        }

        // Check up to five following siblings for continuation text
        let sibling = matchedElement.nextElementSibling;
        let siblingCount = 0;
        while (sibling && siblingCount < 5) {
          const siblingText = sibling.textContent?.trim() || '';
          if (siblingText.length > 50) {
            longestText = longestText + ' ' + siblingText;
            console.log(`[Extractor] Found continuation in sibling: ${siblingText.length} chars`);
          }
          sibling = sibling.nextElementSibling;
          siblingCount++;
        }
      }

      if (longestText && longestText.length > 100) {
        console.log(`[Extractor] Best extraction: ${longestText.length} chars`);
        return longestText;
      }

      // Fallback to og:description, dropping the "N likes, M comments - user on date:" prefix
      if (metaDesc) {
        const cleanedContent = ogContent.replace(
          /^\d+K?\s+likes,\s+\d+\s+comments\s+-\s+[\w.]+\s+on\s+[^:]+:\s*["']?/,
          ''
        );
        console.log('[Extractor] DOM selector fallback: og:description (with metadata cleanup)');
        return cleanedContent;
      }

      return null;
    });

    if (!captionText) {
      return null;
    }

    // Extract thumbnail using existing logic
    const thumbnail = await extractThumbnailStealth(page, progressCallback);
    return { bodyText: cleanText(captionText), thumbnail };
  } catch (error) {
    logError('[Extractor] Failed to extract from DOM', error);
    return null;
  }
}

/**
 * Extract via GraphQL API: posts the shortcode to Instagram's GraphQL endpoint
 * from a temporary page, which is closed on every path.
 *
 * @returns Caption with `thumbnail: null`, or `null` when the shortcode cannot be
 *          parsed, the request fails, or no caption is present.
 */
async function extractViaGraphQL(
  url: string,
  context: BrowserContext
): Promise<ExtractedContent | null> {
  const shortcode = extractShortcode(url);
  if (!shortcode) {
    console.warn('Could not extract shortcode from URL:', url);
    return null;
  }

  try {
    const page = await context.newPage();
    try {
      // Make GraphQL request
      const response = await page.request.post('https://www.instagram.com/graphql/query/', {
        form: {
          variables: JSON.stringify({ shortcode }),
          doc_id: '7950326061742207' // May need periodic updates
        }
      });

      if (!response.ok()) {
        console.warn(`GraphQL request failed: ${response.status()}`);
        return null;
      }

      const data = await response.json();

      // Parse GraphQL response
      const media = data?.data?.shortcode_media;
      if (!media) {
        return null;
      }

      const bodyText = media.edge_media_to_caption?.edges?.[0]?.node?.text || '';
      if (!bodyText) {
        return null;
      }

      return {
        bodyText: cleanText(bodyText),
        thumbnail: null // GraphQL doesn't easily provide thumbnail, would need page context
      };
    } finally {
      // Close the temporary page even when the request or JSON parsing throws
      await page.close();
    }
  } catch (error) {
    logError('[Extractor] GraphQL extraction failed', error);
    return null;
  }
}

/**
 * Legacy extraction method (last fallback): takes the page's visible text,
 * drops the first six lines and everything after "More posts from", then
 * removes @mentions and #hashtags.
 */
async function extractCleanTextLegacy(page: Page): Promise<string> {
  const bodyText = await page.evaluate(() => document.body.innerText);
  const trimmed = bodyText
    .replace(/^(?:.*\n){6}/, '') // Remove first 6 lines
    .split('More posts from')[0] // Cut at "More posts from"
    .trim();

  // Remove mentions and hashtags
  return trimmed.replace(/@\w+/g, '').replace(/#\w+/g, '');
}

/**
 * Extract from Instagram's internal state/cache exposed on `window`.
 *
 * @returns Extracted content when a caption longer than the minimum length is
 *          found, otherwise `null`.
 */
async function extractFromInternalState(
  page: Page,
  progressCallback?: ProgressCallback
): Promise<ExtractedContent | null> {
  try {
    const stateData = await page.evaluate(() => {
      // Try to access Instagram's internal React/Apollo cache
      const possibleKeys = [
        '_sharedData',
        '__PRIVATE_STATE__',
        '__additionalData',
        '__initialData',
        '__RELAY_STORE__'
      ];

      for (const key of possibleKeys) {
        const data = (window as any)[key];
        if (data) {
          console.log(`[Extractor] Found internal state: ${key}`);
          // NOTE(review): truncating the serialized JSON at 500000 chars yields
          // invalid JSON for larger states, so the parse below fails and this
          // strategy returns null - confirm whether that is intended.
          return { key, data: JSON.stringify(data).substring(0, 500000) }; // Limit to 500KB
        }
      }
      return null;
    });

    if (!stateData) {
      return null;
    }

    console.log(`[Extractor] Parsing internal state from ${stateData.key}`);
    try {
      const parsed = JSON.parse(stateData.data);

      // Try multiple parsing strategies
      let result = parseInstagramData(parsed);
      console.log(`[Extractor] Standard parsing result: ${result?.bodyText?.length || 0} chars`);

      // Debug: log structure
      if (parsed.entry_data) {
        console.log(`[Extractor] Found entry_data with keys:`, Object.keys(parsed.entry_data));
      }
      if (parsed.config) {
        console.log(`[Extractor] Found config`);
      }

      // If standard parsing failed, try deep search for caption text
      if (!result || !result.bodyText || result.bodyText.length <= MIN_FULL_CAPTION_LENGTH) {
        console.log(`[Extractor] Attempting deep search in ${stateData.key}...`);
        result = deepSearchForCaption(parsed);
        if (result) {
          console.log(`[Extractor] Deep search found: ${result.bodyText.length} chars`);
        } else {
          console.log(`[Extractor] Deep search found no caption`);
        }
      }

      if (result && result.bodyText && result.bodyText.length > MIN_FULL_CAPTION_LENGTH) {
        console.log(
          `[Extractor] Successfully extracted from ${stateData.key}, length: ${result.bodyText.length}`
        );
        const thumbnail = await extractThumbnailStealth(page, progressCallback);
        return { ...result, thumbnail };
      }
      if (result?.bodyText) {
        console.log(
          `[Extractor] Found text in ${stateData.key} but it's truncated (${result.bodyText.length} chars)`
        );
      }
    } catch (e) {
      console.log(`[Extractor] Failed to parse ${stateData.key}:`, e);
    }

    return null;
  } catch (error) {
    logError('[Extractor] Failed to extract from internal state', error);
    return null;
  }
}

/**
 * Deep search for caption text in any nested object structure.
 *
 * Checks `caption.text`, `edge_media_to_caption.edges[0].node.text` and a direct
 * `text` field on each visited object, depth-first, and returns the first hit
 * that is longer than the minimum caption length both before and after cleaning.
 *
 * @param obj - Value to search; non-objects yield `null`.
 * @param maxDepth - Maximum recursion depth.
 * @param currentDepth - Depth of `obj` (callers normally omit this).
 */
function deepSearchForCaption(
  obj: any,
  maxDepth = 10,
  currentDepth = 0
): Omit<ExtractedContent, 'thumbnail'> | null {
  if (currentDepth > maxDepth || !obj || typeof obj !== 'object') {
    return null;
  }

  const isLongString = (value: unknown): value is string =>
    typeof value === 'string' && value.length > MIN_FULL_CAPTION_LENGTH;

  // Look for caption/text fields
  if (obj.caption && typeof obj.caption === 'object' && isLongString(obj.caption.text)) {
    return { bodyText: cleanText(obj.caption.text) };
  }

  // Look for edge_media_to_caption pattern
  const edgeText = obj.edge_media_to_caption?.edges?.[0]?.node?.text;
  if (isLongString(edgeText)) {
    return { bodyText: cleanText(edgeText) };
  }

  // Look for direct text field in media items
  if (isLongString(obj.text)) {
    // Make sure it's not just a UI label
    if (!obj.text.match(/^(more|less|follow|like|comment|share)$/i)) {
      return { bodyText: cleanText(obj.text) };
    }
  }

  // Recursively search own enumerable properties. Object.values is used instead
  // of obj.hasOwnProperty, which parsed data may shadow or lack entirely.
  for (const value of Object.values(obj)) {
    const result = deepSearchForCaption(value, maxDepth, currentDepth + 1);
    if (result && result.bodyText.length > MIN_FULL_CAPTION_LENGTH) {
      return result;
    }
  }

  return null;
}

/**
 * Extract caption from a GraphQL response, validating it matches the expected shortcode.
 *
 * @param data - Parsed response body.
 * @param expectedShortcode - When given, the serialized response must contain it;
 *                            otherwise the response is ignored.
 * @returns The cleaned caption, or `null`.
 */
function extractCaptionFromGraphQL(data: any, expectedShortcode?: string): string | null {
  if (expectedShortcode && !JSON.stringify(data).includes(expectedShortcode)) {
    // This GraphQL response is for different content, ignore it
    return null;
  }

  const result = deepSearchForCaption(data);
  return result?.bodyText || null;
}

/**
 * Orchestrate extraction strategies: runs them in order and returns the first
 * result with non-empty `bodyText`. A strategy that throws is logged and skipped.
 */
async function extractWithStrategies(
  url: string,
  page: Page,
  context: BrowserContext,
  onProgress?: ProgressCallback
): Promise<ExtractionResult> {
  const strategies: Array<{
    name: ExtractionMethod;
    fn: () => Promise<ExtractedContent | null>;
  }> = [
    { name: 'embedded-json', fn: () => extractFromEmbeddedJSON(page, onProgress) },
    { name: 'internal-state', fn: () => extractFromInternalState(page, onProgress) },
    { name: 'html-section', fn: () => extractFromHTMLSection(page, onProgress, url) },
    { name: 'dom-selector', fn: () => extractFromDOM(page, onProgress) },
    { name: 'graphql-api', fn: () => extractViaGraphQL(url, context) },
    {
      name: 'legacy',
      fn: async () => {
        const text = await extractCleanTextLegacy(page);
        const thumbnail = await extractThumbnailStealth(page, onProgress);
        return { bodyText: text, thumbnail };
      }
    }
  ];

  for (const strategy of strategies) {
    try {
      const methodMessage = `Trying extraction method: ${getMethodDisplayName(strategy.name)}`;
      console.log(`[Extractor] ${methodMessage}`);
      onProgress?.({
        type: 'method',
        message: methodMessage,
        method: strategy.name,
        timestamp: new Date().toISOString()
      });

      const result = await strategy.fn();
      if (result && result.bodyText) {
        const successMessage = `✓ Success with method: ${getMethodDisplayName(strategy.name)}`;
        console.log(`[Extractor] ${successMessage}`);
        onProgress?.({
          type: 'status',
          message: successMessage,
          method: strategy.name,
          timestamp: new Date().toISOString()
        });
        return { success: true, method: strategy.name, data: result };
      }
    } catch (error) {
      logError(`[Extractor] Method ${strategy.name} failed`, error);
      // Continue to next strategy
    }
  }

  return { success: false, error: 'All extraction methods failed' };
}

/**
 * Extract text content and thumbnail from a URL using Playwright browser.
 * Uses GraphQL response interception first, then multiple extraction strategies
 * with fallback; the whole attempt is retried with backoff on failure.
 *
 * @param url - The URL to extract from
 * @param onProgress - Optional callback to receive progress updates
 * @returns Extracted text and thumbnail
 * @throws When every strategy fails on every retry attempt.
 */
export async function extractTextAndThumbnail(
  url: string,
  onProgress?: ProgressCallback
): Promise<ExtractedContent> {
  onProgress?.({
    type: 'status',
    message: 'Starting extraction...',
    timestamp: new Date().toISOString()
  });

  return withRetry(
    async () => {
      const authPath = resolveAuthPath();
      const context = await createBrowserContext(authPath);
      const page = await context.newPage();

      // Extract shortcode for validation
      const expectedShortcode = extractShortcode(url);
      console.log(`[Extractor] Target shortcode: ${expectedShortcode || 'unknown'}`);

      try {
        page.setDefaultTimeout(TIMEOUTS.PAGE_NAVIGATION);

        // Set up GraphQL response interception BEFORE loading the page.
        // This is critical to catch initial network requests during page load.
        const intercepted: { caption: string | null } = { caption: null };
        page.on('response', async (response) => {
          try {
            const responseUrl = response.url();
            const isApiResponse =
              responseUrl.includes('graphql') ||
              responseUrl.includes('api/v1') ||
              responseUrl.includes('/web/');
            if (!isApiResponse) {
              return;
            }
            const json = await response.json();
            const captionData = extractCaptionFromGraphQL(json, expectedShortcode);
            if (captionData && captionData.length > MIN_FULL_CAPTION_LENGTH) {
              intercepted.caption = captionData;
              console.log(
                `[Extractor] ✓ Intercepted GraphQL with full caption: ${captionData.length} chars (shortcode verified)`
              );
            }
          } catch (e) {
            // Not JSON, parse error, or response no longer available - skip
          }
        });

        onProgress?.({
          type: 'status',
          message: 'Loading Instagram page...',
          timestamp: new Date().toISOString()
        });

        await page.goto(url, {
          waitUntil: 'domcontentloaded',
          timeout: TIMEOUTS.PAGE_NAVIGATION
        });

        // Add small human-like delay
        const jitter = TIMEOUTS.ANTI_DETECTION_MAX - TIMEOUTS.ANTI_DETECTION_MIN;
        await page.waitForTimeout(TIMEOUTS.ANTI_DETECTION_MIN + Math.random() * jitter);

        // Try scrolling and waiting to trigger additional GraphQL requests
        console.log('[Extractor] Scrolling to trigger lazy loading...');
        await page.evaluate(() => {
          window.scrollBy(0, 300);
        });
        await page.waitForTimeout(1500);
        await page.evaluate(() => {
          window.scrollBy(0, 300);
        });
        await page.waitForTimeout(1500);
        await page.evaluate(() => {
          window.scrollTo(0, 0);
        });
        await page.waitForTimeout(1000);

        // If we intercepted a full caption, use it immediately
        if (intercepted.caption) {
          console.log('[Extractor] Using intercepted caption from network traffic');
          const thumbnail = await extractThumbnailStealth(page, onProgress);
          onProgress?.({
            type: 'complete',
            message: 'Extraction completed via GraphQL interception',
            method: 'graphql-intercept',
            timestamp: new Date().toISOString()
          });
          return { bodyText: cleanText(intercepted.caption), thumbnail };
        }

        const result = await extractWithStrategies(url, page, context, onProgress);
        if (!result.success || !result.data) {
          throw new Error(result.error || 'Extraction failed');
        }

        // Save debug content
        fs.writeFileSync(
          path.resolve('debug_page.txt'),
          `Method: ${result.method}\n\n${result.data.bodyText}`
        );

        onProgress?.({
          type: 'complete',
          message: 'Extraction completed successfully',
          method: result.method,
          timestamp: new Date().toISOString()
        });

        return result.data;
      } finally {
        // Always release the context, even if closing the page throws
        try {
          await page.close();
        } finally {
          await context.close();
        }
      }
    },
    DEFAULT_RETRY_CONFIG,
    onProgress
  );
}

/**
 * 
Extract thumbnail from video element or take full page screenshot
 */

/**
 * Screenshot-based thumbnail extraction (fallback method)
 * Takes a screenshot of the video element or, if no visible video is found,
 * of the page as currently rendered.
 *
 * @param page - Playwright page to capture
 * @returns JPEG screenshot encoded as a `data:image/jpeg;base64,...` URI
 */
async function extractThumbnailScreenshot(page: Page): Promise<string> {
  const videoBounds = await page.evaluate(() => {
    const video = document.querySelector('video');
    if (!video) return null;

    const rect = video.getBoundingClientRect();

    // Clip to the part of the video that is actually inside the viewport.
    // Clamping only x/y while keeping the full width/height would produce a
    // clip that extends past the video (or lies entirely outside the viewport).
    const left = Math.max(0, rect.left);
    const top = Math.max(0, rect.top);
    const right = Math.min(rect.right, window.innerWidth);
    const bottom = Math.min(rect.bottom, window.innerHeight);

    return {
      x: left,
      y: top,
      width: right - left,
      height: bottom - top
    };
  });

  let screenshotBuffer: Buffer;

  // A non-positive width/height means the video has no size or is scrolled out of view.
  if (videoBounds && videoBounds.width > 0 && videoBounds.height > 0) {
    screenshotBuffer = await page.screenshot({
      type: 'jpeg',
      quality: 85,
      clip: videoBounds
    });
  } else {
    console.warn('[Thumbnail] Video element not found or has no size, taking full page screenshot');
    screenshotBuffer = await page.screenshot({
      type: 'jpeg',
      quality: 85
    });
  }

  return `data:image/jpeg;base64,${screenshotBuffer.toString('base64')}`;
}

/**
 * Helper: Fetch image from URL and convert to base64 data URI
 *
 * **Validation Criteria:**
 * - HTTP status must be exactly 200 (not 2xx, only 200)
 * - Content-Type must start with 'image/' (e.g., image/jpeg, image/png, image/webp)
 * - The request is aborted by a 10 second timeout
 *
 * **Failure Scenarios:**
 * - Non-200 status → Returns null, reports status code via progress callback
 * - Invalid content-type → Returns null, reports content-type via progress callback
 * - Timeout → Returns null, reports timeout via progress callback
 * - Network error → Returns null, reports error message via progress callback
 *
 * **Usage in Fallback Chain:**
 * This function is used by `extractThumbnailStealth()` which tries multiple URL sources:
 * 1. Meta tags (og:image, twitter:image)
 * 2. Video poster attribute
 * 3. Instagram data structures (display_url, thumbnail_src)
 * 4.
Screenshot fallback (last resort, used when no URL source yields an image)
 *
 * When this function returns null, extraction continues to the next method.
 *
 * @param imageUrl - The image URL to fetch (expected to be HTTPS; the scheme is not checked here)
 * @param progressCallback - Optional callback for progress reporting
 * @returns Base64 data URI (data:image/*;base64,...) or null if validation fails
 *
 * @example
 * ```typescript
 * const thumbnail = await fetchImageAsBase64(
 *   'https://instagram.com/image.jpg',
 *   (event) => console.log(event.message)
 * );
 *
 * if (thumbnail) {
 *   // thumbnail is a valid base64 data URI
 *   console.log(thumbnail.substring(0, 50)); // "data:image/jpeg;base64,/9j/4AAQSkZJRg..."
 * } else {
 *   // URL validation failed, try next method
 * }
 * ```
 */
async function fetchImageAsBase64(
  imageUrl: string,
  progressCallback?: ProgressCallback
): Promise<string | null> {
  // Create abort controller for timeout
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), 10000); // 10s timeout

  try {
    console.log(`[Thumbnail] Validating URL: ${imageUrl}`);

    const response = await fetch(imageUrl, {
      signal: controller.signal
    });

    // Strict status validation: must be exactly 200
    if (response.status !== 200) {
      console.warn(`[Thumbnail] URL validation failed: HTTP ${response.status} for ${imageUrl}`);
      progressCallback?.({
        type: 'status',
        message: `Thumbnail URL returned HTTP ${response.status}, trying next method...`,
        timestamp: new Date().toISOString()
      });
      return null;
    }

    // Validate content-type
    const contentType = response.headers.get('content-type') || '';
    if (!contentType.startsWith('image/')) {
      console.warn(
        `[Thumbnail] URL validation failed: Invalid content-type '${contentType}' for ${imageUrl}`
      );
      progressCallback?.({
        type: 'status',
        message: `Thumbnail URL returned non-image content (${contentType}), trying next method...`,
        timestamp: new Date().toISOString()
      });
      return null;
    }

    console.log(`[Thumbnail] URL validation successful: ${imageUrl} (${contentType})`);

    // The timeout is still armed here, so a stalled body download is aborted too.
    const arrayBuffer = await response.arrayBuffer();
    const buffer = Buffer.from(arrayBuffer);
    const base64Data = `data:${contentType};base64,${buffer.toString('base64')}`;

    progressCallback?.({
      type: 'status',
      message: 'Thumbnail fetched and validated from URL',
      timestamp: new Date().toISOString()
    });

    return base64Data;
  } catch (e) {
    if (e instanceof Error) {
      if (e.name === 'AbortError') {
        console.error(`[Thumbnail] URL fetch timeout: ${imageUrl}`);
        progressCallback?.({
          type: 'status',
          message: 'Thumbnail URL fetch timeout, trying next method...',
          timestamp: new Date().toISOString()
        });
      } else {
        console.error(`[Thumbnail] Failed to fetch image from ${imageUrl}:`, e.message);
        progressCallback?.({
          type: 'status',
          message: `Thumbnail URL fetch failed (${e.message}), trying next method...`,
          timestamp: new Date().toISOString()
        });
      }
    } else {
      logError('[Thumbnail] Failed to fetch image', e);
    }
    return null;
  } finally {
    // Always disarm the timer, including on early returns and on fetch errors.
    clearTimeout(timeoutId);
  }
}

/**
 * Extract thumbnail from Instagram post using stealth techniques
 *
 * Tries multiple methods in order of stealth. Methods 1-3 only discover an image
 * URL; the image is then downloaded with `fetchImageAsBase64`, so every method
 * yields a base64 data URL:
 * 1. Meta tags (og:image, twitter:image) - Returns: Base64 data URL of the fetched image
 * 2. Video poster attribute - Returns: Base64 data URL of the fetched image
 * 3. Instagram window data structures - Returns: Base64 data URL of the fetched image
 * 4. Screenshot fallback - Returns: Base64 data URL (data:image/jpeg;base64,...)
*
 * @param page - Playwright page instance
 * @param progressCallback - Optional progress callback; receives a 'thumbnail'
 *   event carrying the data URL when a method succeeds
 * @returns Base64 data URL of the thumbnail, or null if all methods fail
 *
 * **Thumbnail Format Guide:**
 * - Methods 1-3: the discovered image URL is downloaded via `fetchImageAsBase64`,
 *   so the returned value is a base64 data URL with the server's content type
 * - Method 4: returns a JPEG screenshot as a base64 data URL
 * - Consumers that need a file must convert the data URL themselves
 */
async function extractThumbnailStealth(
  page: Page,
  progressCallback?: ProgressCallback
): Promise<string | null> {
  console.log('[Thumbnail] Starting stealth extraction');

  // Method 1: Try meta tags (most stealthy)
  try {
    const ogImage = await readAttributeWithoutWaiting(page, 'meta[property="og:image"]', 'content');
    if (ogImage) {
      console.log('[Thumbnail] Found og:image meta tag');
      const thumbnail = await fetchAndReportThumbnail(
        ogImage,
        'Thumbnail extracted from meta tags',
        progressCallback
      );
      if (thumbnail) {
        return thumbnail;
      }
    }

    const twitterImage = await readAttributeWithoutWaiting(
      page,
      'meta[name="twitter:image"]',
      'content'
    );
    if (twitterImage) {
      console.log('[Thumbnail] Found twitter:image meta tag');
      const thumbnail = await fetchAndReportThumbnail(
        twitterImage,
        'Thumbnail extracted from meta tags',
        progressCallback
      );
      if (thumbnail) {
        return thumbnail;
      }
    }
  } catch (e) {
    logError('[Thumbnail] Meta tag method failed', e);
  }

  // Method 2: Try video poster attribute
  try {
    const poster = await readAttributeWithoutWaiting(page, 'video', 'poster');
    if (poster) {
      console.log('[Thumbnail] Found video poster attribute');
      const thumbnail = await fetchAndReportThumbnail(
        poster,
        'Thumbnail extracted from video poster',
        progressCallback
      );
      if (thumbnail) {
        return thumbnail;
      }
    }
  } catch (e) {
    logError('[Thumbnail] Video poster method failed', e);
  }

  // Method 3: Try Instagram window data structures
  try {
    const thumbnailUrl: unknown = await page.evaluate(() => {
      // Check for Instagram's internal data structures
      const data = (window as any).__additionalDataLoaded;
      if (data) {
        // Navigate through Instagram's data structure
        for (const key in data) {
          const item = data[key];
          if (item?.graphql?.shortcode_media?.display_url) {
            return item.graphql.shortcode_media.display_url;
          }
          if (item?.graphql?.shortcode_media?.thumbnail_src) {
            return item.graphql.shortcode_media.thumbnail_src;
          }
        }
      }
      return null;
    });

    // The page data is untyped; only a non-empty string is usable as a URL.
    if (typeof thumbnailUrl === 'string' && thumbnailUrl) {
      console.log('[Thumbnail] Found thumbnail in Instagram data structures');
      const thumbnail = await fetchAndReportThumbnail(
        thumbnailUrl,
        'Thumbnail extracted from Instagram data',
        progressCallback
      );
      if (thumbnail) {
        return thumbnail;
      }
    }
  } catch (e) {
    logError('[Thumbnail] Instagram data method failed', e);
  }

  // Method 4: Screenshot fallback (existing method)
  console.log('[Thumbnail] Falling back to screenshot method');
  let screenshotThumbnail: string | null = null;
  try {
    screenshotThumbnail = await extractThumbnailScreenshot(page);
  } catch (e) {
    // Honour the documented contract: a missing thumbnail is reported as null
    // rather than discarding text the caller has already extracted.
    logError('[Thumbnail] Screenshot method failed', e);
    return null;
  }

  if (screenshotThumbnail && progressCallback) {
    progressCallback({
      type: 'thumbnail',
      message: 'Thumbnail extracted via screenshot',
      data: { thumbnail: screenshotThumbnail },
      timestamp: new Date().toISOString()
    });
  }

  return screenshotThumbnail;
}

/**
 * Read an attribute of the first element matching `selector` as the DOM stands
 * right now. Unlike `page.getAttribute`, this does not wait for the element to
 * appear, so a missing element yields null immediately instead of blocking
 * until the page's default timeout expires.
 */
async function readAttributeWithoutWaiting(
  page: Page,
  selector: string,
  attribute: string
): Promise<string | null> {
  return page.evaluate(
    (query) => document.querySelector(query.selector)?.getAttribute(query.attribute) ?? null,
    { selector, attribute }
  );
}

/**
 * Download `imageUrl` via `fetchImageAsBase64` and, on success, emit a
 * 'thumbnail' progress event with `successMessage`. Returns the data URL or null.
 */
async function fetchAndReportThumbnail(
  imageUrl: string,
  successMessage: string,
  progressCallback?: ProgressCallback
): Promise<string | null> {
  const thumbnail = await fetchImageAsBase64(imageUrl, progressCallback);
  if (thumbnail && progressCallback) {
    progressCallback({
      type: 'thumbnail',
      message: successMessage,
      data: { thumbnail },
      timestamp: new Date().toISOString()
    });
  }
  return thumbnail;
}