// insta-recipe/src/lib/server/extraction.ts

import { createBrowserContext } from './browser';
import { logError } from './utils/logger';
import fs from 'fs';
import path from 'path';
import type { Page, BrowserContext } from 'playwright';

/**
 * Payload returned by the extraction strategies in this module.
 */
export interface ExtractedContent {
	/** Caption text of the post; the extractors visible in this file run it through `cleanText`. */
	bodyText: string;
	/** Thumbnail value from the thumbnail extractor, or `null` when none is available. */
	thumbnail: string | null;
}

/**
 * Identifier of an extraction strategy.
 * Human-readable labels for each value live in `getMethodDisplayName`.
 */
export type ExtractionMethod =
	| 'embedded-json'
	| 'internal-state'
	| 'html-section'
	| 'dom-selector'
	| 'graphql-api'
	| 'graphql-intercept'
	| 'legacy';

/**
 * A `<span>` considered as the post caption while scoring inside
 * `extractFromHTMLSection` (used only as a type within the in-page callback).
 */
type CaptionCandidate = {
	/** The DOM element that was scored. */
	element: Element;
	/** The element's `textContent` (original casing). */
	text: string;
	/** Heuristic score; the highest positive score wins. */
	score: number;
	/** Raw `innerHTML`, kept so `<br>` tags can later be turned into newlines. */
	innerHTML: string;
	/** Number of `<br>` tags found in `innerHTML`. */
	brCount: number;
};

/**
 * Kinds of progress notifications that can be sent to a `ProgressCallback`.
 * Within this part of the file only `'retry'` and `'error'` are emitted (by `withRetry`).
 */
export type ProgressEventType =
	| 'status'
	| 'method'
	| 'retry'
	| 'error'
	| 'thumbnail'
	| 'complete'
	| 'model_loading';

/**
 * A single progress notification delivered to a `ProgressCallback`.
 */
export interface ProgressEvent {
	/** Category of the notification. */
	type: ProgressEventType;
	/** Human-readable description. */
	message: string;
	/** Extraction strategy the event relates to, when applicable. */
	method?: ExtractionMethod;
	/** 1-based number of the attempt that just failed (set by `withRetry` on `'retry'` events). */
	attemptNumber?: number;
	/** Total number of attempts allowed (set by `withRetry` on `'retry'` events). */
	maxAttempts?: number;
	/** Free-form payload; its shape is not established by the code visible here. */
	data?: any;
	/** Event time; `withRetry` fills it with `new Date().toISOString()`. */
	timestamp?: string;
}

/** Listener invoked synchronously with each progress event; its return value is ignored. */
export type ProgressCallback = (event: ProgressEvent) => void;

/**
 * Outcome record for one extraction strategy.
 * NOTE(review): not referenced in the portion of the file visible here — confirm usage further down.
 */
interface ExtractionResult {
	/** Whether the strategy produced content. */
	success: boolean;
	/** Strategy that produced this result. */
	method?: ExtractionMethod;
	/** Extracted content (presumably present when `success` is true — verify against callers). */
	data?: ExtractedContent;
	/** Failure description (presumably present when `success` is false — verify against callers). */
	error?: string;
}

/**
 * Partial shape of the `window._sharedData` payload parsed in `extractFromEmbeddedJSON`.
 * Every level is optional because the payload is untrusted page data.
 * In the code visible here only `edge_media_to_caption` is actually read
 * (by `parseInstagramData`); the other fields are declared but unused in this chunk.
 */
interface InstagramEmbeddedData {
	entry_data?: {
		PostPage?: Array<{
			graphql?: {
				shortcode_media?: {
					// Caption fragments; `parseInstagramData` joins every edge's text with newlines.
					edge_media_to_caption?: {
						edges?: Array<{ node: { text: string } }>;
					};
					display_url?: string;
					video_url?: string;
					owner?: {
						username: string;
						profile_pic_url: string;
					};
				};
			};
		}>;
	};
}

/**
 * Tuning knobs for `withRetry`.
 */
interface RetryConfig {
	/** Total number of attempts, including the first one. */
	maxAttempts: number;
	/** Delay in milliseconds before the second attempt. */
	initialDelayMs: number;
	/** Upper bound in milliseconds for the delay between attempts. */
	maxDelayMs: number;
	/** Factor the delay is multiplied by after each wait. */
	backoffMultiplier: number;
}

/**
 * Retry settings used by `withRetry` when the caller passes none:
 * three attempts, waiting 1000 ms and then 2000 ms between them (capped at 10000 ms).
 */
const DEFAULT_RETRY_CONFIG: RetryConfig = {
	maxAttempts: 3,
	initialDelayMs: 1000,
	maxDelayMs: 10000,
	backoffMultiplier: 2
};

/**
 * Locate the stored authentication file.
 *
 * Candidate locations are probed in priority order: the Docker path first,
 * then the path relative to the current working directory.
 *
 * @returns The first candidate that exists on disk, or `undefined` when neither does.
 */
function resolveAuthPath(): string | undefined {
	const candidates = ['/app/secrets/auth.json', './secrets/auth.json'];
	return candidates.find((candidate) => fs.existsSync(candidate));
}

/**
 * Pause for the given number of milliseconds (used between retry attempts).
 *
 * @param ms - Duration of the pause in milliseconds.
 */
async function sleep(ms: number): Promise<void> {
	await new Promise<void>((wake) => {
		setTimeout(wake, ms);
	});
}

/**
 * Decide whether a failure is permanent, i.e. retrying cannot help.
 *
 * A failure is permanent when it is an `Error` whose message mentions an
 * authentication problem ("authentication", "login required") or an invalid
 * URL ("invalid url"). Matching is case-insensitive so that messages such as
 * Node's `TypeError: Invalid URL` or "Login required" are recognised too.
 *
 * @param error - The value caught from a failed attempt.
 * @returns `true` when the error must not be retried; `false` otherwise,
 *          including for thrown values that are not `Error` instances.
 */
function isNonRetriableError(error: unknown): boolean {
	if (!(error instanceof Error)) {
		return false;
	}

	const nonRetriableMarkers = ['authentication', 'login required', 'invalid url'];
	// Normalise casing once; the markers above are all lower-case.
	const message = error.message.toLowerCase();
	return nonRetriableMarkers.some((marker) => message.includes(marker));
}

/**
 * Map an extraction method identifier to the label shown to users.
 *
 * @param method - Strategy identifier.
 * @returns The display label for that strategy.
 */
function getMethodDisplayName(method: ExtractionMethod): string {
	switch (method) {
		case 'embedded-json':
			return 'Embedded JSON';
		case 'internal-state':
			return 'Internal State';
		case 'html-section':
			return 'HTML Section';
		case 'dom-selector':
			return 'DOM Selector';
		case 'graphql-api':
			return 'GraphQL API';
		case 'graphql-intercept':
			return 'GraphQL Intercept';
		case 'legacy':
			return 'Legacy Parser';
	}
}

/**
 * Run `fn`, retrying failed attempts with exponential backoff.
 *
 * Errors classified by `isNonRetriableError` are reported through `onProgress`
 * (type `'error'`) and rethrown immediately. Every other failure, except the
 * one on the final attempt, is logged, reported as a `'retry'` event and
 * followed by a pause that grows by `backoffMultiplier` up to `maxDelayMs`.
 *
 * @param fn - Operation to attempt.
 * @param config - Attempt count and delay settings.
 * @param onProgress - Optional listener for retry/error notifications.
 * @returns Whatever `fn` resolves with on its first successful attempt.
 * @throws The last failure once all attempts are used up, or a generic error
 *         when no attempt produced one (e.g. `maxAttempts` is below 1).
 */
async function withRetry<T>(
	fn: () => Promise<T>,
	config: RetryConfig = DEFAULT_RETRY_CONFIG,
	onProgress?: ProgressCallback
): Promise<T> {
	let failure: Error | null = null;
	let waitMs = config.initialDelayMs;
	let attempt = 1;

	while (attempt <= config.maxAttempts) {
		try {
			return await fn();
		} catch (error) {
			failure = error as Error;

			// Permanent failures are surfaced right away.
			if (isNonRetriableError(error)) {
				onProgress?.({
					type: 'error',
					message: `Non-retriable error: ${failure.message}`,
					timestamp: new Date().toISOString()
				});
				throw error;
			}

			// Only wait when another attempt will follow.
			const hasAttemptsLeft = attempt < config.maxAttempts;
			if (hasAttemptsLeft) {
				const message = `Attempt ${attempt}/${config.maxAttempts} failed. Retrying in ${waitMs}ms...`;
				logError(`[Retry] ${message}`, error);

				onProgress?.({
					type: 'retry',
					message,
					attemptNumber: attempt,
					maxAttempts: config.maxAttempts,
					timestamp: new Date().toISOString()
				});

				await sleep(waitMs);
				waitMs = Math.min(waitMs * config.backoffMultiplier, config.maxDelayMs);
			}
		}

		attempt += 1;
	}

	if (failure) {
		throw failure;
	}
	throw new Error('Max retry attempts exceeded');
}

/**
 * Pull the post shortcode out of an Instagram URL.
 *
 * Recognises the `/p/`, `/reel/`, `/reels/` and `/tv/` path forms.
 *
 * @param url - Full or partial Instagram URL.
 * @returns The shortcode, or `undefined` when the URL has none of those forms.
 */
function extractShortcode(url: string): string | undefined {
	const shortcodePattern = /\/(p|reel|reels|tv)\/([A-Za-z0-9_-]+)/;
	const found = shortcodePattern.exec(url);
	if (found === null) {
		return undefined;
	}
	return found[2];
}

/**
 * Recipe keywords used for caption scoring (Italian and English terms).
 * NOTE(review): `extractFromHTMLSection` carries its own copy of this list inside
 * its in-page callback; this constant is not referenced in the code visible here.
 */
const RECIPE_KEYWORDS = [
	'ingredienti',
	'procedimento',
	'preparazione',
	'ricetta',
	'recipe',
	'instructions'
];

/**
 * Timeout configuration constants (in milliseconds).
 * The per-key notes describe the usages visible in this part of the file.
 */
const TIMEOUTS = {
	CONTENT_LOAD: 1500, // fixed pause before looking for the "more" toggle (HTML-section path)
	MORE_BUTTON_VISIBILITY: 1000, // passed to isVisible() in the HTML-section path
	CAPTION_EXPANSION: 3000, // pause after clicking "more" (HTML-section path)
	MORE_BUTTON_VISIBILITY_DOM: 500, // passed to isVisible() in the DOM path
	MORE_BUTTON_CLICK: 800, // pause after clicking "more" (DOM path)
	PAGE_LOAD: 10000, // waitForLoadState limit
	NETWORK_SETTLE: 2000, // fixed pause after the load-state wait
	ARTICLE_SELECTOR: 5000, // limit when waiting for an <article> element
	GRAPHQL_WAIT: 1000, // window during which GraphQL responses are intercepted
	PAGE_NAVIGATION: 30000, // not used in the code visible here
	ANTI_DETECTION_MIN: 1000, // not used in the code visible here
	ANTI_DETECTION_MAX: 3000 // not used in the code visible here
} as const;

/**
 * Best-effort expansion of a truncated caption for the HTML-section strategy.
 *
 * After a short pause, the first `article` / `main` / `[role="main"]` element
 * is searched for a "more" toggle using several patterns in turn. The first
 * visible match is clicked and the function then pauses for the expansion.
 * Every failure is logged and swallowed; nothing is thrown to the caller.
 *
 * @param page - Page already showing the target post.
 */
async function tryExpandCaptionInHTMLSection(page: Page): Promise<void> {
	console.log('[Extractor] Looking for "more" button in primary post container...');
	try {
		await page.waitForTimeout(TIMEOUTS.CONTENT_LOAD);

		const postRoot = page.locator('article, main, [role="main"]').first();
		const postRootFound = (await postRoot.count()) > 0;
		if (!postRootFound) {
			console.log('[Extractor] No main container found');
			return;
		}

		console.log('[Extractor] Found main post container, searching for "more" button...');

		// Patterns are tried in order, from most to least specific.
		const toggleSpecs = [
			{ selector: 'span', textPattern: /\.\.\.\s*more/i, desc: "span with '...more'" },
			{ selector: 'span', textPattern: /…\s*more/i, desc: "span with '… more'" },
			{ selector: 'div[role="button"]', textPattern: /more/i, desc: "button with 'more'" },
			{ selector: 'span[role="button"]', textPattern: /more/i, desc: "span button with 'more'" }
		];

		for (const { selector, textPattern, desc } of toggleSpecs) {
			const matches = postRoot.locator(selector).filter({ hasText: textPattern });
			const matchCount = await matches.count();
			console.log(`[Extractor] Checking ${desc}: found ${matchCount}`);

			if (matchCount === 0) {
				continue;
			}

			const toggle = matches.first();
			try {
				const visible = await toggle.isVisible({ timeout: TIMEOUTS.MORE_BUTTON_VISIBILITY });
				if (!visible) {
					continue;
				}

				const toggleText = await toggle.textContent();
				console.log(`[Extractor] Found visible "more": "${toggleText}"`);
				await toggle.click();
				console.log('[Extractor] Clicked "more" - waiting for expansion...');
				await page.waitForTimeout(TIMEOUTS.CAPTION_EXPANSION);
				console.log('[Extractor] Caption expansion complete');
				// One successful click is enough.
				break;
			} catch (e) {
				console.log(`[Extractor] ${desc} not clickable: ${e}`);
			}
		}

		console.log('[Extractor] Finished "more" button expansion attempt');
	} catch (e) {
		console.log(`[Extractor] Error while trying to expand caption: ${e}`);
	}
}

/**
 * Best-effort expansion of a truncated caption for the DOM strategy.
 *
 * Runs up to three rounds. In each round the selectors are tried in order and
 * the first visible match is clicked, followed by a short pause. The function
 * stops as soon as a round clicks nothing. Selector errors are ignored and the
 * next selector is tried.
 *
 * @param page - Page already showing the target post.
 */
async function tryExpandCaptionInDOM(page: Page): Promise<void> {
	const toggleSelectors = [
		'article button:has-text("more")',
		'article button:has-text("More")',
		'article button:has-text("… more")',
		'article span[role="button"]:has-text("more")',
		'article [role="button"]:has-text("more")',
		'article div[role="button"]:has-text("more")',
		'xpath=//article//span[contains(text(), "more")]/..',
		'xpath=//article//button[contains(., "more")]'
	];
	const maxRounds = 3;

	// Clicks the first visible toggle; resolves to false when none could be clicked.
	const clickFirstVisibleToggle = async (): Promise<boolean> => {
		for (const selector of toggleSelectors) {
			try {
				const toggle = page.locator(selector).first();
				const visible = await toggle.isVisible({ timeout: TIMEOUTS.MORE_BUTTON_VISIBILITY_DOM });
				if (!visible) {
					continue;
				}
				await toggle.click();
				await page.waitForTimeout(TIMEOUTS.MORE_BUTTON_CLICK);
				console.log(`[Extractor] Clicked "more" button with selector: ${selector}`);
				return true;
			} catch {
				// Selector failed; move on to the next one.
			}
		}
		return false;
	};

	for (let round = 0; round < maxRounds; round++) {
		const clicked = await clickFirstVisibleToggle();
		if (!clicked) {
			return;
		}
	}
}

/**
 * Normalise caption text: convert `<br>` to newlines, strip remaining HTML
 * tags, decode common HTML entities, drop Instagram UI phrases, tidy
 * whitespace and remove a trailing run of hashtags.
 *
 * Entities are decoded in a single pass so that an escaped entity such as
 * `&amp;lt;` yields the literal text `&lt;` instead of being decoded twice.
 * Supported entities: `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;`, `&nbsp;`
 * and the numeric apostrophe (`&#39;`, `&#039;`).
 *
 * @param text - Raw caption text or HTML fragment.
 * @returns The cleaned, trimmed text.
 */
export function cleanText(text: string): string {
	// Convert <br> tags to newlines first so line breaks survive tag stripping.
	let cleaned = text.replace(/<br\s*\/?>/gi, '\n');

	// Strip all other HTML tags while keeping the text content.
	cleaned = cleaned.replace(/<[^>]+>/g, '');

	// Decode HTML entities in one pass (no output is ever re-scanned).
	const namedEntities: Record<string, string> = {
		amp: '&',
		lt: '<',
		gt: '>',
		quot: '"',
		apos: "'",
		nbsp: ' '
	};
	cleaned = cleaned.replace(
		/&(amp|lt|gt|quot|apos|nbsp|#0*39);/g,
		// The only non-named alternative is the numeric apostrophe.
		(_entity, name: string) => namedEntities[name] ?? "'"
	);

	// Remove common UI text patterns.
	const uiPatterns = [
		/More posts from.+/gi,
		/View all \d+ comments/gi,
		/Add a comment\.\.\./gi,
		/Liked by.+?(?=\n|$)/gi
	];
	for (const pattern of uiPatterns) {
		cleaned = cleaned.replace(pattern, '');
	}

	// Trim spaces/tabs at both ends of every line, preserving the line breaks.
	cleaned = cleaned.replace(/[ \t]+$/gm, '');
	cleaned = cleaned.replace(/^[ \t]+/gm, '');

	// Collapse runs of three or more line breaks into a single blank line.
	cleaned = cleaned.replace(/\n\s*\n\s*\n+/g, '\n\n');

	// Remove spaces around newlines.
	cleaned = cleaned.replace(/ *\n */g, '\n');

	// Normalise multiple spaces to a single space within lines.
	cleaned = cleaned.replace(/ {2,}/g, ' ');

	// Remove a trailing block of hashtags (Latin, Latin Extended and Cyrillic letters).
	cleaned = cleaned.replace(/(#[\w\u00C0-\u024F\u1E00-\u1EFF\u0400-\u04FF]+\s*)+$/gi, '');

	return cleaned.trim();
}

/**
 * Decode the body of a JSON string literal (the text between the quotes).
 *
 * Uses `JSON.parse` so that every JSON escape (`\n`, `\t`, `\/`, `\\`,
 * `\uXXXX`, …) is handled correctly. If the body is not valid JSON string
 * content, falls back to a manual replacement of the most common escapes.
 */
function decodeEscapedCaption(raw: string): string {
	try {
		const decoded: unknown = JSON.parse(`"${raw}"`);
		if (typeof decoded === 'string') {
			return decoded;
		}
	} catch {
		// Not strict JSON string content; use the manual fallback below.
	}

	return raw
		.replace(/\\n/g, '\n')
		.replace(/\\"/g, '"')
		.replace(/\\u([0-9a-fA-F]{4})/g, (_, code) => String.fromCharCode(parseInt(code, 16)))
		.replace(/\\\\/g, '\\');
}

/**
 * Strategy 1: Extract from embedded JSON data in script tags.
 *
 * Every `<script>` element's text is inspected, in document order, for:
 *  1. a `window._sharedData = {...};` assignment,
 *  2. a `window.__additionalDataLoaded(..., {...});` call,
 *  3. a large (> 10000 chars) script mentioning `"caption"` or `"text"`, which
 *     is parsed as JSON or, if that fails, searched with caption regexes.
 * In case 3 a caption is accepted only when it is longer than 130 characters.
 * On the first hit the thumbnail is fetched and the result returned.
 *
 * @param page - Page already showing the target post.
 * @param progressCallback - Forwarded to the thumbnail extractor.
 * @returns The extracted content, or `null` when nothing usable was found or
 *          an unexpected error occurred (the error is logged, not thrown).
 */
async function extractFromEmbeddedJSON(
	page: Page,
	progressCallback?: ProgressCallback
): Promise<ExtractedContent | null> {
	try {
		// Collect the text of every script tag plus a short summary for logging.
		const scriptInfo = await page.evaluate(() => {
			const scripts = Array.from(document.querySelectorAll('script'));
			const scriptData = scripts.map((script) => ({
				type: script.getAttribute('type') || 'no-type',
				hasContent: !!script.textContent,
				length: script.textContent?.length || 0,
				preview: script.textContent?.substring(0, 100) || ''
			}));
			// Runs in the browser context, so this goes to the page's console.
			console.log(`[Extractor] Found ${scripts.length} script tags`);
			return {
				contents: scripts.map((script) => script.textContent || ''),
				info: scriptData
			};
		});

		console.log(`[Extractor] Script tags summary:`, scriptInfo.info);

		// Look for embedded data patterns
		for (let i = 0; i < scriptInfo.contents.length; i++) {
			const content = scriptInfo.contents[i];

			// Try window._sharedData pattern
			const sharedDataMatch = content.match(/window\._sharedData\s*=\s*(\{.+?\});/s);
			if (sharedDataMatch) {
				console.log(`[Extractor] Found _sharedData in script ${i}`);
				try {
					const data: InstagramEmbeddedData = JSON.parse(sharedDataMatch[1]);
					const result = parseInstagramData(data);
					if (result) {
						const thumbnail = await extractThumbnailStealth(page, progressCallback);
						return { ...result, thumbnail };
					}
				} catch (e) {
					logError('[Extractor] Failed to parse _sharedData', e);
				}
			}

			// Try __additionalDataLoaded pattern
			const additionalDataMatch = content.match(
				/window\.__additionalDataLoaded\([^,]+,\s*(\{.+?\})\);/s
			);
			if (additionalDataMatch) {
				console.log(`[Extractor] Found __additionalDataLoaded in script ${i}`);
				try {
					const data = JSON.parse(additionalDataMatch[1]);
					const result = parseInstagramData(data);
					if (result) {
						const thumbnail = await extractThumbnailStealth(page, progressCallback);
						return { ...result, thumbnail };
					}
				} catch (e) {
					logError('[Extractor] Failed to parse __additionalDataLoaded', e);
				}
			}

			// Try to find any large JSON with caption data (new Instagram format)
			if ((content.includes('"caption"') || content.includes('"text"')) && content.length > 10000) {
				console.log(
					`[Extractor] Attempting to extract from large JSON in script ${i} (length: ${content.length})`
				);
				try {
					// Try to parse as direct JSON
					const jsonData = JSON.parse(content);

					// Try deep search first
					const deepResult = deepSearchForCaption(jsonData);
					if (deepResult && deepResult.bodyText && deepResult.bodyText.length > 130) {
						console.log(
							`[Extractor] Deep search in JSON found caption: ${deepResult.bodyText.length} chars`
						);
						const thumbnail = await extractThumbnailStealth(page, progressCallback);
						return { ...deepResult, thumbnail };
					}

					// Try standard parsing
					const result = parseInstagramData(jsonData);
					if (result && result.bodyText && result.bodyText.length > 130) {
						console.log(
							`[Extractor] Successfully extracted from JSON, text length: ${result.bodyText.length}`
						);
						const thumbnail = await extractThumbnailStealth(page, progressCallback);
						return { ...result, thumbnail };
					}
				} catch {
					// Not direct JSON or parsing failed, try to find caption fields with regex
					console.log(`[Extractor] JSON parse failed, trying regex extraction...`);
					// Each pattern captures the body of a JSON string literal (escapes included).
					const patterns = [
						/"caption"\s*:\s*\{\s*"text"\s*:\s*"([^"\\]*(\\.[^"\\]*)*)"/, // caption.text
						/"text"\s*:\s*"([^"\\]*(\\.[^"\\]*)*)"\s*,?\s*"pk"/, // text field near pk
						/"edge_media_to_caption"\s*:\s*\{\s*"edges"\s*:\s*\[\s*\{\s*"node"\s*:\s*\{\s*"text"\s*:\s*"([^"\\]*(\\.[^"\\]*)*)"/
					];

					for (const pattern of patterns) {
						const captionMatch = content.match(pattern);
						if (captionMatch) {
							const rawText = captionMatch[1] || '';
							const captionText = decodeEscapedCaption(rawText);

							if (captionText.length > 130) {
								console.log(
									`[Extractor] Extracted caption from regex pattern, length: ${captionText.length}`
								);
								const thumbnail = await extractThumbnailStealth(page, progressCallback);
								return { bodyText: cleanText(captionText), thumbnail };
							}
						}
					}
				}
			}
		}

		return null;
	} catch (error) {
		logError('[Extractor] Failed to extract from embedded JSON', error);
		return null;
	}
}

/**
 * Read the caption out of a parsed Instagram data object.
 *
 * The `entry_data.PostPage[0].graphql.shortcode_media` layout is preferred and
 * all of its caption edges are joined with newlines. When that node is absent,
 * `data.items` or `data.data.shortcode_media` is handed to
 * `extractFromAlternativeStructure` instead.
 *
 * @param data - Parsed JSON of unknown shape.
 * @returns The cleaned caption, or `null` when none is found or reading fails
 *          (failures are logged).
 */
function parseInstagramData(data: any): Omit<ExtractedContent, 'thumbnail'> | null {
	try {
		const media = data?.entry_data?.PostPage?.[0]?.graphql?.shortcode_media;

		if (media) {
			const edges = media.edge_media_to_caption?.edges || [];
			const joinedCaption = edges.map((captionEdge: any) => captionEdge.node.text).join('\n');
			return joinedCaption ? { bodyText: cleanText(joinedCaption) } : null;
		}

		// Primary layout missing: fall back to the alternative layouts.
		const fallbackNode = data?.items || data?.data?.shortcode_media;
		return fallbackNode ? extractFromAlternativeStructure(fallbackNode) : null;
	} catch (error) {
		logError('[Extractor] Failed to parse Instagram data structure', error);
		return null;
	}
}

/**
 * Read the caption from the alternative Instagram layouts.
 *
 * Accepts either a single media object or an array of them (only the first
 * entry is used). The caption is taken from `caption.text` or, failing that,
 * from the first `edge_media_to_caption` edge.
 *
 * @param items - Media object or array of media objects.
 * @returns The cleaned caption, or `null` when none is found or reading fails
 *          (failures are logged).
 */
function extractFromAlternativeStructure(items: any): Omit<ExtractedContent, 'thumbnail'> | null {
	try {
		const entry = Array.isArray(items) ? items[0] : items;
		const captionText = entry?.caption?.text || entry?.edge_media_to_caption?.edges?.[0]?.node?.text;

		if (!captionText) {
			return null;
		}

		return { bodyText: cleanText(captionText) };
	} catch (error) {
		logError('[Extractor] Failed to parse alternative structure', error);
		return null;
	}
}

/**
 * Strategy 2.5: Extract the caption by scoring `<span>` elements for recipe-like content.
 *
 * Instagram uses obfuscated class names, so the caption span is identified heuristically.
 * Every span with at least 30 characters of text is scored on:
 * - number of `<br>` tags (100 points each — the dominant signal),
 * - presence of a recipe keyword (+500),
 * - number of `<a>` tags when there are more than two (10 points each),
 * - text length (up to +200),
 * - a `line-height` inline style (+30),
 * with penalties for text that starts like UI chrome (-500) and for short
 * audio-credit-like text (-200). The span with the highest positive score wins.
 *
 * @param page - Page that should already show the target post.
 * @param progressCallback - Forwarded to the thumbnail extractor.
 * @param targetUrl - When given, its shortcode must match the page's current URL,
 *                    otherwise the function returns `null` without extracting.
 * @returns The cleaned caption (from the winning span's `innerHTML`) plus thumbnail,
 *          or `null` when no candidate is found or an error occurs (errors are logged).
 */
export async function extractFromHTMLSection(
	page: Page,
	progressCallback?: ProgressCallback,
	targetUrl?: string
): Promise<ExtractedContent | null> {
	try {
		console.log('[Extractor] Waiting for page content to load...');

		// Validate we're on the correct page by comparing shortcodes
		const currentUrl = page.url();
		const targetShortcode = targetUrl ? extractShortcode(targetUrl) : null;
		const currentShortcode = extractShortcode(currentUrl);

		console.log(`[Extractor] Current page URL: ${currentUrl}`);
		console.log(
			`[Extractor] Target shortcode: ${targetShortcode}, Current shortcode: ${currentShortcode}`
		);

		// Only enforced when a target shortcode could be derived from targetUrl
		if (targetShortcode && currentShortcode !== targetShortcode) {
			console.log(`[Extractor] URL mismatch: expected ${targetShortcode}, got ${currentShortcode}`);
			return null;
		}

		console.log(`[Extractor] Confirmed on correct post: ${currentShortcode}`);

		// Wait for DOMContentLoaded, then pause for a fixed interval
		await page.waitForLoadState('domcontentloaded', { timeout: TIMEOUTS.PAGE_LOAD });
		await page.waitForTimeout(TIMEOUTS.NETWORK_SETTLE);

		// Try to expand truncated caption by clicking "more" button
		// STRATEGY: Since we're already on the correct page (URL validated above),
		// the FIRST article/main post container should be our target post.
		await tryExpandCaptionInHTMLSection(page);

		console.log('[Extractor] Extracting caption using intelligent span detection...');

		// Everything inside this callback runs in the browser page, not in Node:
		// it cannot see module-level values, and its console.log calls go to the page console.
		const result = await page.evaluate((shortcode) => {
			// Strategy: Find the caption span that belongs to the correct post
			// Instagram loads multiple posts, so we need to find the span associated
			// with our target shortcode

			// Local copy of the keyword list (module constants are not available in the page)
			const recipeKeywords = [
				'ingredienti',
				'procedimento',
				'preparazione',
				'ricetta',
				'recipe',
				'instructions'
			];

			// First, try to find links pointing to our target post
			// NOTE(review): when `shortcode` is undefined this selector looks for "/undefined"
			// and normally matches nothing, which triggers the whole-document fallback below.
			const postLinks = document.querySelectorAll(`a[href*="/${shortcode}"]`);
			console.log(`[Extractor] Found ${postLinks.length} links to target post ${shortcode}`);

			// If we found links to the post, search for spans within those link ancestors
			const searchRoots: Element[] = [];
			if (postLinks.length > 0) {
				postLinks.forEach((link) => {
					// Get the article or section container for this post
					let container =
						link.closest('article') || link.closest('section') || link.closest('[role="main"]');
					// De-duplicate: several links usually share one container
					if (container && !searchRoots.includes(container)) {
						searchRoots.push(container);
						console.log(`[Extractor] Found container for target post`);
					}
				});
			}

			// If no specific containers found, search the whole document (fallback)
			if (searchRoots.length === 0) {
				console.log(`[Extractor] No specific container found, searching whole document`);
				searchRoots.push(document.body);
			}

			// Gather every span under the chosen roots (nested roots may yield duplicates)
			const spans: HTMLElement[] = [];
			searchRoots.forEach((root) => {
				root.querySelectorAll('span').forEach((span) => spans.push(span as HTMLElement));
			});

			console.log(`[Extractor] Searching ${spans.length} spans for recipe content`);

			let bestCandidate: CaptionCandidate | null = null;

			// Search all spans for the best caption candidate
			// PRIMARY CRITERIA: Most <br> tags (recipe formatting indicator)
			spans.forEach((span, spanIdx) => {
				// Lower-cased so keyword and prefix checks are case-insensitive
				const text = (span.textContent || '').toLowerCase();
				const innerHTML = span.innerHTML || '';

				// Skip empty or very short spans
				if (text.length < 30) return;

				// Count <br> tags - this is the MOST reliable indicator for recipes
				const brCount = (innerHTML.match(/<br\s*\/?>/gi) || []).length;

				// No minimum br count - take what we can get

				// Calculate a score based on recipe characteristics
				let score = 0;

				// <br> tags are the PRIMARY signal
				score += brCount * 100; // Massive weight for line breaks

				// Check for recipe keywords (strong indicator)
				const hasKeywords = recipeKeywords.some((keyword) => text.includes(keyword));
				if (hasKeywords) {
					score += 500; // Huge boost for recipe keywords
				}

				// Count <a> tags - captions have hashtags/mentions
				const linkCount = span.querySelectorAll('a').length;
				if (linkCount > 2) {
					score += linkCount * 10;
				}

				// Text length (longer is better for recipes), capped at 200 points
				score += Math.min(text.length / 5, 200);

				// Check for line-height style (caption formatting)
				const style = span.getAttribute('style') || '';
				if (style.includes('line-height')) {
					score += 30;
				}

				// Penalize UI elements (text that starts with a known UI phrase)
				if (text.match(/^(follow|following|liked by|view all|more posts|comments)/i)) {
					score -= 500;
				}

				// Penalize audio/music credits (short text containing a middle dot or known track names)
				if (text.match(/·|papaoutai|afro soul/i) && text.length < 100) {
					score -= 200;
				}

				// Update best candidate; ties keep the earlier span
				if (score > 0 && (!bestCandidate || score > bestCandidate.score)) {
					console.log(
						`[Extractor] New best: score=${score}, len=${text.length}, br=${brCount}, links=${linkCount}, preview="${text.substring(0, 80)}..."`
					);
					bestCandidate = {
						element: span,
						text: span.textContent || '',
						score: score,
						innerHTML: innerHTML,
						brCount: brCount
					};
				}
			});

			if (!bestCandidate) {
				return {
					success: false,
					error: 'No suitable caption span found',
					text: ''
				};
			}

			// Re-bind with an explicit type annotation: the assignment above happens inside a
			// closure, which the compiler's flow analysis does not track (safe after the null guard)
			const candidate: CaptionCandidate = bestCandidate;

			console.log(
				`[Extractor] Final caption candidate: score=${candidate.score}, length=${candidate.text.length}`
			);

			// Extract text from the best candidate
			// Use innerHTML to preserve <br> tags, which will be converted to newlines in cleanText
			let captionText = candidate.innerHTML;

			// Only serializable fields are returned; the DOM element itself stays in the page
			return {
				success: true,
				text: captionText,
				score: candidate.score,
				length: captionText.length,
				htmlPreview: candidate.innerHTML.substring(0, 500)
			};
		}, currentShortcode);

		console.log(`[Extractor] HTML Section result:`, {
			success: result.success,
			textLength: result.length,
			score: result.score
		});

		if (result.htmlPreview) {
			console.log('[Extractor] HTML preview (first 500 chars):');
			console.log(result.htmlPreview);
		}

		if (!result.success) {
			console.log(`[Extractor] ${result.error}`);
			return null;
		}

		const captionText = result.text;

		if (!captionText || captionText.length === 0) {
			console.log('[Extractor] No text extracted from HTML section');
			return null;
		}

		const thumbnail = await extractThumbnailStealth(page, progressCallback);

		// cleanText turns the <br> tags into newlines and strips the remaining markup
		return {
			bodyText: cleanText(captionText),
			thumbnail
		};
	} catch (error) {
		logError('[Extractor] Failed to extract from HTML section', error);
		return null;
	}
}

/**
 * DOM-selector strategy: extract the caption from the rendered page.
 *
 * Steps:
 *  1. Wait (best effort) for network idle and for an `<article>` element.
 *  2. Listen for GraphQL / `api/v1` responses for a short window; if one yields
 *     a caption longer than 130 characters, return it. The listener is always
 *     detached afterwards so repeated calls do not pile up handlers on the page.
 *  3. Otherwise click "more" toggles and pick the longest text found among
 *     article elements, data attributes and hidden elements, extending it with
 *     parent/sibling text when it looks truncated.
 *  4. Fall back to the `og:description` meta tag with its like/comment prefix removed.
 *
 * @param page - Page already showing the target post.
 * @param progressCallback - Forwarded to the thumbnail extractor.
 * @returns The cleaned caption plus thumbnail, or `null` when nothing was
 *          found or an error occurred (errors are logged, not thrown).
 */
export async function extractFromDOM(
	page: Page,
	progressCallback?: ProgressCallback
): Promise<ExtractedContent | null> {
	try {
		// Give Instagram more time to load dynamic content
		console.log('[Extractor] Waiting for network idle...');
		await page.waitForLoadState('networkidle', { timeout: TIMEOUTS.PAGE_LOAD }).catch(() => {
			console.log('[Extractor] Network idle timeout, continuing anyway');
		});

		// Try to wait for article content (absence is tolerated)
		await page.waitForSelector('article', { timeout: TIMEOUTS.ARTICLE_SELECTOR }).catch(() => {});

		// Additional wait for dynamic content
		await page.waitForTimeout(TIMEOUTS.NETWORK_SETTLE);

		// Try to intercept GraphQL responses.
		// NOTE(review): the listener is attached after the waits above, so only responses
		// arriving during the GRAPHQL_WAIT window can be captured — confirm this is intended.
		// Held in an object so the compiler does not narrow the closure-assigned value to `null`.
		const intercepted: { caption: string | null } = { caption: null };
		const onResponse = async (response: {
			url(): string;
			json(): Promise<any>;
		}): Promise<void> => {
			const url = response.url();
			if (!url.includes('graphql') && !url.includes('api/v1')) {
				return;
			}
			try {
				const json = await response.json();
				const captionData = extractCaptionFromGraphQL(json);
				if (captionData && captionData.length > 130) {
					intercepted.caption = captionData;
					console.log(`[Extractor] Intercepted GraphQL response with ${captionData.length} chars`);
				}
			} catch {
				// Not JSON or parsing failed
			}
		};

		page.on('response', onResponse);
		try {
			await page.waitForTimeout(TIMEOUTS.GRAPHQL_WAIT);
		} finally {
			// Detach the listener: its result is only consulted right below.
			page.off('response', onResponse);
		}

		const graphqlCaption = intercepted.caption;
		if (graphqlCaption) {
			const thumbnail = await extractThumbnailStealth(page, progressCallback);
			return { bodyText: cleanText(graphqlCaption), thumbnail };
		}

		// Try to expand truncated captions by clicking "more" button
		await tryExpandCaptionInDOM(page);

		// Runs in the browser page; console.log output goes to the page console.
		const captionText = await page.evaluate(() => {
			// First check og:description for comparison
			const metaDesc = document.querySelector('meta[property="og:description"]');
			const ogContent = metaDesc?.getAttribute('content') || '';
			console.log(`[Extractor] og:description length: ${ogContent.length}`);
			if (ogContent.length > 200) {
				console.log(`[Extractor] og:description preview: ${ogContent.substring(0, 200)}...`);
			}

			// SMART APPROACH: Find the truncated text first, then look for full version nearby
			// Look for text that ends with "..." or "… more"
			const allSpans = Array.from(
				document.querySelectorAll('article span, article div, article h1')
			);

			let longestText = '';
			let matchedElement = null;

			// Strategy 1: Find elements with substantial text
			for (const element of allSpans) {
				const text = element.textContent?.trim() || '';

				// Skip UI elements (exact matches only)
				if (text.match(/^(follow|like|comment|share|view all|load more|add a comment)$/i)) {
					continue;
				}

				// Keep the longest text seen so far
				if (text.length > longestText.length) {
					longestText = text;
					matchedElement = element;
				}
			}

			// Strategy 2: Look in data attributes
			const elementsWithData = Array.from(
				document.querySelectorAll('[data-caption], [data-text], [data-content]')
			);
			for (const el of elementsWithData) {
				const dataCaption =
					el.getAttribute('data-caption') ||
					el.getAttribute('data-text') ||
					el.getAttribute('data-content');
				if (dataCaption && dataCaption.length > longestText.length) {
					longestText = dataCaption;
					console.log(`[Extractor] Found data attribute with ${dataCaption.length} chars`);
				}
			}

			// Strategy 3: Look for hidden/collapsed content
			const hiddenElements = Array.from(
				document.querySelectorAll(
					'[style*="display: none"], [style*="display:none"], .collapsed, [aria-hidden="true"]'
				)
			);
			for (const el of hiddenElements) {
				const text = el.textContent?.trim() || '';
				if (text.length > longestText.length && text.length > 200) {
					longestText = text;
					console.log(`[Extractor] Found hidden element with ${text.length} chars`);
				}
			}

			// Strategy 4: Find parent of truncated text
			if (matchedElement && longestText.endsWith('...')) {
				// Look at siblings and parent
				const parent = matchedElement.parentElement;
				if (parent) {
					const parentText = parent.textContent?.trim() || '';
					if (parentText.length > longestText.length) {
						longestText = parentText;
						console.log(
							`[Extractor] Found fuller text in parent element: ${parentText.length} chars`
						);
					}
				}

				// Check up to five following siblings for continuation text
				let sibling = matchedElement.nextElementSibling;
				let siblingCount = 0;
				while (sibling && siblingCount < 5) {
					const siblingText = sibling.textContent?.trim() || '';
					if (siblingText.length > 50) {
						longestText = longestText + ' ' + siblingText;
						console.log(`[Extractor] Found continuation in sibling: ${siblingText.length} chars`);
					}
					sibling = sibling.nextElementSibling;
					siblingCount++;
				}
			}

			if (longestText && longestText.length > 100) {
				console.log(`[Extractor] Best extraction: ${longestText.length} chars`);
				return longestText;
			}

			// Fallback to og:description, stripping the "N likes, M comments - user on date:" prefix
			if (metaDesc) {
				const content = ogContent;
				const cleanedContent = content.replace(
					/^\d+K?\s+likes,\s+\d+\s+comments\s+-\s+[\w.]+\s+on\s+[^:]+:\s*["']?/,
					''
				);
				console.log('[Extractor] DOM selector fallback: og:description (with metadata cleanup)');
				return cleanedContent;
			}

			return null;
		});

		// Covers both `null` and an empty og:description
		if (!captionText) {
			return null;
		}

		// Extract thumbnail using existing logic
		const thumbnail = await extractThumbnailStealth(page, progressCallback);

		return {
			bodyText: cleanText(captionText),
			thumbnail
		};
	} catch (error) {
		logError('[Extractor] Failed to extract from DOM', error);
		return null;
	}
}

/**
 * GraphQL API strategy: request the post's data directly from Instagram's
 * GraphQL endpoint and read the first caption edge.
 *
 * A temporary page is opened to issue the request and is always closed again,
 * including when the request or the response parsing throws.
 *
 * @param url - Instagram post URL; must contain a shortcode.
 * @param context - Browser context used to open the temporary page.
 * @returns The cleaned caption with a `null` thumbnail, or `null` when the URL
 *          has no shortcode, the request fails, or no caption is present
 *          (errors are logged, not thrown).
 */
async function extractViaGraphQL(
	url: string,
	context: BrowserContext
): Promise<ExtractedContent | null> {
	const shortcode = extractShortcode(url);
	if (!shortcode) {
		console.warn('Could not extract shortcode from URL:', url);
		return null;
	}

	try {
		const page = await context.newPage();

		try {
			// Make GraphQL request
			const response = await page.request.post('https://www.instagram.com/graphql/query/', {
				form: {
					variables: JSON.stringify({ shortcode }),
					doc_id: '7950326061742207' // May need periodic updates
				}
			});

			if (!response.ok()) {
				console.warn(`GraphQL request failed: ${response.status()}`);
				return null;
			}

			const data = await response.json();

			// Parse GraphQL response
			const media = data?.data?.shortcode_media;
			if (!media) {
				return null;
			}

			const bodyText = media.edge_media_to_caption?.edges?.[0]?.node?.text || '';
			if (!bodyText) {
				return null;
			}

			return {
				bodyText: cleanText(bodyText),
				thumbnail: null // GraphQL doesn't easily provide thumbnail, would need page context
			};
		} finally {
			// Best-effort cleanup: a failing close must not discard an extracted result.
			await page.close().catch((closeError: unknown) => {
				logError('[Extractor] Failed to close GraphQL page', closeError);
			});
		}
	} catch (error) {
		logError('[Extractor] GraphQL extraction failed', error);
		return null;
	}
}

/**
 * Strategy 4: Legacy extraction method (fallback).
 *
 * Takes the page's visible text, drops its first six lines, cuts everything
 * from "More posts from" onwards, trims the rest and finally removes
 * `@mentions` and `#hashtags`.
 *
 * @param page - Page already showing the target post.
 * @returns The remaining text (not passed through `cleanText`).
 */

async function extractCleanTextLegacy(page: Page): Promise<string> {
	const visibleText = await page.evaluate(() => document.body.innerText);

	// Drop the first six lines
	const withoutHeader = visibleText.replace(/^(?:.*\n){6}/, '');
	// Keep only what precedes the "More posts from" section
	const postSection = withoutHeader.split('More posts from')[0].trim();

	// Strip mentions, then hashtags
	const withoutMentions = postSection.replace(/@\w+/g, '');
	return withoutMentions.replace(/#\w+/g, '');
}

/**
 * Extraction strategy ('internal-state'): read the caption from globals that
 * Instagram may leave on `window`.
 *
 * Every known global is serialized inside the page and shipped back as JSON.
 * Globals that cannot be serialized (for example circular structures) or whose
 * JSON exceeds the size limit are skipped; a truncated JSON string could never
 * be parsed, so cutting it would only waste the transfer. Each remaining
 * candidate is run through `parseInstagramData` and, if that does not yield
 * more than 130 characters, through `deepSearchForCaption`. The first
 * candidate that yields a long-enough caption wins.
 *
 * @param page - Playwright page that has already loaded the post
 * @param progressCallback - Optional progress callback, forwarded to thumbnail extraction
 * @returns Caption plus thumbnail, or `null` when no global yields a caption
 *          longer than 130 characters
 */
async function extractFromInternalState(
	page: Page,
	progressCallback?: ProgressCallback
): Promise<ExtractedContent | null> {
	// Upper bound on the serialized size of a single global (characters).
	const maxSerializedChars = 500000;

	try {
		const candidates = await page.evaluate((maxChars) => {
			// Try to access Instagram's internal React/Apollo cache
			const possibleKeys = [
				'_sharedData',
				'__PRIVATE_STATE__',
				'__additionalData',
				'__initialData',
				'__RELAY_STORE__'
			];

			const found: Array<{ key: string; data: string }> = [];
			for (const key of possibleKeys) {
				const value = (window as any)[key];
				if (!value) {
					continue;
				}

				try {
					const serialized = JSON.stringify(value);
					// JSON.stringify yields undefined for functions/symbols; skip those too.
					if (typeof serialized === 'string' && serialized.length <= maxChars) {
						console.log(`[Extractor] Found internal state: ${key}`);
						found.push({ key, data: serialized });
					}
				} catch {
					// Circular or otherwise unserializable state: try the next global.
				}
			}

			return found;
		}, maxSerializedChars);

		for (const { key, data } of candidates) {
			console.log(`[Extractor] Parsing internal state from ${key}`);
			try {
				const parsed = JSON.parse(data);

				// Try multiple parsing strategies
				let result = parseInstagramData(parsed);

				console.log(`[Extractor] Standard parsing result: ${result?.bodyText?.length || 0} chars`);

				// Debug: log structure
				if (parsed.entry_data) {
					console.log(`[Extractor] Found entry_data with keys:`, Object.keys(parsed.entry_data));
				}
				if (parsed.config) {
					console.log(`[Extractor] Found config`);
				}

				// If standard parsing failed, try deep search for caption text
				if (!result || !result.bodyText || result.bodyText.length <= 130) {
					console.log(`[Extractor] Attempting deep search in ${key}...`);
					result = deepSearchForCaption(parsed);
					if (result) {
						console.log(`[Extractor] Deep search found: ${result.bodyText.length} chars`);
					} else {
						console.log(`[Extractor] Deep search found no caption`);
					}
				}

				if (result && result.bodyText && result.bodyText.length > 130) {
					console.log(
						`[Extractor] Successfully extracted from ${key}, length: ${result.bodyText.length}`
					);
					const thumbnail = await extractThumbnailStealth(page, progressCallback);
					return { ...result, thumbnail };
				}

				if (result?.bodyText) {
					console.log(
						`[Extractor] Found text in ${key} but it's truncated (${result.bodyText.length} chars)`
					);
				}
			} catch (e) {
				// A bad candidate must not prevent the remaining globals from being tried.
				console.log(`[Extractor] Failed to parse ${key}:`, e);
			}
		}

		return null;
	} catch (error) {
		logError('[Extractor] Failed to extract from internal state', error);
		return null;
	}
}

/**
 * Depth-first search for caption text anywhere inside a nested structure.
 *
 * At each object (or array) the following fields are checked, in order:
 * `caption.text`, `edge_media_to_caption.edges[0].node.text`, then `text`.
 * A field only counts when it is a string longer than 130 characters; other
 * code in this file treats shorter text as truncated. When none matches, the
 * object's own enumerable values are searched recursively.
 *
 * Children are enumerated with `Object.values`, so data that has its own
 * `hasOwnProperty` key or no prototype at all is traversed safely.
 *
 * @param obj - Arbitrary parsed data; non-objects and nullish values yield `null`
 * @param maxDepth - Deepest nesting level that is still inspected
 * @param currentDepth - Nesting level of `obj`; callers normally leave the default
 * @returns The first caption found, passed through `cleanText`, or `null`
 */
function deepSearchForCaption(
	obj: any,
	maxDepth = 10,
	currentDepth = 0
): Omit<ExtractedContent, 'thumbnail'> | null {
	if (currentDepth > maxDepth || !obj || typeof obj !== 'object') {
		return null;
	}

	const minCaptionLength = 130;
	const isLongText = (value: unknown): value is string =>
		typeof value === 'string' && value.length > minCaptionLength;

	// Look for caption/text fields
	const nestedCaptionText =
		obj.caption && typeof obj.caption === 'object' ? obj.caption.text : undefined;
	if (isLongText(nestedCaptionText)) {
		return { bodyText: cleanText(nestedCaptionText) };
	}

	// Look for edge_media_to_caption pattern
	const edgeCaptionText = obj.edge_media_to_caption?.edges?.[0]?.node?.text;
	if (isLongText(edgeCaptionText)) {
		return { bodyText: cleanText(edgeCaptionText) };
	}

	// Look for direct text field in media items. Short UI labels ("more",
	// "follow", ...) are already excluded by the length requirement.
	if (isLongText(obj.text)) {
		return { bodyText: cleanText(obj.text) };
	}

	// Recursively search in all own properties
	for (const child of Object.values(obj)) {
		const found = deepSearchForCaption(child, maxDepth, currentDepth + 1);
		// cleanText may have shortened the match; only accept it if it is still long enough.
		if (found && found.bodyText.length > minCaptionLength) {
			return found;
		}
	}

	return null;
}

/**
 * Pull a caption out of an intercepted GraphQL/API response.
 *
 * When `expectedShortcode` is given, the response is only considered if its
 * JSON serialization contains that shortcode somewhere; responses for other
 * content are ignored. The caption itself is located with `deepSearchForCaption`.
 *
 * @param data - Parsed response body
 * @param expectedShortcode - Shortcode the response must mention, if known
 * @returns The caption text, or `null` when the shortcode is absent or no
 *          caption is found
 */
function extractCaptionFromGraphQL(data: any, expectedShortcode?: string): string | null {
	// Serialization is skipped entirely when there is nothing to verify against.
	const mentionsExpectedPost =
		!expectedShortcode || JSON.stringify(data).includes(expectedShortcode);
	if (!mentionsExpectedPost) {
		return null;
	}

	const found = deepSearchForCaption(data);
	return found?.bodyText || null;
}

/**
 * Run the extraction strategies one after another until one produces text.
 *
 * Order: embedded JSON, internal state, HTML section, DOM selectors, GraphQL
 * API, legacy body-text scrape. Before each attempt a `method` progress event
 * is emitted; the first strategy returning a non-empty `bodyText` triggers a
 * `status` event and ends the run. A strategy that throws is logged and
 * treated like one that found nothing.
 *
 * @param url - Post URL
 * @param page - Page that has already loaded the post
 * @param context - Browser context, needed by the GraphQL API strategy
 * @param onProgress - Optional progress callback
 * @returns `{ success: true, method, data }` for the first successful strategy,
 *          otherwise `{ success: false, error }`
 */
async function extractWithStrategies(
	url: string,
	page: Page,
	context: BrowserContext,
	onProgress?: ProgressCallback
): Promise<ExtractionResult> {
	type Strategy = {
		name: ExtractionMethod;
		fn: () => Promise<ExtractedContent | null>;
	};

	// The legacy scrape has no thumbnail of its own, so it is paired with the shared extractor.
	const runLegacy = async (): Promise<ExtractedContent | null> => {
		const text = await extractCleanTextLegacy(page);
		const thumbnail = await extractThumbnailStealth(page, onProgress);
		return { bodyText: text, thumbnail };
	};

	const strategies: Strategy[] = [
		{ name: 'embedded-json', fn: () => extractFromEmbeddedJSON(page, onProgress) },
		{ name: 'internal-state', fn: () => extractFromInternalState(page, onProgress) },
		{ name: 'html-section', fn: () => extractFromHTMLSection(page, onProgress, url) },
		{ name: 'dom-selector', fn: () => extractFromDOM(page, onProgress) },
		{ name: 'graphql-api', fn: () => extractViaGraphQL(url, context) },
		{ name: 'legacy', fn: runLegacy }
	];

	// Log a message and mirror it to the progress callback.
	const announce = (type: 'method' | 'status', message: string, method: ExtractionMethod): void => {
		console.log(`[Extractor] ${message}`);
		onProgress?.({
			type,
			message,
			method,
			timestamp: new Date().toISOString()
		});
	};

	for (const { name, fn } of strategies) {
		try {
			announce('method', `Trying extraction method: ${getMethodDisplayName(name)}`, name);

			const extracted = await fn();
			if (!extracted || !extracted.bodyText) {
				continue;
			}

			announce('status', `✓ Success with method: ${getMethodDisplayName(name)}`, name);
			return { success: true, method: name, data: extracted };
		} catch (error) {
			// Fall through to the next strategy
			logError(`[Extractor] Method ${name} failed`, error);
		}
	}

	return { success: false, error: 'All extraction methods failed' };
}

/**
 * Start listening for JSON responses that may carry the post caption.
 * Must be attached before navigation so responses fired during page load are seen.
 *
 * @returns A getter for the most recent caption (> 130 chars) seen so far, or `null`
 */
function watchForInterceptedCaption(
	page: Page,
	expectedShortcode: string | undefined
): () => string | null {
	// Held in an object so reads after the listener fired always see the latest value.
	const latest: { caption: string | null } = { caption: null };

	page.on('response', async (response) => {
		try {
			const responseUrl = response.url();
			const looksLikeApiCall =
				responseUrl.includes('graphql') ||
				responseUrl.includes('api/v1') ||
				responseUrl.includes('/web/');
			if (!looksLikeApiCall) {
				return;
			}

			const json = await response.json();
			const captionData = extractCaptionFromGraphQL(json, expectedShortcode);
			if (captionData && captionData.length > 130) {
				latest.caption = captionData;
				console.log(
					`[Extractor] ✓ Intercepted GraphQL with full caption: ${captionData.length} chars (shortcode verified)`
				);
			}
		} catch (e) {
			// Not JSON, body unavailable, or parse error: this response is simply skipped
		}
	});

	return () => latest.caption;
}

/**
 * Scroll down twice and back to the top, pausing between steps, to trigger
 * additional network requests.
 */
async function scrollToTriggerLazyLoading(page: Page): Promise<void> {
	console.log('[Extractor] Scrolling to trigger lazy loading...');
	await page.evaluate(() => {
		window.scrollBy(0, 300);
	});
	await page.waitForTimeout(1500);

	await page.evaluate(() => {
		window.scrollBy(0, 300);
	});
	await page.waitForTimeout(1500);

	await page.evaluate(() => {
		window.scrollTo(0, 0);
	});
	await page.waitForTimeout(1000);
}

/**
 * Write the extracted text to `debug_page.txt` in the working directory.
 * Best effort: a failed write is logged and never fails the extraction.
 */
function saveDebugContent(method: ExtractionMethod | undefined, bodyText: string): void {
	try {
		fs.writeFileSync(path.resolve('debug_page.txt'), `Method: ${method}\n\n${bodyText}`);
	} catch (error) {
		logError('[Extractor] Failed to write debug_page.txt', error);
	}
}

/**
 * Load the post on an already opened page and run the extraction strategies.
 * The caller owns `page` and `context` and is responsible for closing them.
 */
async function extractFromOpenPage(
	url: string,
	page: Page,
	context: BrowserContext,
	onProgress?: ProgressCallback
): Promise<ExtractedContent> {
	// Extract shortcode for validation
	const expectedShortcode = extractShortcode(url);
	console.log(`[Extractor] Target shortcode: ${expectedShortcode || 'unknown'}`);

	// Set timeout
	page.setDefaultTimeout(30000);

	// Set up GraphQL response interception BEFORE loading the page
	// This is critical to catch initial network requests during page load
	const getInterceptedCaption = watchForInterceptedCaption(page, expectedShortcode ?? undefined);

	onProgress?.({
		type: 'status',
		message: 'Loading Instagram page...',
		timestamp: new Date().toISOString()
	});

	await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30000 });

	// Add small human-like delay
	await page.waitForTimeout(1000 + Math.random() * 2000);

	// Try scrolling and waiting to trigger additional GraphQL requests
	await scrollToTriggerLazyLoading(page);

	// Always use DOM extraction (HTML Section) — it clicks "… more" in
	// the browser and gets the fully expanded caption. The GraphQL
	// interception is unreliable: Instagram often truncates captions
	// in API responses without any "…." marker, so we cannot trust
	// the intercepted text to be complete.
	const captionSoFar = getInterceptedCaption();
	if (captionSoFar) {
		console.log(
			`[Extractor] Intercepted GraphQL caption (${captionSoFar.length} chars) — always using DOM extraction for full text`
		);
	}

	const result = await extractWithStrategies(url, page, context, onProgress);

	if (!result.success || !result.data) {
		// DOM extraction failed — fall back to intercepted caption if available.
		// Read it again here: responses may have arrived while the strategies ran.
		const fallbackCaption = getInterceptedCaption();
		if (fallbackCaption) {
			console.log(
				'[Extractor] DOM extraction failed — using intercepted GraphQL caption as fallback'
			);
			const thumbnail = await extractThumbnailStealth(page, onProgress);

			onProgress?.({
				type: 'complete',
				message: 'Extraction completed successfully',
				method: 'graphql-intercept',
				timestamp: new Date().toISOString()
			});

			return { bodyText: cleanText(fallbackCaption), thumbnail };
		}
		throw new Error(result.error || 'Extraction failed');
	}

	// Save debug content
	saveDebugContent(result.method, result.data.bodyText);

	onProgress?.({
		type: 'complete',
		message: 'Extraction completed successfully',
		method: result.method,
		timestamp: new Date().toISOString()
	});

	return result.data;
}

/**
 * Extract text content and thumbnail from a URL using Playwright browser
 * Uses multiple extraction strategies with fallback
 *
 * Each attempt (retried through `withRetry` with `DEFAULT_RETRY_CONFIG`) opens
 * its own browser context and page; both are closed when the attempt ends,
 * whether it succeeded or threw. If every strategy fails but a caption was
 * intercepted from a network response, that caption is returned instead.
 *
 * @param url - The URL to extract from
 * @param onProgress - Optional callback to receive progress updates
 * @returns Extracted text and thumbnail
 * @throws Error when an attempt finds no text and no intercepted caption is
 *         available; how that surfaces after retries is decided by `withRetry`
 */
export async function extractTextAndThumbnail(
	url: string,
	onProgress?: ProgressCallback
): Promise<ExtractedContent> {
	onProgress?.({
		type: 'status',
		message: 'Starting extraction...',
		timestamp: new Date().toISOString()
	});

	return withRetry(
		async () => {
			const authPath = resolveAuthPath();
			const context = await createBrowserContext(authPath);

			try {
				const page = await context.newPage();
				try {
					// `return await` keeps the page open until extraction has settled.
					return await extractFromOpenPage(url, page, context, onProgress);
				} finally {
					await page.close();
				}
			} finally {
				// Runs even if opening or closing the page failed, so the context never leaks.
				await context.close();
			}
		},
		DEFAULT_RETRY_CONFIG,
		onProgress
	);
}

/**
 * Screenshot-based thumbnail extraction (fallback method).
 *
 * Measures the first `<video>` element in the page, clamps its rectangle
 * (left/top to at least 0, width/height to at most the window's inner size)
 * and captures that region as a JPEG. When there is no video, or its clamped
 * size is not positive, an unclipped page screenshot is taken instead.
 *
 * @param page - Playwright page to capture
 * @returns A `data:image/jpeg;base64,...` URI of the capture
 */
async function extractThumbnailScreenshot(page: Page): Promise<string | null> {
	const toJpegDataUri = (shot: Buffer): string =>
		`data:image/jpeg;base64,${shot.toString('base64')}`;

	const clip = await page.evaluate(() => {
		const videoEl = document.querySelector('video');
		if (!videoEl) return null;

		const { left, top, width, height } = videoEl.getBoundingClientRect();
		return {
			x: Math.max(0, left),
			y: Math.max(0, top),
			width: Math.min(width, window.innerWidth),
			height: Math.min(height, window.innerHeight)
		};
	});

	const hasUsableClip = Boolean(clip && clip.width > 0 && clip.height > 0);
	if (!hasUsableClip || !clip) {
		console.warn('[Thumbnail] Video element not found or has no size, taking full page screenshot');
		return toJpegDataUri(await page.screenshot({ type: 'jpeg', quality: 85 }));
	}

	return toJpegDataUri(await page.screenshot({ type: 'jpeg', quality: 85, clip }));
}

/**
 * Helper: Fetch image from URL and convert to base64 data URI
 *
 * **Validation Criteria:**
 * - HTTP status must be exactly 200 (not 2xx, only 200)
 * - Content-Type must start with 'image/' (e.g., image/jpeg, image/png, image/webp)
 * - Headers AND body must arrive within 10 seconds; the abort timer stays armed
 *   until the body has been read
 *
 * **Failure Scenarios:**
 * - Non-200 status → Returns null, reports status code via progress callback
 * - Invalid content-type → Returns null, reports content-type via progress callback
 * - Timeout → Returns null, reports timeout via progress callback
 * - Network error → Returns null, reports error message via progress callback
 *
 * The timer is always cleared before returning, so a failed fetch does not
 * leave a pending timeout behind.
 *
 * When this function returns null, callers are expected to move on to their
 * next thumbnail source.
 *
 * @param imageUrl - The image URL to fetch (the scheme is not validated here)
 * @param progressCallback - Optional callback for progress reporting
 * @returns Base64 data URI (data:image/*;base64,...) or null if validation fails
 *
 * @example
 * ```typescript
 * const thumbnail = await fetchImageAsBase64(
 *   'https://instagram.com/image.jpg',
 *   (event) => console.log(event.message)
 * );
 *
 * if (thumbnail) {
 *   // thumbnail is a valid base64 data URI
 *   console.log(thumbnail.substring(0, 50)); // "data:image/jpeg;base64,/9j/4AAQSkZJRg..."
 * } else {
 *   // URL validation failed, try next method
 * }
 * ```
 */
async function fetchImageAsBase64(
	imageUrl: string,
	progressCallback?: ProgressCallback
): Promise<string | null> {
	const reportStatus = (message: string): void => {
		progressCallback?.({
			type: 'status',
			message,
			timestamp: new Date().toISOString()
		});
	};

	// Abort controller + timer implement the 10s budget for the whole download
	const controller = new AbortController();
	const timeoutId = setTimeout(() => controller.abort(), 10000);

	try {
		console.log(`[Thumbnail] Validating URL: ${imageUrl}`);

		const response = await fetch(imageUrl, {
			signal: controller.signal
		});

		// Strict status validation: must be exactly 200
		if (response.status !== 200) {
			console.warn(`[Thumbnail] URL validation failed: HTTP ${response.status} for ${imageUrl}`);
			reportStatus(`Thumbnail URL returned HTTP ${response.status}, trying next method...`);
			return null;
		}

		// Validate content-type
		const contentType = response.headers.get('content-type') || '';
		if (!contentType.startsWith('image/')) {
			console.warn(
				`[Thumbnail] URL validation failed: Invalid content-type '${contentType}' for ${imageUrl}`
			);
			reportStatus(
				`Thumbnail URL returned non-image content (${contentType}), trying next method...`
			);
			return null;
		}

		console.log(`[Thumbnail] URL validation successful: ${imageUrl} (${contentType})`);

		// Still under the abort timer: a stalled body is cut off like a stalled request.
		const arrayBuffer = await response.arrayBuffer();
		const buffer = Buffer.from(arrayBuffer);

		const base64Data = `data:${contentType};base64,${buffer.toString('base64')}`;

		reportStatus('Thumbnail fetched and validated from URL');

		return base64Data;
	} catch (e) {
		if (e instanceof Error) {
			if (e.name === 'AbortError') {
				console.error(`[Thumbnail] URL fetch timeout: ${imageUrl}`);
				reportStatus('Thumbnail URL fetch timeout, trying next method...');
			} else {
				console.error(`[Thumbnail] Failed to fetch image from ${imageUrl}:`, e.message);
				reportStatus(`Thumbnail URL fetch failed (${e.message}), trying next method...`);
			}
		} else {
			logError('[Thumbnail] Failed to fetch image', e);
		}
		return null;
	} finally {
		clearTimeout(timeoutId);
	}
}

/**
 * Extract thumbnail from Instagram post using stealth techniques
 *
 * Tries these sources in order and stops at the first that yields an image:
 * 1. `og:image` meta tag
 * 2. `twitter:image` meta tag
 * 3. Video `poster` attribute
 * 4. `window.__additionalDataLoaded` entries (`display_url`, then `thumbnail_src`)
 * 5. Screenshot fallback
 *
 * Sources 1-4 only locate a URL; the image is then downloaded through
 * `fetchImageAsBase64`. Every successful path therefore returns a base64
 * `data:` URI, never a plain HTTPS URL.
 *
 * Each source is attempted independently: an error while probing one source is
 * logged and the next one is tried. A failing screenshot is logged as well, so
 * this function never throws because a thumbnail could not be obtained.
 *
 * A `thumbnail` progress event carrying the data URI is emitted on success.
 *
 * @param page - Playwright page instance
 * @param progressCallback - Optional progress callback for SSE updates
 * @returns Base64 data URI of the thumbnail, or null if all methods fail
 */
async function extractThumbnailStealth(
	page: Page,
	progressCallback?: ProgressCallback
): Promise<string | null> {
	console.log('[Thumbnail] Starting stealth extraction');

	const reportThumbnail = (message: string, thumbnail: string): void => {
		progressCallback?.({
			type: 'thumbnail',
			message,
			data: { thumbnail },
			timestamp: new Date().toISOString()
		});
	};

	// URL-based sources, most stealthy first.
	const urlSources: Array<{
		foundLog: string;
		successMessage: string;
		failureLog: string;
		locate: () => Promise<string | null>;
	}> = [
		{
			foundLog: '[Thumbnail] Found og:image meta tag',
			successMessage: 'Thumbnail extracted from meta tags',
			failureLog: '[Thumbnail] Meta tag method failed',
			locate: () => page.getAttribute('meta[property="og:image"]', 'content')
		},
		{
			foundLog: '[Thumbnail] Found twitter:image meta tag',
			successMessage: 'Thumbnail extracted from meta tags',
			failureLog: '[Thumbnail] Meta tag method failed',
			locate: () => page.getAttribute('meta[name="twitter:image"]', 'content')
		},
		{
			foundLog: '[Thumbnail] Found video poster attribute',
			successMessage: 'Thumbnail extracted from video poster',
			failureLog: '[Thumbnail] Video poster method failed',
			locate: () => page.getAttribute('video', 'poster')
		},
		{
			foundLog: '[Thumbnail] Found thumbnail in Instagram data structures',
			successMessage: 'Thumbnail extracted from Instagram data',
			failureLog: '[Thumbnail] Instagram data method failed',
			locate: () =>
				page.evaluate(() => {
					// Check for Instagram's internal data structures
					const data = (window as any).__additionalDataLoaded;
					if (data) {
						// Navigate through Instagram's data structure
						for (const key in data) {
							const item = data[key];
							if (item?.graphql?.shortcode_media?.display_url) {
								return item.graphql.shortcode_media.display_url;
							}
							if (item?.graphql?.shortcode_media?.thumbnail_src) {
								return item.graphql.shortcode_media.thumbnail_src;
							}
						}
					}
					return null;
				})
		}
	];

	for (const source of urlSources) {
		try {
			const candidateUrl = await source.locate();
			if (!candidateUrl) {
				continue;
			}

			console.log(source.foundLog);
			const dataUri = await fetchImageAsBase64(candidateUrl, progressCallback);
			if (dataUri) {
				reportThumbnail(source.successMessage, dataUri);
				return dataUri;
			}
		} catch (e) {
			// Isolated per source so one failing lookup cannot skip the others.
			logError(source.failureLog, e);
		}
	}

	// Screenshot fallback
	console.log('[Thumbnail] Falling back to screenshot method');
	let screenshotThumbnail: string | null;
	try {
		screenshotThumbnail = await extractThumbnailScreenshot(page);
	} catch (e) {
		// A missing thumbnail must not discard text the caller already extracted.
		logError('[Thumbnail] Screenshot method failed', e);
		return null;
	}

	if (screenshotThumbnail) {
		reportThumbnail('Thumbnail extracted via screenshot', screenshotThumbnail);
	}
	return screenshotThumbnail;
}