// insta-recipe/src/lib/server/extraction.ts

import { createBrowserContext } from './browser';
import fs from 'fs';
import path from 'path';
import type { Page, BrowserContext } from 'playwright';

/**
 * Content produced by a successful extraction.
 */
export interface ExtractedContent {
	/** Caption/body text of the post. */
	bodyText: string;
	/** Thumbnail string produced by the thumbnail helpers in this file, or `null` when none was obtained. */
	thumbnail: string | null;
}

/** Identifier of an extraction strategy; reported in progress events and in the extraction result. */
export type ExtractionMethod = 'embedded-json' | 'dom-selector' | 'graphql-api' | 'legacy';

/** Category of a progress event passed to a `ProgressCallback`. */
export type ProgressEventType = 'status' | 'method' | 'retry' | 'error' | 'thumbnail' | 'complete';

/**
 * A single progress notification emitted while an extraction runs.
 */
export interface ProgressEvent {
	/** Category of the event. */
	type: ProgressEventType;
	/** Human-readable description of what happened. */
	message: string;
	/** Extraction strategy the event refers to, when applicable. */
	method?: ExtractionMethod;
	/** 1-based number of the attempt that just failed (set on `retry` events). */
	attemptNumber?: number;
	/** Total number of attempts allowed (set on `retry` events). */
	maxAttempts?: number;
	/** Extra payload; `thumbnail` events in this file send `{ thumbnail: string }`. */
	data?: any;
	/** ISO-8601 timestamp; emitters in this file use `new Date().toISOString()`. */
	timestamp?: string;
}

/** Listener invoked synchronously with each progress event; its return value is ignored. */
export type ProgressCallback = (event: ProgressEvent) => void;

/**
 * Outcome of running the extraction strategies against one page.
 */
interface ExtractionResult {
	/** `true` when some strategy produced non-empty body text. */
	success: boolean;
	/** Strategy that succeeded (only set on success). */
	method?: ExtractionMethod;
	/** Extracted content (only set on success). */
	data?: ExtractedContent;
	/** Failure description (only set on failure). */
	error?: string;
}

/**
 * Partial shape of the JSON assigned to `window._sharedData` in a post page.
 * Used to type the parsed payload in `extractFromEmbeddedJSON`; every level is
 * optional because the payload is external data and is not validated.
 */
interface InstagramEmbeddedData {
	entry_data?: {
		PostPage?: Array<{
			graphql?: {
				shortcode_media?: {
					// Caption edges; each node carries one piece of caption text.
					edge_media_to_caption?: {
						edges?: Array<{ node: { text: string } }>;
					};
					display_url?: string;
					video_url?: string;
					owner?: {
						username: string;
						profile_pic_url: string;
					};
				};
			};
		}>;
	};
}

/**
 * Tuning knobs for `withRetry`.
 */
interface RetryConfig {
	/** Maximum number of times the operation is invoked. */
	maxAttempts: number;
	/** Delay before the second attempt, in milliseconds. */
	initialDelayMs: number;
	/** Upper bound for the delay between attempts, in milliseconds. */
	maxDelayMs: number;
	/** Factor the delay is multiplied by after each failed attempt. */
	backoffMultiplier: number;
}

/**
 * Default retry policy: up to 3 attempts, waiting 1000 ms after the first
 * failure and 2000 ms after the second (delay doubles, capped at 10000 ms).
 */
const DEFAULT_RETRY_CONFIG: RetryConfig = {
	maxAttempts: 3,
	initialDelayMs: 1000,
	maxDelayMs: 10000,
	backoffMultiplier: 2
};

/**
 * Locate the saved authentication state file.
 *
 * The Docker location is probed first, then the path relative to the
 * current working directory.
 *
 * @returns The first candidate that exists on disk, or `undefined` when neither does.
 */
function resolveAuthPath(): string | undefined {
	const candidatePaths = ['/app/secrets/auth.json', './secrets/auth.json'];
	return candidatePaths.find((candidate) => fs.existsSync(candidate));
}

/**
 * Pause the calling async flow; used between retry attempts.
 *
 * @param ms - Number of milliseconds to wait
 */
async function sleep(ms: number): Promise<void> {
	await new Promise<void>((done) => {
		setTimeout(done, ms);
	});
}

/**
 * Decide whether a failure is permanent, i.e. retrying cannot help.
 *
 * Authentication/login problems and invalid URLs are treated as permanent.
 * Matching is case-insensitive because real messages vary in casing
 * (e.g. "Cannot navigate to invalid URL", "Login required").
 *
 * @param error - The value caught from the failed attempt
 * @returns `true` when `error` is an `Error` whose message marks it as permanent
 */
function isNonRetriableError(error: unknown): boolean {
	if (!(error instanceof Error)) {
		return false;
	}

	const message = error.message.toLowerCase();
	const permanentMarkers = ['authentication', 'login required', 'invalid url'];

	return permanentMarkers.some((marker) => message.includes(marker));
}

/**
 * Map an extraction method identifier to the label shown in logs and
 * progress messages.
 *
 * @param method - Strategy identifier
 * @returns The display label for that strategy
 */
function getMethodDisplayName(method: ExtractionMethod): string {
	const labelByMethod: Record<ExtractionMethod, string> = {
		legacy: 'Legacy Parser',
		'graphql-api': 'GraphQL API',
		'dom-selector': 'DOM Selector',
		'embedded-json': 'Embedded JSON'
	};
	const label = labelByMethod[method];
	return label;
}

/**
 * Run an async operation, retrying failures with exponential backoff.
 *
 * Errors classified by `isNonRetriableError` are reported through
 * `onProgress` as an `error` event and rethrown immediately. Every other
 * failure emits a `retry` event and waits before the next attempt, as long
 * as attempts remain.
 *
 * @param fn - Operation to execute
 * @param config - Attempt count and backoff settings
 * @param onProgress - Optional listener for retry/error notifications
 * @returns The value of the first successful attempt
 * @throws The last error seen, or a generic error when no attempt was made
 */
async function withRetry<T>(
	fn: () => Promise<T>,
	config: RetryConfig = DEFAULT_RETRY_CONFIG,
	onProgress?: ProgressCallback
): Promise<T> {
	const { maxAttempts, maxDelayMs, backoffMultiplier } = config;
	let failure: Error | null = null;
	let waitMs = config.initialDelayMs;
	let attempt = 1;

	while (attempt <= maxAttempts) {
		try {
			return await fn();
		} catch (caught) {
			failure = caught as Error;

			// Permanent failures are surfaced right away
			if (isNonRetriableError(caught)) {
				onProgress?.({
					type: 'error',
					message: `Non-retriable error: ${failure.message}`,
					timestamp: new Date().toISOString()
				});
				throw caught;
			}

			const attemptsRemain = attempt < maxAttempts;
			if (attemptsRemain) {
				const message = `Attempt ${attempt}/${maxAttempts} failed. Retrying in ${waitMs}ms...`;
				console.warn(`[Retry] ${message}`, caught);

				onProgress?.({
					type: 'retry',
					message,
					attemptNumber: attempt,
					maxAttempts,
					timestamp: new Date().toISOString()
				});

				await sleep(waitMs);
				// Grow the delay for the next round, never beyond the cap
				waitMs = Math.min(waitMs * backoffMultiplier, maxDelayMs);
			}
		}

		attempt += 1;
	}

	throw failure || new Error('Max retry attempts exceeded');
}

/**
 * Extract the post shortcode from an Instagram URL.
 *
 * Recognises `/p/<code>`, `/reel/<code>`, `/reels/<code>` and `/tv/<code>`
 * path segments anywhere in the string (so username-prefixed URLs work too).
 *
 * @param url - Instagram post/reel/tv URL
 * @returns The shortcode, or `null` when the URL contains none of the known segments
 */
function extractShortcode(url: string): string | null {
	const match = /\/(?:p|reels?|tv)\/([A-Za-z0-9_-]+)/.exec(url);
	return match?.[1] ?? null;
}

/**
 * Normalise extracted caption text.
 *
 * Lines that are Instagram UI chrome ("More posts from …", "View all N comments",
 * "Add a comment...", "Liked by …") are removed first, then every run of
 * whitespace is collapsed to a single space and the result is trimmed.
 *
 * The UI patterns are line-anchored, so they must run while the text still
 * contains its line breaks; collapsing whitespace first would turn the input
 * into one line and make the patterns ineffective.
 *
 * @param text - Raw text as extracted from the page or API
 * @returns Single-line, whitespace-normalised text without UI lines
 */
function cleanText(text: string): string {
	const uiLinePatterns = [
		/^\s*More posts from.+$/gim,
		/^\s*View all \d+ comments$/gim,
		/^\s*Add a comment\.\.\.$/gim,
		/^\s*Liked by.+$/gim
	];

	const withoutUiLines = uiLinePatterns.reduce(
		(remaining, pattern) => remaining.replace(pattern, ''),
		text
	);

	return withoutUiLines.replace(/\s+/g, ' ').trim();
}

/**
 * Strategy 1: read the caption from JSON embedded in the page's script tags.
 *
 * Every `script[type="text/javascript"]` is scanned for a
 * `window._sharedData = {...};` assignment and then for a
 * `window.__additionalDataLoaded(..., {...});` call. The first payload that
 * yields a caption wins, and a thumbnail is then extracted from the page.
 *
 * @param page - Page that has already navigated to the post
 * @param progressCallback - Optional listener forwarded to thumbnail extraction
 * @returns Extracted content, or `null` when no payload produced a caption
 */
async function extractFromEmbeddedJSON(
	page: Page,
	progressCallback?: ProgressCallback
): Promise<ExtractedContent | null> {
	// Known payload carriers, in the order they are tried for each script
	const payloadPatterns = [
		{
			regex: /window\._sharedData\s*=\s*(\{.+?\});/s,
			failureLog: 'Failed to parse _sharedData:'
		},
		{
			regex: /window\.__additionalDataLoaded\([^,]+,\s*(\{.+?\})\);/s,
			failureLog: 'Failed to parse __additionalDataLoaded:'
		}
	];

	try {
		const scriptTexts = await page.evaluate(() =>
			Array.from(
				document.querySelectorAll('script[type="text/javascript"]'),
				(script) => script.textContent || ''
			)
		);

		for (const scriptText of scriptTexts) {
			for (const { regex, failureLog } of payloadPatterns) {
				const captured = regex.exec(scriptText);
				if (!captured) {
					continue;
				}

				try {
					const parsed = parseInstagramData(JSON.parse(captured[1]));
					if (parsed) {
						const thumbnail = await extractThumbnailStealth(page, progressCallback);
						return { ...parsed, thumbnail };
					}
				} catch (payloadError) {
					// A broken payload must not stop the scan of the remaining scripts
					console.warn(failureLog, payloadError);
				}
			}
		}

		return null;
	} catch (error) {
		console.warn('Failed to extract from embedded JSON:', error);
		return null;
	}
}

/**
 * Pull the caption out of a parsed Instagram payload.
 *
 * The `entry_data.PostPage[0].graphql.shortcode_media` layout is preferred;
 * when it is absent, `items` or `data.shortcode_media` is handed to
 * `extractFromAlternativeStructure`.
 *
 * @param data - Parsed JSON of unknown shape
 * @returns Cleaned caption text, or `null` when none is found or parsing fails
 */
function parseInstagramData(data: any): Omit<ExtractedContent, 'thumbnail'> | null {
	try {
		const media = data?.entry_data?.PostPage?.[0]?.graphql?.shortcode_media;

		if (media) {
			// All caption edges are joined, one per line
			const edges = media.edge_media_to_caption?.edges || [];
			const caption = edges.map((edge: any) => edge.node.text).join('\n');
			return caption ? { bodyText: cleanText(caption) } : null;
		}

		const alternative = data?.items || data?.data?.shortcode_media;
		return alternative ? extractFromAlternativeStructure(alternative) : null;
	} catch (error) {
		console.warn('Failed to parse Instagram data structure:', error);
		return null;
	}
}

/**
 * Read the caption from the non-PostPage payload layouts.
 *
 * Accepts either a single media object or an array (only the first element
 * is used) and looks at `caption.text`, then at the first caption edge.
 *
 * @param items - Media object or array of media objects of unknown shape
 * @returns Cleaned caption text, or `null` when no caption is present
 */
function extractFromAlternativeStructure(items: any): Omit<ExtractedContent, 'thumbnail'> | null {
	try {
		const entry = Array.isArray(items) ? items[0] : items;
		const caption = entry?.caption?.text || entry?.edge_media_to_caption?.edges?.[0]?.node?.text;

		return caption ? { bodyText: cleanText(caption) } : null;
	} catch (error) {
		console.warn('Failed to parse alternative structure:', error);
		return null;
	}
}

/**
 * Strategy 2: read the caption straight from the rendered DOM.
 *
 * Inside the page, the first element with text among `h1[dir="auto"]` and
 * `article div._a9zs, article span` is used; failing that, the
 * `og:description` meta tag. A thumbnail is extracted only when a caption
 * was found.
 *
 * @param page - Page that has already navigated to the post
 * @param progressCallback - Optional listener forwarded to thumbnail extraction
 * @returns Extracted content, or `null` when no caption is found or an error occurs
 */
async function extractFromDOM(
	page: Page,
	progressCallback?: ProgressCallback
): Promise<ExtractedContent | null> {
	try {
		const rawCaption = await page.evaluate(() => {
			// Element selectors, most reliable first
			for (const selector of ['h1[dir="auto"]', 'article div._a9zs, article span']) {
				const content = document.querySelector(selector)?.textContent;
				if (content) {
					return content.trim();
				}
			}

			// Last resort: the Open Graph description
			const metaDescription = document.querySelector('meta[property="og:description"]');
			return metaDescription ? metaDescription.getAttribute('content') || '' : null;
		});

		if (!rawCaption) {
			return null;
		}

		const thumbnail = await extractThumbnailStealth(page, progressCallback);
		const bodyText = cleanText(rawCaption);

		return { bodyText, thumbnail };
	} catch (error) {
		console.warn('Failed to extract from DOM:', error);
		return null;
	}
}

/**
 * Strategy 3: fetch the caption through Instagram's GraphQL endpoint.
 *
 * A temporary page is opened in the given context to issue the request, and
 * it is always closed again, including when the request or the JSON parsing
 * throws. No thumbnail is produced by this strategy.
 *
 * @param url - Post URL; its shortcode is sent as the query variable
 * @param context - Browser context used to open the helper page
 * @returns Caption with a `null` thumbnail, or `null` when the shortcode,
 *          the request, or the caption is missing
 */
async function extractViaGraphQL(
	url: string,
	context: BrowserContext
): Promise<ExtractedContent | null> {
	const shortcode = extractShortcode(url);
	if (!shortcode) {
		console.warn('Could not extract shortcode from URL:', url);
		return null;
	}

	let page: Page | undefined;

	try {
		page = await context.newPage();

		// Make GraphQL request
		const response = await page.request.post('https://www.instagram.com/graphql/query/', {
			form: {
				variables: JSON.stringify({ shortcode }),
				doc_id: '7950326061742207' // May need periodic updates
			}
		});

		if (!response.ok()) {
			console.warn(`GraphQL request failed: ${response.status()}`);
			return null;
		}

		const data = await response.json();

		// Only the first caption edge is read; a missing media object yields ''
		const bodyText = data?.data?.shortcode_media?.edge_media_to_caption?.edges?.[0]?.node?.text || '';
		if (!bodyText) {
			return null;
		}

		return {
			bodyText: cleanText(bodyText),
			thumbnail: null // GraphQL doesn't easily provide thumbnail, would need page context
		};
	} catch (error) {
		console.error('GraphQL extraction failed:', error);
		return null;
	} finally {
		if (page) {
			try {
				await page.close();
			} catch (closeError) {
				// A failed close must not discard a caption that was already obtained
				console.warn('Failed to close GraphQL helper page:', closeError);
			}
		}
	}
}

/**
 * Strategy 4: last-resort extraction from the page's visible text.
 *
 * Takes `document.body.innerText`, drops the first six lines when at least
 * six exist, keeps only what precedes "More posts from", trims the result and
 * then strips @mentions and #hashtags.
 *
 * @param page - Page that has already navigated to the post
 * @returns The remaining text (may be empty)
 */
async function extractCleanTextLegacy(page: Page): Promise<string> {
	const visibleText = await page.evaluate(() => document.body.innerText);

	const withoutHeaderLines = visibleText.replace(/^(?:.*\n){6}/, '');
	const beforeSuggestions = withoutHeaderLines.split('More posts from')[0].trim();

	const withoutMentions = beforeSuggestions.replace(/@\w+/g, '');
	return withoutMentions.replace(/#\w+/g, '');
}

/**
 * Run the extraction strategies in priority order until one yields text.
 *
 * Order: embedded JSON, DOM selectors, GraphQL API, legacy text parser.
 * A strategy that throws or returns no body text is skipped. Each attempt is
 * announced with a `method` event and the winner with a `status` event.
 *
 * @param url - Post URL (needed by the GraphQL strategy)
 * @param page - Page that has already navigated to the post
 * @param context - Browser context the page belongs to
 * @param onProgress - Optional listener for progress notifications
 * @returns A success result carrying method and data, or a failure result
 */
async function extractWithStrategies(
	url: string,
	page: Page,
	context: BrowserContext,
	onProgress?: ProgressCallback
): Promise<ExtractionResult> {
	const runLegacy = async (): Promise<ExtractedContent | null> => {
		const bodyText = await extractCleanTextLegacy(page);
		const thumbnail = await extractThumbnailStealth(page, onProgress);
		return { bodyText, thumbnail };
	};

	const attempts: Array<[ExtractionMethod, () => Promise<ExtractedContent | null>]> = [
		['embedded-json', () => extractFromEmbeddedJSON(page, onProgress)],
		['dom-selector', () => extractFromDOM(page, onProgress)],
		['graphql-api', () => extractViaGraphQL(url, context)],
		['legacy', runLegacy]
	];

	for (const [method, run] of attempts) {
		try {
			const announcement = `Trying extraction method: ${getMethodDisplayName(method)}`;
			console.log(`[Extractor] ${announcement}`);
			onProgress?.({
				type: 'method',
				message: announcement,
				method,
				timestamp: new Date().toISOString()
			});

			const content = await run();
			if (!content?.bodyText) {
				// Nothing usable; fall through to the next strategy
				continue;
			}

			const successMessage = `✓ Success with method: ${getMethodDisplayName(method)}`;
			console.log(`[Extractor] ${successMessage}`);
			onProgress?.({
				type: 'status',
				message: successMessage,
				method,
				timestamp: new Date().toISOString()
			});

			return { success: true, method, data: content };
		} catch (error) {
			console.warn(`[Extractor] Method ${method} failed:`, error);
		}
	}

	return { success: false, error: 'All extraction methods failed' };
}

/**
 * Extract text content and thumbnail from a URL using a Playwright browser.
 *
 * Each attempt opens a fresh browser context, loads the page, and runs the
 * extraction strategies with fallback. The whole attempt is retried according
 * to `DEFAULT_RETRY_CONFIG`. The browser context is always closed, even when
 * opening the page or the extraction fails.
 *
 * The extracted text is also written to `debug_page.txt` in the working
 * directory; this is best-effort and a write failure only logs a warning.
 *
 * @param url - The URL to extract from
 * @param onProgress - Optional callback to receive progress updates
 * @returns Extracted text and thumbnail
 * @throws When every attempt fails, or immediately on a non-retriable error
 */
export async function extractTextAndThumbnail(
	url: string,
	onProgress?: ProgressCallback
): Promise<ExtractedContent> {
	onProgress?.({
		type: 'status',
		message: 'Starting extraction...',
		timestamp: new Date().toISOString()
	});

	return withRetry(
		async () => {
			const authPath = resolveAuthPath();
			const context = await createBrowserContext(authPath);

			try {
				const page = await context.newPage();

				// Set timeout
				page.setDefaultTimeout(30000);

				onProgress?.({
					type: 'status',
					message: 'Loading Instagram page...',
					timestamp: new Date().toISOString()
				});

				await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30000 });

				// Add small human-like delay
				await page.waitForTimeout(1000 + Math.random() * 2000);

				const result = await extractWithStrategies(url, page, context, onProgress);

				if (!result.success || !result.data) {
					throw new Error(result.error || 'Extraction failed');
				}

				// Debug dump is a convenience only; never let it fail a successful extraction
				try {
					fs.writeFileSync(
						path.resolve('debug_page.txt'),
						`Method: ${result.method}\n\n${result.data.bodyText}`
					);
				} catch (debugError) {
					console.warn('[Extractor] Could not write debug_page.txt:', debugError);
				}

				onProgress?.({
					type: 'complete',
					message: 'Extraction completed successfully',
					method: result.method,
					timestamp: new Date().toISOString()
				});

				return result.data;
			} finally {
				// Closing the context also closes every page that belongs to it
				await context.close();
			}
		},
		DEFAULT_RETRY_CONFIG,
		onProgress
	);
}

/**
 * Screenshot-based thumbnail extraction (fallback method).
 *
 * Captures the part of the first `<video>` element that lies inside the
 * viewport; when there is no video or none of it is visible, captures the
 * default page screenshot instead.
 *
 * @param page - Page that has already navigated to the post
 * @returns JPEG data URI (`data:image/jpeg;base64,...`), or `null` when the
 *          screenshot could not be taken
 */
async function extractThumbnailScreenshot(page: Page): Promise<string | null> {
	try {
		const videoBounds = await page.evaluate(() => {
			const video = document.querySelector('video');
			if (!video) return null;

			// Intersect the element box with the viewport so the clip never
			// reaches past the video's own edges or outside the visible area
			const rect = video.getBoundingClientRect();
			const left = Math.max(0, rect.left);
			const top = Math.max(0, rect.top);
			const right = Math.min(rect.right, window.innerWidth);
			const bottom = Math.min(rect.bottom, window.innerHeight);

			return { x: left, y: top, width: right - left, height: bottom - top };
		});

		let screenshotBuffer: Buffer;

		if (videoBounds && videoBounds.width > 0 && videoBounds.height > 0) {
			// NOTE(review): assumes `clip` uses viewport-relative coordinates, as the
			// original bounding-rect based code did — confirm against Playwright docs
			screenshotBuffer = await page.screenshot({
				type: 'jpeg',
				quality: 85,
				clip: videoBounds
			});
		} else {
			console.warn('[Thumbnail] Video element not found or has no size, taking full page screenshot');
			screenshotBuffer = await page.screenshot({ type: 'jpeg', quality: 85 });
		}

		return `data:image/jpeg;base64,${screenshotBuffer.toString('base64')}`;
	} catch (error) {
		// A missing thumbnail should not abort text extraction in the callers
		console.warn('[Thumbnail] Screenshot capture failed:', error);
		return null;
	}
}

/**
 * Helper: Fetch image from URL and convert to base64 data URI
 *
 * **Validation Criteria:**
 * - HTTP status must be exactly 200 (not 2xx, only 200)
 * - Content-Type must start with 'image/' (e.g., image/jpeg, image/png, image/webp)
 * - Headers AND body must arrive within 10 seconds (the abort timer stays
 *   armed until the body has been read)
 *
 * **Failure Scenarios (all return null and report via the progress callback):**
 * - Non-200 status → reports the status code
 * - Invalid content-type → reports the content-type
 * - Timeout → reports the timeout
 * - Network or other error → reports the error message
 *
 * The abort timer is always cleared before returning, so a failed fetch does
 * not leave a pending 10-second timer behind.
 *
 * **Usage in Fallback Chain:**
 * Used by `extractThumbnailStealth()`, which tries several URL sources and
 * moves on to the next one whenever this function returns null.
 *
 * @param imageUrl - The image URL to fetch; it is passed to `fetch` as-is
 * @param progressCallback - Optional callback for progress reporting
 * @returns Base64 data URI (data:<content-type>;base64,...) or null if validation fails
 *
 * @example
 * ```typescript
 * const thumbnail = await fetchImageAsBase64(
 *   'https://instagram.com/image.jpg',
 *   (event) => console.log(event.message)
 * );
 *
 * if (thumbnail) {
 *   console.log(thumbnail.substring(0, 50)); // "data:image/jpeg;base64,/9j/4AAQSkZJRg..."
 * } else {
 *   // URL validation failed, try next method
 * }
 * ```
 */
async function fetchImageAsBase64(
	imageUrl: string,
	progressCallback?: ProgressCallback
): Promise<string | null> {
	// Create abort controller for timeout
	const controller = new AbortController();
	const timeoutId = setTimeout(() => controller.abort(), 10000); // 10s timeout

	try {
		console.log(`[Thumbnail] Validating URL: ${imageUrl}`);

		const response = await fetch(imageUrl, {
			signal: controller.signal
		});

		// Strict status validation: must be exactly 200
		if (response.status !== 200) {
			console.warn(`[Thumbnail] URL validation failed: HTTP ${response.status} for ${imageUrl}`);
			progressCallback?.({
				type: 'status',
				message: `Thumbnail URL returned HTTP ${response.status}, trying next method...`,
				timestamp: new Date().toISOString()
			});
			return null;
		}

		// Validate content-type
		const contentType = response.headers.get('content-type') || '';
		if (!contentType.startsWith('image/')) {
			console.warn(
				`[Thumbnail] URL validation failed: Invalid content-type '${contentType}' for ${imageUrl}`
			);
			progressCallback?.({
				type: 'status',
				message: `Thumbnail URL returned non-image content (${contentType}), trying next method...`,
				timestamp: new Date().toISOString()
			});
			return null;
		}

		console.log(`[Thumbnail] URL validation successful: ${imageUrl} (${contentType})`);

		// The timer is still armed here, so a stalled body download is aborted too
		const arrayBuffer = await response.arrayBuffer();
		const buffer = Buffer.from(arrayBuffer);

		const base64Data = `data:${contentType};base64,${buffer.toString('base64')}`;

		progressCallback?.({
			type: 'status',
			message: 'Thumbnail fetched and validated from URL',
			timestamp: new Date().toISOString()
		});

		return base64Data;
	} catch (e) {
		if (e instanceof Error && e.name === 'AbortError') {
			console.error(`[Thumbnail] URL fetch timeout: ${imageUrl}`);
			progressCallback?.({
				type: 'status',
				message: 'Thumbnail URL fetch timeout, trying next method...',
				timestamp: new Date().toISOString()
			});
		} else {
			// Non-Error throwables are reported as well, using their string form
			const reason = e instanceof Error ? e.message : String(e);
			console.error(`[Thumbnail] Failed to fetch image from ${imageUrl}:`, reason);
			progressCallback?.({
				type: 'status',
				message: `Thumbnail URL fetch failed (${reason}), trying next method...`,
				timestamp: new Date().toISOString()
			});
		}
		return null;
	} finally {
		clearTimeout(timeoutId);
	}
}

/**
 * Extract thumbnail from Instagram post using stealth techniques
 *
 * Tries multiple sources in order of stealth:
 * 1. Meta tags (`og:image`, then `twitter:image`)
 * 2. Video `poster` attribute
 * 3. Instagram window data structures (`display_url`, `thumbnail_src`)
 * 4. Screenshot fallback
 *
 * **Thumbnail format:** sources 1-3 only locate an image URL; the image is
 * then downloaded by `fetchImageAsBase64`, so what is returned and sent in
 * the `thumbnail` progress event is a base64 data URI
 * (`data:<content-type>;base64,...`), not the original URL. Source 4 returns
 * whatever `extractThumbnailScreenshot` produces.
 *
 * Each source is looked up independently: a missing `og:image` no longer
 * prevents `twitter:image` from being tried. Attribute lookups use a short
 * timeout, because `page.getAttribute` waits for the element to appear and
 * would otherwise block for the page's default timeout when it is absent.
 *
 * @param page - Playwright page instance
 * @param progressCallback - Optional progress callback for SSE updates
 * @returns Thumbnail string, or null if all methods fail
 */
async function extractThumbnailStealth(
	page: Page,
	progressCallback?: ProgressCallback
): Promise<string | null> {
	console.log('[Thumbnail] Starting stealth extraction');

	// Upper bound for waiting on an element that may simply not exist
	const attributeLookupTimeoutMs = 2000;

	const urlSources: Array<{
		locate: () => Promise<string | null>;
		foundLog: string;
		successMessage: string;
		failureLog: string;
	}> = [
		// Method 1: Try meta tags (most stealthy)
		{
			locate: () =>
				page.getAttribute('meta[property="og:image"]', 'content', {
					timeout: attributeLookupTimeoutMs
				}),
			foundLog: '[Thumbnail] Found og:image meta tag',
			successMessage: 'Thumbnail extracted from meta tags',
			failureLog: '[Thumbnail] Meta tag method failed:'
		},
		{
			locate: () =>
				page.getAttribute('meta[name="twitter:image"]', 'content', {
					timeout: attributeLookupTimeoutMs
				}),
			foundLog: '[Thumbnail] Found twitter:image meta tag',
			successMessage: 'Thumbnail extracted from meta tags',
			failureLog: '[Thumbnail] Meta tag method failed:'
		},
		// Method 2: Try video poster attribute
		{
			locate: () => page.getAttribute('video', 'poster', { timeout: attributeLookupTimeoutMs }),
			foundLog: '[Thumbnail] Found video poster attribute',
			successMessage: 'Thumbnail extracted from video poster',
			failureLog: '[Thumbnail] Video poster method failed:'
		},
		// Method 3: Try Instagram window data structures
		{
			locate: () =>
				page.evaluate(() => {
					// Check for Instagram's internal data structures
					const data = (window as any).__additionalDataLoaded;
					if (data) {
						for (const key in data) {
							const media = data[key]?.graphql?.shortcode_media;
							if (media?.display_url) {
								return media.display_url;
							}
							if (media?.thumbnail_src) {
								return media.thumbnail_src;
							}
						}
					}
					return null;
				}),
			foundLog: '[Thumbnail] Found thumbnail in Instagram data structures',
			successMessage: 'Thumbnail extracted from Instagram data',
			failureLog: '[Thumbnail] Instagram data method failed:'
		}
	];

	for (const source of urlSources) {
		try {
			const imageUrl = await source.locate();
			if (!imageUrl) {
				continue;
			}

			console.log(source.foundLog);
			const dataUri = await fetchImageAsBase64(imageUrl, progressCallback);
			if (dataUri) {
				progressCallback?.({
					type: 'thumbnail',
					message: source.successMessage,
					data: { thumbnail: dataUri },
					timestamp: new Date().toISOString()
				});
				return dataUri;
			}
		} catch (e) {
			// Includes lookup timeouts for elements that are not on the page
			console.log(source.failureLog, e);
		}
	}

	// Method 4: Screenshot fallback (existing method)
	console.log('[Thumbnail] Falling back to screenshot method');
	let screenshotThumbnail: string | null = null;
	try {
		screenshotThumbnail = await extractThumbnailScreenshot(page);
	} catch (e) {
		// A missing thumbnail must not fail the surrounding text extraction
		console.warn('[Thumbnail] Screenshot method failed:', e);
	}

	if (screenshotThumbnail && progressCallback) {
		progressCallback({
			type: 'thumbnail',
			message: 'Thumbnail extracted via screenshot',
			data: { thumbnail: screenshotThumbnail },
			timestamp: new Date().toISOString()
		});
	}
	return screenshotThumbnail;
}