Files
insta-recipe/src/lib/server/extraction.ts
Giancarmine Salucci 767b8a1b37 feat(extraction): enhance thumbnail URL validation with strict HTTP 200 check
- Implement strict HTTP 200 validation (reject all other status codes)
- Add content-type validation (must be image/*)
- Add 10-second timeout protection with AbortController
- Thread progressCallback through all fetchImageAsBase64 calls
- Add detailed logging for each validation failure scenario
- Report validation failures via SSE progress callbacks

Unit tests:
- Add comprehensive test coverage for all validation scenarios
- Test HTTP status codes (200, 404, 403, 500, etc.)
- Test content-type validation (image/* vs text/html, etc.)
- Test timeout behavior with AbortController
- Test error handling (network errors, DNS, SSL, etc.)
- Test progress callback reporting

Integration tests:
- Add tests for complete extraction flow with URL failures
- Test fallback chain behavior (meta tags → poster → Instagram data → screenshot)
- Test real-world scenarios (redirects, query params, different post types)

Documentation:
- Enhanced JSDoc with validation criteria
- Added examples showing fallback behavior
- Documented all failure scenarios and their handling

All tests passing 
2025-12-21 05:33:48 +01:00

872 lines
24 KiB
TypeScript

import { createBrowserContext } from './browser';
import fs from 'fs';
import path from 'path';
import type { Page, BrowserContext } from 'playwright';
/**
 * Final result of an extraction: the caption text plus an optional thumbnail.
 */
export interface ExtractedContent {
// Caption / body text of the post.
bodyText: string;
// Thumbnail as a data URI string, or null when none was obtained
// (the GraphQL strategy in this file always reports null).
thumbnail: string | null;
}
/** Identifier of each extraction strategy, in the order `extractWithStrategies` tries them. */
export type ExtractionMethod = 'embedded-json' | 'dom-selector' | 'graphql-api' | 'legacy';
/**
 * Kind of progress notification:
 * - 'status': general informational message
 * - 'method': an extraction strategy is about to be tried
 * - 'retry': an attempt failed and another will follow after a delay
 * - 'error': a non-retriable error was hit
 * - 'thumbnail': a thumbnail was obtained (carried in `data.thumbnail`)
 * - 'complete': the extraction finished successfully
 */
export type ProgressEventType = 'status' | 'method' | 'retry' | 'error' | 'thumbnail' | 'complete';
/** A single progress notification delivered to a `ProgressCallback`. */
export interface ProgressEvent {
type: ProgressEventType;
// Human-readable description of what happened.
message: string;
// Strategy the event refers to, when applicable.
method?: ExtractionMethod;
// 1-based number of the attempt that just failed (set on 'retry' events).
attemptNumber?: number;
// Total number of attempts allowed (set on 'retry' events).
maxAttempts?: number;
// Free-form payload; 'thumbnail' events in this file send `{ thumbnail: string }`.
data?: any;
// ISO-8601 timestamp (producers in this file use `new Date().toISOString()`).
timestamp?: string;
}
/** Receiver of progress notifications emitted while an extraction runs. */
export type ProgressCallback = (event: ProgressEvent) => void;
/**
 * Outcome of running the strategy chain (`extractWithStrategies`).
 * On success `method` and `data` are set; on failure `error` is set.
 */
interface ExtractionResult {
success: boolean;
// Strategy that produced `data`.
method?: ExtractionMethod;
data?: ExtractedContent;
// Failure description when no strategy succeeded.
error?: string;
}
/**
 * Subset of the JSON assigned to `window._sharedData` that this module reads
 * when parsing embedded script data. Every level is optional because the
 * payload is external and may be missing any part.
 */
interface InstagramEmbeddedData {
entry_data?: {
PostPage?: Array<{
graphql?: {
shortcode_media?: {
// Caption text lives in edges[].node.text.
edge_media_to_caption?: {
edges?: Array<{ node: { text: string } }>;
};
display_url?: string;
video_url?: string;
owner?: {
username: string;
profile_pic_url: string;
};
};
};
}>;
};
}
/** Tuning knobs for `withRetry`'s exponential backoff. */
interface RetryConfig {
// Total number of times the operation is invoked before giving up.
maxAttempts: number;
// Delay before the second attempt, in milliseconds.
initialDelayMs: number;
// Upper bound for the delay between attempts, in milliseconds.
maxDelayMs: number;
// Factor the delay is multiplied by after each failed attempt.
backoffMultiplier: number;
}
// Defaults: 3 attempts, waiting 1s then 2s between them (delay capped at 10s).
const DEFAULT_RETRY_CONFIG: RetryConfig = {
maxAttempts: 3,
initialDelayMs: 1000,
maxDelayMs: 10000,
backoffMultiplier: 2
};
/**
 * Locate the stored authentication state file.
 *
 * The Docker location takes precedence over the local, working-directory
 * relative location.
 *
 * @returns The first candidate path that exists, or `undefined` if neither does
 */
function resolveAuthPath(): string | undefined {
  // Ordered by priority: container path first, then the local checkout path.
  const candidates = ['/app/secrets/auth.json', './secrets/auth.json'];
  return candidates.find((candidate) => fs.existsSync(candidate));
}
/**
 * Pause for the given number of milliseconds (used between retry attempts).
 *
 * @param ms - Delay in milliseconds
 */
async function sleep(ms: number): Promise<void> {
  await new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
/**
 * Decide whether an error is pointless to retry.
 *
 * Authentication failures and invalid URLs will fail the same way on every
 * attempt, so retrying only wastes time. The message is matched
 * case-insensitively because real-world messages capitalise these phrases
 * differently (e.g. "Cannot navigate to invalid URL", "Login required").
 *
 * @param error - The value that was thrown
 * @returns `true` when the error should abort the retry loop immediately;
 *          `false` for every other error and for non-`Error` values
 */
function isNonRetriableError(error: unknown): boolean {
  if (!(error instanceof Error)) {
    return false;
  }
  const message = error.message.toLowerCase();
  const nonRetriableMarkers = ['authentication', 'login required', 'invalid url'];
  return nonRetriableMarkers.some((marker) => message.includes(marker));
}
/**
 * Map an extraction method identifier to the label shown in logs and
 * progress messages.
 *
 * @param method - Extraction method identifier
 * @returns Human-readable name of the method
 */
function getMethodDisplayName(method: ExtractionMethod): string {
  switch (method) {
    case 'embedded-json':
      return 'Embedded JSON';
    case 'dom-selector':
      return 'DOM Selector';
    case 'graphql-api':
      return 'GraphQL API';
    case 'legacy':
      return 'Legacy Parser';
  }
}
/**
 * Invoke `fn`, retrying with exponential backoff while it keeps rejecting.
 *
 * A non-retriable error (see `isNonRetriableError`) is reported as an 'error'
 * progress event and rethrown at once. Any other failure is reported as a
 * 'retry' event and followed by a pause, unless it was the final attempt.
 *
 * @param fn - Operation to run; called once per attempt
 * @param config - Attempt count and backoff settings
 * @param onProgress - Optional receiver for 'retry' / 'error' events
 * @returns Whatever `fn` resolves with
 * @throws The last failure once all attempts are used up
 */
async function withRetry<T>(
  fn: () => Promise<T>,
  config: RetryConfig = DEFAULT_RETRY_CONFIG,
  onProgress?: ProgressCallback
): Promise<T> {
  let failure: Error | null = null;
  let waitMs = config.initialDelayMs;
  let attempt = 0;

  while (attempt < config.maxAttempts) {
    attempt += 1;
    try {
      return await fn();
    } catch (caught) {
      failure = caught as Error;

      // Some failures will never succeed on a later attempt: give up now.
      if (isNonRetriableError(caught)) {
        onProgress?.({
          type: 'error',
          message: `Non-retriable error: ${failure.message}`,
          timestamp: new Date().toISOString()
        });
        throw caught;
      }

      // Only announce and wait when another attempt will actually follow.
      if (attempt < config.maxAttempts) {
        const message = `Attempt ${attempt}/${config.maxAttempts} failed. Retrying in ${waitMs}ms...`;
        console.warn(`[Retry] ${message}`, caught);
        onProgress?.({
          type: 'retry',
          message,
          attemptNumber: attempt,
          maxAttempts: config.maxAttempts,
          timestamp: new Date().toISOString()
        });
        await sleep(waitMs);
        // Grow the delay for the next round, never beyond the configured cap.
        waitMs = Math.min(waitMs * config.backoffMultiplier, config.maxDelayMs);
      }
    }
  }

  throw failure || new Error('Max retry attempts exceeded');
}
/**
 * Pull the post shortcode out of an Instagram URL.
 *
 * Recognises the `/p/<code>`, `/reel/<code>` and `/tv/<code>` path forms.
 *
 * @param url - Instagram post URL
 * @returns The shortcode, or `null` when the URL has none of those forms
 */
function extractShortcode(url: string): string | null {
  const found = /\/(?:p|reel|tv)\/([A-Za-z0-9_-]+)/.exec(url);
  return found === null ? null : found[1];
}
/**
 * Clean extracted caption text.
 *
 * Removes whole lines that are Instagram UI chrome ("More posts from ...",
 * "View all N comments", "Add a comment...", "Liked by ..."), then collapses
 * every run of whitespace (including newlines) into a single space.
 *
 * @param text - Raw extracted text, possibly spanning several lines
 * @returns Single-line text without UI lines or surrounding whitespace
 */
function cleanText(text: string): string {
  // These patterns are line-anchored (`^...$` with the `m` flag), so they must
  // run while the line breaks still exist. Collapsing whitespace first would
  // turn the text into one line: UI lines in the middle would never match,
  // and a text that merely starts with one would be wiped out completely.
  const uiPatterns = [
    /^\s*More posts from.+$/gim,
    /^\s*View all \d+ comments$/gim,
    /^\s*Add a comment\.\.\.$/gim,
    /^\s*Liked by.+$/gim
  ];
  const withoutUiLines = uiPatterns.reduce(
    (remaining, pattern) => remaining.replace(pattern, ''),
    text
  );

  // Collapse excessive whitespace only after the UI lines are gone.
  return withoutUiLines.replace(/\s+/g, ' ').trim();
}
/**
 * Strategy 1: read the post data Instagram embeds in inline script tags.
 *
 * For every `script[type="text/javascript"]` on the page, the
 * `window._sharedData = {...};` assignment is tried first and the
 * `window.__additionalDataLoaded(path, {...});` call second. The first payload
 * that yields caption text wins, and a thumbnail is then extracted for it.
 *
 * @param page - Playwright page that has already loaded the post
 * @param progressCallback - Optional progress receiver, forwarded to thumbnail extraction
 * @returns Caption plus thumbnail, or `null` when no usable payload is found
 */
async function extractFromEmbeddedJSON(
  page: Page,
  progressCallback?: ProgressCallback
): Promise<ExtractedContent | null> {
  try {
    const scriptContents = await page.evaluate(() =>
      Array.from(
        document.querySelectorAll('script[type="text/javascript"]'),
        (node) => node.textContent || ''
      )
    );

    // Known embedding patterns, in priority order, each with its own warning text.
    const embeddings = [
      {
        pattern: /window\._sharedData\s*=\s*(\{.+?\});/s,
        warning: 'Failed to parse _sharedData:'
      },
      {
        pattern: /window\.__additionalDataLoaded\([^,]+,\s*(\{.+?\})\);/s,
        warning: 'Failed to parse __additionalDataLoaded:'
      }
    ];

    for (const content of scriptContents) {
      for (const { pattern, warning } of embeddings) {
        const found = content.match(pattern);
        if (!found) {
          continue;
        }
        try {
          const parsed = parseInstagramData(JSON.parse(found[1]));
          if (parsed) {
            const thumbnail = await extractThumbnailStealth(page, progressCallback);
            return { ...parsed, thumbnail };
          }
        } catch (e) {
          // A malformed payload is not fatal: keep scanning the remaining candidates.
          console.warn(warning, e);
        }
      }
    }

    return null;
  } catch (error) {
    console.warn('Failed to extract from embedded JSON:', error);
    return null;
  }
}
/**
 * Extract caption text from a parsed Instagram payload.
 *
 * The primary location is `entry_data.PostPage[0].graphql.shortcode_media`.
 * When that is absent, `items` or `data.shortcode_media` is handed to
 * `extractFromAlternativeStructure` instead.
 *
 * @param data - Parsed JSON payload of unknown shape
 * @returns Cleaned caption text, or `null` when none is found or parsing throws
 */
function parseInstagramData(data: any): Omit<ExtractedContent, 'thumbnail'> | null {
  try {
    const media = data?.entry_data?.PostPage?.[0]?.graphql?.shortcode_media;

    if (media) {
      // One caption fragment per edge; fragments are joined line by line.
      const edges = media.edge_media_to_caption?.edges || [];
      const bodyText = edges.map(({ node }: any) => node.text).join('\n');
      return bodyText ? { bodyText: cleanText(bodyText) } : null;
    }

    // Primary structure missing: fall back to the alternative layouts.
    const items = data?.items || data?.data?.shortcode_media;
    return items ? extractFromAlternativeStructure(items) : null;
  } catch (error) {
    console.warn('Failed to parse Instagram data structure:', error);
    return null;
  }
}
/**
 * Extract caption text from the alternative Instagram payload layouts.
 *
 * Accepts either a single media object or an array of them (only the first
 * element is inspected). The caption is read from `caption.text`, falling
 * back to `edge_media_to_caption.edges[0].node.text`.
 *
 * @param items - Media object, or array of media objects
 * @returns Cleaned caption text, or `null` when no caption is present
 */
function extractFromAlternativeStructure(items: any): Omit<ExtractedContent, 'thumbnail'> | null {
  try {
    const entry = Array.isArray(items) ? items[0] : items;
    const caption = entry?.caption?.text || entry?.edge_media_to_caption?.edges?.[0]?.node?.text;

    if (!caption) {
      return null;
    }
    return { bodyText: cleanText(caption) };
  } catch (error) {
    console.warn('Failed to parse alternative structure:', error);
    return null;
  }
}
/**
 * Strategy 2: read the caption straight from the rendered DOM.
 *
 * Candidate sources, in order: `h1[dir="auto"]`, then
 * `article div._a9zs, article span`, then the `og:description` meta tag.
 * When a non-empty caption is found, a thumbnail is extracted as well.
 *
 * @param page - Playwright page that has already loaded the post
 * @param progressCallback - Optional progress receiver, forwarded to thumbnail extraction
 * @returns Caption plus thumbnail, or `null` when no caption is found or an error occurs
 */
async function extractFromDOM(
  page: Page,
  progressCallback?: ProgressCallback
): Promise<ExtractedContent | null> {
  try {
    const captionText = await page.evaluate(() => {
      // Heading element: the most reliable place for the caption.
      const headingText = document.querySelector('h1[dir="auto"]')?.textContent;
      if (headingText) {
        return headingText.trim();
      }

      // Caption container inside the article.
      const articleText = document.querySelector('article div._a9zs, article span')?.textContent;
      if (articleText) {
        return articleText.trim();
      }

      // Last resort: the Open Graph description.
      const metaDesc = document.querySelector('meta[property="og:description"]');
      return metaDesc ? metaDesc.getAttribute('content') || '' : null;
    });

    if (!captionText) {
      return null;
    }

    const thumbnail = await extractThumbnailStealth(page, progressCallback);
    return { bodyText: cleanText(captionText), thumbnail };
  } catch (error) {
    console.warn('Failed to extract from DOM:', error);
    return null;
  }
}
/**
 * Strategy 3: fetch the caption through Instagram's GraphQL endpoint.
 *
 * Opens a temporary page in the given context to issue the request, and
 * always closes that page again, including when the request or the JSON
 * decoding throws.
 *
 * @param url - Instagram post URL; must contain a shortcode
 * @param context - Browser context to open the temporary page in
 * @returns Cleaned caption with a `null` thumbnail, or `null` when the URL has
 *          no shortcode, the request fails, or the response has no caption
 */
async function extractViaGraphQL(
  url: string,
  context: BrowserContext
): Promise<ExtractedContent | null> {
  const shortcode = extractShortcode(url);
  if (!shortcode) {
    console.warn('Could not extract shortcode from URL:', url);
    return null;
  }

  let page: Page | undefined;
  try {
    page = await context.newPage();

    // Make GraphQL request
    const response = await page.request.post('https://www.instagram.com/graphql/query/', {
      form: {
        variables: JSON.stringify({ shortcode }),
        doc_id: '7950326061742207' // May need periodic updates
      }
    });

    if (!response.ok()) {
      console.warn(`GraphQL request failed: ${response.status()}`);
      return null;
    }

    // Parse GraphQL response
    const data = await response.json();
    const bodyText = data?.data?.shortcode_media?.edge_media_to_caption?.edges?.[0]?.node?.text || '';
    if (!bodyText) {
      return null;
    }

    return {
      bodyText: cleanText(bodyText),
      thumbnail: null // GraphQL doesn't easily provide thumbnail, would need page context
    };
  } catch (error) {
    console.error('GraphQL extraction failed:', error);
    return null;
  } finally {
    // Close on every exit path so a failed request cannot leak the page.
    if (page) {
      try {
        await page.close();
      } catch (closeError) {
        console.warn('GraphQL extraction: failed to close page:', closeError);
      }
    }
  }
}
/**
 * Strategy 4: legacy fallback that scrapes the page's visible text.
 *
 * Drops the first six lines of `document.body.innerText`, keeps only what
 * precedes "More posts from", trims it, and strips @mentions and #hashtags.
 *
 * @param page - Playwright page that has already loaded the post
 * @returns The remaining text (may be empty)
 */
async function extractCleanTextLegacy(page: Page): Promise<string> {
  const visibleText = await page.evaluate(() => document.body.innerText);

  // Remove first 6 lines
  const withoutLeadingLines = visibleText.replace(/^(?:.*\n){6}/, '');
  // Cut at "More posts from"
  const [beforeMorePosts] = withoutLeadingLines.split('More posts from');

  // Remove mentions and hashtags
  return beforeMorePosts.trim().replace(/@\w+/g, '').replace(/#\w+/g, '');
}
/**
 * Run the extraction strategies in priority order until one yields text.
 *
 * Order: embedded JSON, DOM selectors, GraphQL API, legacy text scrape.
 * Each attempt is announced through `onProgress`. A strategy that throws or
 * produces no text is skipped in favour of the next one.
 *
 * @param url - Instagram post URL (needed by the GraphQL strategy)
 * @param page - Page that has already loaded the post
 * @param context - Browser context (needed by the GraphQL strategy)
 * @param onProgress - Optional progress receiver
 * @returns Success with the winning method and its data, or a failure result
 */
async function extractWithStrategies(
  url: string,
  page: Page,
  context: BrowserContext,
  onProgress?: ProgressCallback
): Promise<ExtractionResult> {
  // The legacy strategy has to assemble its result from two separate helpers.
  const runLegacy = async (): Promise<ExtractedContent | null> => {
    const bodyText = await extractCleanTextLegacy(page);
    const thumbnail = await extractThumbnailStealth(page, onProgress);
    return { bodyText, thumbnail };
  };

  const strategies: Array<[ExtractionMethod, () => Promise<ExtractedContent | null>]> = [
    ['embedded-json', () => extractFromEmbeddedJSON(page, onProgress)],
    ['dom-selector', () => extractFromDOM(page, onProgress)],
    ['graphql-api', () => extractViaGraphQL(url, context)],
    ['legacy', runLegacy]
  ];

  for (const [name, run] of strategies) {
    try {
      const methodMessage = `Trying extraction method: ${getMethodDisplayName(name)}`;
      console.log(`[Extractor] ${methodMessage}`);
      onProgress?.({
        type: 'method',
        message: methodMessage,
        method: name,
        timestamp: new Date().toISOString()
      });

      const content = await run();
      if (!content || !content.bodyText) {
        // Nothing usable: move on to the next strategy.
        continue;
      }

      const successMessage = `✓ Success with method: ${getMethodDisplayName(name)}`;
      console.log(`[Extractor] ${successMessage}`);
      onProgress?.({
        type: 'status',
        message: successMessage,
        method: name,
        timestamp: new Date().toISOString()
      });
      return { success: true, method: name, data: content };
    } catch (error) {
      // A failing strategy must not stop the chain.
      console.warn(`[Extractor] Method ${name} failed:`, error);
    }
  }

  return {
    success: false,
    error: 'All extraction methods failed'
  };
}
/**
 * Extract text content and thumbnail from a URL using a Playwright browser.
 *
 * Loads the page in a fresh browser context (with stored authentication when
 * available), runs the extraction strategies with fallback, and retries the
 * whole sequence with exponential backoff on failure.
 *
 * Resource handling: the browser context is closed on every exit path, which
 * also closes any page opened in it. Failures while closing the context, or
 * while writing the `debug_page.txt` debug dump, are logged but never discard
 * an otherwise successful extraction.
 *
 * @param url - The URL to extract from
 * @param onProgress - Optional callback to receive progress updates
 * @returns Extracted text and thumbnail
 * @throws When every attempt fails, or immediately on a non-retriable error
 */
export async function extractTextAndThumbnail(
  url: string,
  onProgress?: ProgressCallback
): Promise<ExtractedContent> {
  onProgress?.({
    type: 'status',
    message: 'Starting extraction...',
    timestamp: new Date().toISOString()
  });

  return withRetry(
    async () => {
      const authPath = resolveAuthPath();
      const context = await createBrowserContext(authPath);

      try {
        // Opened inside the try so a failure here still releases the context.
        const page = await context.newPage();

        // Set timeout
        page.setDefaultTimeout(30000);

        onProgress?.({
          type: 'status',
          message: 'Loading Instagram page...',
          timestamp: new Date().toISOString()
        });
        await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30000 });

        // Add small human-like delay
        await page.waitForTimeout(1000 + Math.random() * 2000);

        const result = await extractWithStrategies(url, page, context, onProgress);
        if (!result.success || !result.data) {
          throw new Error(result.error || 'Extraction failed');
        }

        // Save debug content. Best effort only: an unwritable working
        // directory must not fail (and retry) a successful extraction.
        try {
          fs.writeFileSync(
            path.resolve('debug_page.txt'),
            `Method: ${result.method}\n\n${result.data.bodyText}`
          );
        } catch (debugError) {
          console.warn('[Extractor] Could not write debug_page.txt:', debugError);
        }

        onProgress?.({
          type: 'complete',
          message: 'Extraction completed successfully',
          method: result.method,
          timestamp: new Date().toISOString()
        });
        return result.data;
      } finally {
        // Closing the context closes all of its pages. A cleanup failure is
        // logged rather than thrown so it cannot mask the real outcome.
        try {
          await context.close();
        } catch (closeError) {
          console.warn('[Extractor] Failed to close browser context:', closeError);
        }
      }
    },
    DEFAULT_RETRY_CONFIG,
    onProgress
  );
}
/**
 * Screenshot-based thumbnail extraction (fallback method).
 *
 * Captures the area of the first `<video>` element, clamped to the viewport.
 * When there is no video, or it has no visible size, the default page
 * screenshot is taken instead.
 *
 * @param page - Playwright page that has already loaded the post
 * @returns JPEG screenshot encoded as a `data:image/jpeg;base64,...` URI
 */
async function extractThumbnailScreenshot(page: Page): Promise<string | null> {
  const clip = await page.evaluate(() => {
    const rect = document.querySelector('video')?.getBoundingClientRect();
    if (!rect) return null;
    // Clamp to the viewport so the clip never reaches outside the visible area.
    return {
      x: Math.max(0, rect.left),
      y: Math.max(0, rect.top),
      width: Math.min(rect.width, window.innerWidth),
      height: Math.min(rect.height, window.innerHeight)
    };
  });

  const jpegOptions = { type: 'jpeg', quality: 85 } as const;
  let shot: Buffer;
  if (clip && clip.width > 0 && clip.height > 0) {
    shot = await page.screenshot({ ...jpegOptions, clip });
  } else {
    console.warn('[Thumbnail] Video element not found or has no size, taking full page screenshot');
    shot = await page.screenshot({ ...jpegOptions });
  }

  return `data:image/jpeg;base64,${shot.toString('base64')}`;
}
/**
 * Helper: Fetch image from URL and convert to base64 data URI
 *
 * **Validation Criteria:**
 * - HTTP status must be exactly 200 (not 2xx, only 200)
 * - Content-Type must start with 'image/' (e.g., image/jpeg, image/png, image/webp);
 *   the comparison is case-insensitive
 * - The whole request, headers AND body download, must complete within 10 seconds
 *
 * **Failure Scenarios:**
 * - Non-200 status → Returns null, reports status code via progress callback
 * - Invalid content-type → Returns null, reports content-type via progress callback
 * - Timeout → Returns null, reports timeout via progress callback
 * - Network error → Returns null, reports error message via progress callback
 *
 * This function never throws for fetch-related failures; a null return tells
 * the caller to try its next thumbnail source.
 *
 * @param imageUrl - The image URL to fetch
 * @param progressCallback - Optional callback for progress reporting
 * @returns Base64 data URI (data:image/*;base64,...) or null if validation fails
 *
 * @example
 * ```typescript
 * const thumbnail = await fetchImageAsBase64(
 *   'https://instagram.com/image.jpg',
 *   (event) => console.log(event.message)
 * );
 *
 * if (thumbnail) {
 *   // thumbnail is a valid base64 data URI
 *   console.log(thumbnail.substring(0, 50)); // "data:image/jpeg;base64,/9j/4AAQSkZJRg..."
 * } else {
 *   // URL validation failed, try next method
 * }
 * ```
 */
async function fetchImageAsBase64(
  imageUrl: string,
  progressCallback?: ProgressCallback
): Promise<string | null> {
  // Create abort controller for timeout. The timer stays armed until the body
  // has been fully read (it is cleared in `finally`), so a server that sends
  // headers quickly but stalls on the body is still cut off after 10 seconds.
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), 10000); // 10s timeout

  try {
    console.log(`[Thumbnail] Validating URL: ${imageUrl}`);
    const response = await fetch(imageUrl, {
      signal: controller.signal
    });

    // Strict status validation: must be exactly 200
    if (response.status !== 200) {
      console.warn(`[Thumbnail] URL validation failed: HTTP ${response.status} for ${imageUrl}`);
      progressCallback?.({
        type: 'status',
        message: `Thumbnail URL returned HTTP ${response.status}, trying next method...`,
        timestamp: new Date().toISOString()
      });
      return null;
    }

    // Validate content-type (media types are case-insensitive)
    const contentType = response.headers.get('content-type') || '';
    if (!contentType.toLowerCase().startsWith('image/')) {
      console.warn(
        `[Thumbnail] URL validation failed: Invalid content-type '${contentType}' for ${imageUrl}`
      );
      progressCallback?.({
        type: 'status',
        message: `Thumbnail URL returned non-image content (${contentType}), trying next method...`,
        timestamp: new Date().toISOString()
      });
      return null;
    }

    console.log(`[Thumbnail] URL validation successful: ${imageUrl} (${contentType})`);

    // Body download; still covered by the abort timer armed above.
    const arrayBuffer = await response.arrayBuffer();
    const buffer = Buffer.from(arrayBuffer);
    const base64Data = `data:${contentType};base64,${buffer.toString('base64')}`;

    progressCallback?.({
      type: 'status',
      message: 'Thumbnail fetched and validated from URL',
      timestamp: new Date().toISOString()
    });
    return base64Data;
  } catch (e) {
    if (e instanceof Error) {
      if (e.name === 'AbortError') {
        console.error(`[Thumbnail] URL fetch timeout: ${imageUrl}`);
        progressCallback?.({
          type: 'status',
          message: 'Thumbnail URL fetch timeout, trying next method...',
          timestamp: new Date().toISOString()
        });
      } else {
        console.error(`[Thumbnail] Failed to fetch image from ${imageUrl}:`, e.message);
        progressCallback?.({
          type: 'status',
          message: `Thumbnail URL fetch failed (${e.message}), trying next method...`,
          timestamp: new Date().toISOString()
        });
      }
    } else {
      console.error('[Thumbnail] Failed to fetch image:', e);
    }
    return null;
  } finally {
    // Always disarm the timer, including when fetch itself rejected.
    clearTimeout(timeoutId);
  }
}
/**
 * Upper bound, in milliseconds, for waiting on an optional element.
 * Without it `page.getAttribute` waits for a missing element up to the page's
 * default timeout before failing, stalling thumbnail extraction on posts that
 * simply lack the tag.
 */
const THUMBNAIL_ATTRIBUTE_TIMEOUT_MS = 2000;

/**
 * Read an attribute from the first element matching `selector`.
 * Returns null (instead of throwing) when the element does not show up in time.
 */
async function readOptionalAttribute(
  page: Page,
  selector: string,
  attribute: string
): Promise<string | null> {
  try {
    return await page.getAttribute(selector, attribute, {
      timeout: THUMBNAIL_ATTRIBUTE_TIMEOUT_MS
    });
  } catch {
    // Element absent (or lookup failed): the caller moves on to its next source.
    return null;
  }
}

/**
 * Download and validate a candidate thumbnail URL, emitting a 'thumbnail'
 * progress event with `successMessage` when it yields an image.
 */
async function fetchThumbnailCandidate(
  imageUrl: string,
  successMessage: string,
  progressCallback?: ProgressCallback
): Promise<string | null> {
  const dataUri = await fetchImageAsBase64(imageUrl, progressCallback);
  if (dataUri) {
    progressCallback?.({
      type: 'thumbnail',
      message: successMessage,
      data: { thumbnail: dataUri },
      timestamp: new Date().toISOString()
    });
  }
  return dataUri;
}

/**
 * Extract thumbnail from Instagram post using stealth techniques
 *
 * Tries multiple sources in order of stealth; each URL found is downloaded and
 * validated with `fetchImageAsBase64`, and the next source is tried when that
 * fails:
 * 1. Meta tags (og:image, then twitter:image; each tried independently)
 * 2. Video poster attribute
 * 3. Instagram window data structures (display_url, thumbnail_src)
 * 4. Screenshot fallback
 *
 * **Thumbnail Format:** every method returns a base64 data URI
 * (`data:image/...;base64,...`). Methods 1-3 return the downloaded image,
 * method 4 a JPEG screenshot.
 *
 * @param page - Playwright page instance
 * @param progressCallback - Optional progress callback for SSE updates
 * @returns Base64 data URI of the thumbnail, or null if all methods fail
 *          (including a failing screenshot)
 */
async function extractThumbnailStealth(
  page: Page,
  progressCallback?: ProgressCallback
): Promise<string | null> {
  console.log('[Thumbnail] Starting stealth extraction');

  // Method 1: Try meta tags (most stealthy). Each tag gets its own attempt so
  // a missing og:image cannot prevent twitter:image from being tried.
  const metaTags = [
    { label: 'og:image', selector: 'meta[property="og:image"]' },
    { label: 'twitter:image', selector: 'meta[name="twitter:image"]' }
  ];
  for (const { label, selector } of metaTags) {
    try {
      const metaUrl = await readOptionalAttribute(page, selector, 'content');
      if (metaUrl) {
        console.log(`[Thumbnail] Found ${label} meta tag`);
        const thumbnail = await fetchThumbnailCandidate(
          metaUrl,
          'Thumbnail extracted from meta tags',
          progressCallback
        );
        if (thumbnail) {
          return thumbnail;
        }
      }
    } catch (e) {
      console.log('[Thumbnail] Meta tag method failed:', e);
    }
  }

  // Method 2: Try video poster attribute
  try {
    const poster = await readOptionalAttribute(page, 'video', 'poster');
    if (poster) {
      console.log('[Thumbnail] Found video poster attribute');
      const thumbnail = await fetchThumbnailCandidate(
        poster,
        'Thumbnail extracted from video poster',
        progressCallback
      );
      if (thumbnail) {
        return thumbnail;
      }
    }
  } catch (e) {
    console.log('[Thumbnail] Video poster method failed:', e);
  }

  // Method 3: Try Instagram window data structures
  try {
    const thumbnailUrl = await page.evaluate(() => {
      // Check for Instagram's internal data structures
      const data = (window as any).__additionalDataLoaded;
      if (data) {
        // Navigate through Instagram's data structure
        for (const key in data) {
          const item = data[key];
          if (item?.graphql?.shortcode_media?.display_url) {
            return item.graphql.shortcode_media.display_url;
          }
          if (item?.graphql?.shortcode_media?.thumbnail_src) {
            return item.graphql.shortcode_media.thumbnail_src;
          }
        }
      }
      return null;
    });
    if (thumbnailUrl) {
      console.log('[Thumbnail] Found thumbnail in Instagram data structures');
      const thumbnail = await fetchThumbnailCandidate(
        thumbnailUrl,
        'Thumbnail extracted from Instagram data',
        progressCallback
      );
      if (thumbnail) {
        return thumbnail;
      }
    }
  } catch (e) {
    console.log('[Thumbnail] Instagram data method failed:', e);
  }

  // Method 4: Screenshot fallback (existing method)
  console.log('[Thumbnail] Falling back to screenshot method');
  let screenshotThumbnail: string | null;
  try {
    screenshotThumbnail = await extractThumbnailScreenshot(page);
  } catch (e) {
    // Honour the "null if all methods fail" contract: a missing thumbnail must
    // not throw away text the caller has already extracted.
    console.error('[Thumbnail] Screenshot fallback failed:', e);
    return null;
  }
  if (screenshotThumbnail && progressCallback) {
    progressCallback({
      type: 'thumbnail',
      message: 'Thumbnail extracted via screenshot',
      data: { thumbnail: screenshotThumbnail },
      timestamp: new Date().toISOString()
    });
  }
  return screenshotThumbnail;
}