Files
insta-recipe/src/lib/server/extraction.ts
Giancarmine Salucci 226b2e7f15
Some checks failed
Build & Push Docker Image / test-and-build (push) Failing after 33s
fix(extraction): always use DOM extraction, never trust GraphQL caption
Instagram's GraphQL API silently truncates captions WITHOUT '….' markers.
Both DWWxiymssxE (393 chars full, 327 from API) and DXT73izCBoH
(744+ chars full, cut mid-sentence) were affected.

Remove the GraphQL-interception shortcut entirely. Always use DOM
extraction (HTML Section) which clicks '… more' to get the complete text.

The intercepted GraphQL caption is kept only as emergency fallback if
all DOM strategies fail.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-05-13 02:24:40 +02:00

1733 lines
50 KiB
TypeScript

import { createBrowserContext } from './browser';
import { logError } from './utils/logger';
import fs from 'fs';
import path from 'path';
import type { Page, BrowserContext } from 'playwright';
/**
 * Payload produced by an extraction strategy.
 */
export interface ExtractedContent {
/** Caption text of the post. Most strategies in this file pass it through `cleanText` first. */
bodyText: string;
/** Thumbnail for the post, or `null` when none is available (the GraphQL API strategy always returns `null`). */
thumbnail: string | null;
}
/**
 * Identifier of an extraction strategy.
 * Every member has a human-readable label in `getMethodDisplayName`.
 */
export type ExtractionMethod =
| 'embedded-json'
| 'internal-state'
| 'html-section'
| 'dom-selector'
| 'graphql-api'
| 'graphql-intercept'
| 'legacy';
/**
 * A `<span>` considered as the post caption by `extractFromHTMLSection`,
 * together with the heuristic score it received.
 */
type CaptionCandidate = {
/** The DOM element that was scored. */
element: Element;
/** The element's `textContent`. */
text: string;
/** Heuristic score; the highest-scoring candidate wins. */
score: number;
/** The element's `innerHTML`, kept so `<br>` tags can later be turned into newlines. */
innerHTML: string;
/** Number of `<br>` tags found in `innerHTML`. */
brCount: number;
};
/**
 * Kinds of progress events that can be reported through a `ProgressCallback`.
 * `withRetry` emits `'error'` and `'retry'` events.
 */
export type ProgressEventType =
| 'status'
| 'method'
| 'retry'
| 'error'
| 'thumbnail'
| 'complete'
| 'model_loading';
/**
 * A single progress notification delivered to a `ProgressCallback`.
 */
export interface ProgressEvent {
/** Category of the event. */
type: ProgressEventType;
/** Human-readable description of what happened. */
message: string;
/** Extraction method the event refers to, when applicable. */
method?: ExtractionMethod;
/** 1-based number of the attempt that just failed (set on `'retry'` events by `withRetry`). */
attemptNumber?: number;
/** Total number of attempts allowed (set on `'retry'` events by `withRetry`). */
maxAttempts?: number;
/** Free-form payload; its shape is not constrained here. */
data?: any;
/** Time of the event; `withRetry` fills it with `new Date().toISOString()`. */
timestamp?: string;
}
/** Listener invoked with each `ProgressEvent`; its return value is ignored. */
export type ProgressCallback = (event: ProgressEvent) => void;
/**
 * Outcome of running the extraction strategies (the return type of `extractWithStrategies`).
 */
interface ExtractionResult {
/** Whether extraction succeeded. */
success: boolean;
/** Strategy the result refers to — presumably the one that produced `data`; confirm against the producer. */
method?: ExtractionMethod;
/** Extracted content — presumably present on success; confirm against the producer. */
data?: ExtractedContent;
/** Error description — presumably present on failure; confirm against the producer. */
error?: string;
}
/**
 * Partial shape of the `window._sharedData` payload parsed by `extractFromEmbeddedJSON`.
 * Every level is optional because the payload comes from the page and is not validated.
 */
interface InstagramEmbeddedData {
entry_data?: {
PostPage?: Array<{
graphql?: {
shortcode_media?: {
// Caption edges; `parseInstagramData` joins their `node.text` values with newlines.
edge_media_to_caption?: {
edges?: Array<{ node: { text: string } }>;
};
display_url?: string;
video_url?: string;
owner?: {
username: string;
profile_pic_url: string;
};
};
};
}>;
};
}
/**
 * Tuning knobs for `withRetry`.
 */
interface RetryConfig {
/** Total number of attempts, including the first one. */
maxAttempts: number;
/** Delay in milliseconds before the first retry. */
initialDelayMs: number;
/** Upper bound in milliseconds for the delay between attempts. */
maxDelayMs: number;
/** Factor the delay is multiplied by after each failed attempt. */
backoffMultiplier: number;
}
/**
 * Retry settings used by `withRetry` when the caller passes none:
 * three attempts, waiting 1 s and then 2 s between them (delays are capped at 10 s).
 */
const DEFAULT_RETRY_CONFIG: RetryConfig = {
maxAttempts: 3,
initialDelayMs: 1000,
maxDelayMs: 10000,
backoffMultiplier: 2
};
/**
 * Locate the stored authentication file.
 *
 * Candidate locations are probed in priority order — the Docker path first,
 * then the project-relative path — and the first one that exists wins.
 *
 * @returns The first existing candidate path, or `undefined` when neither exists.
 */
function resolveAuthPath(): string | undefined {
  const candidatePaths = ['/app/secrets/auth.json', './secrets/auth.json'];
  return candidatePaths.find((candidate) => fs.existsSync(candidate));
}
/**
 * Pause for the given number of milliseconds (used between retry attempts).
 *
 * @param ms Delay in milliseconds.
 * @returns A promise that resolves, with no value, once the delay has elapsed.
 */
async function sleep(ms: number): Promise<void> {
  await new Promise<void>((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Decide whether an error makes further retries pointless.
 *
 * Authentication problems and invalid URLs will not fix themselves, so
 * `withRetry` gives up on them immediately. Matching is case-insensitive:
 * real messages are capitalised inconsistently (Node's own URL parser reports
 * "Invalid URL"), and a case-sensitive check would let them be retried.
 *
 * @param error The value caught from a failed attempt.
 * @returns `true` when `error` is an `Error` whose message mentions an
 *          authentication/login problem or an invalid URL; `false` otherwise
 *          (including for non-`Error` values).
 */
function isNonRetriableError(error: unknown): boolean {
  if (!(error instanceof Error)) {
    return false;
  }
  const nonRetriableMarkers = ['authentication', 'login required', 'invalid url'];
  const message = error.message.toLowerCase();
  return nonRetriableMarkers.some((marker) => message.includes(marker));
}
/** Human-readable labels, one per extraction method. */
const METHOD_DISPLAY_NAMES: Record<ExtractionMethod, string> = {
  'embedded-json': 'Embedded JSON',
  'internal-state': 'Internal State',
  'html-section': 'HTML Section',
  'dom-selector': 'DOM Selector',
  'graphql-api': 'GraphQL API',
  'graphql-intercept': 'GraphQL Intercept',
  legacy: 'Legacy Parser'
};

/**
 * Look up the label shown to users for an extraction method.
 *
 * @param method The extraction method identifier.
 * @returns The display label for `method`.
 */
function getMethodDisplayName(method: ExtractionMethod): string {
  return METHOD_DISPLAY_NAMES[method];
}
/**
 * Run `fn`, retrying with exponential backoff when it throws.
 *
 * After each failed attempt except the last, the failure is logged, a
 * `'retry'` progress event is emitted, and the function sleeps for the current
 * delay; the delay is then multiplied by `backoffMultiplier` and capped at
 * `maxDelayMs`. Errors flagged by `isNonRetriableError` are rethrown
 * immediately after emitting an `'error'` progress event.
 *
 * @param fn The operation to run.
 * @param config Retry settings; defaults to `DEFAULT_RETRY_CONFIG`.
 * @param onProgress Optional listener for `'retry'` and `'error'` events.
 * @returns Whatever `fn` resolves to on the first successful attempt.
 * @throws The non-retriable error as soon as it occurs, or the last error seen
 *         once every attempt has failed.
 */
async function withRetry<T>(
  fn: () => Promise<T>,
  config: RetryConfig = DEFAULT_RETRY_CONFIG,
  onProgress?: ProgressCallback
): Promise<T> {
  let failure: Error | null = null;
  let backoffMs = config.initialDelayMs;
  let attempt = 1;

  while (attempt <= config.maxAttempts) {
    try {
      return await fn();
    } catch (error) {
      failure = error as Error;

      // Some failures can never succeed on a retry; surface them right away.
      if (isNonRetriableError(error)) {
        onProgress?.({
          type: 'error',
          message: `Non-retriable error: ${failure.message}`,
          timestamp: new Date().toISOString()
        });
        throw error;
      }

      const attemptsRemain = attempt < config.maxAttempts;
      if (attemptsRemain) {
        const message = `Attempt ${attempt}/${config.maxAttempts} failed. Retrying in ${backoffMs}ms...`;
        logError(`[Retry] ${message}`, error);
        onProgress?.({
          type: 'retry',
          message,
          attemptNumber: attempt,
          maxAttempts: config.maxAttempts,
          timestamp: new Date().toISOString()
        });
        await sleep(backoffMs);
        backoffMs = Math.min(backoffMs * config.backoffMultiplier, config.maxDelayMs);
      }
    }
    attempt += 1;
  }

  throw failure || new Error('Max retry attempts exceeded');
}
/**
 * Pull the post shortcode out of an Instagram URL.
 *
 * Recognises the `/p/`, `/reel/`, `/reels/` and `/tv/` path prefixes.
 *
 * @param url The URL (or path) to inspect.
 * @returns The shortcode following the prefix, or `undefined` when the URL has none.
 */
function extractShortcode(url: string): string | undefined {
  const shortcodePattern = /\/(p|reel|reels|tv)\/([A-Za-z0-9_-]+)/;
  const found = shortcodePattern.exec(url);
  if (found === null) {
    return undefined;
  }
  return found[2];
}
/**
 * Recipe keywords used for caption scoring (Italian and English).
 * NOTE(review): `extractFromHTMLSection` declares its own copy of this list inside
 * `page.evaluate`, because code run in the page cannot see module-level constants —
 * keep the two lists in sync.
 */
const RECIPE_KEYWORDS = [
'ingredienti',
'procedimento',
'preparazione',
'ricetta',
'recipe',
'instructions'
];
/**
 * Timeout configuration constants (in milliseconds)
 */
const TIMEOUTS = {
// Fixed pause before looking for the "more" button (HTML Section method).
CONTENT_LOAD: 1500,
// Timeout passed to `isVisible` for the "more" button (HTML Section method).
MORE_BUTTON_VISIBILITY: 1000,
// Fixed pause after clicking "more" (HTML Section method).
CAPTION_EXPANSION: 3000,
// Timeout passed to `isVisible` for the "more" button (DOM method).
MORE_BUTTON_VISIBILITY_DOM: 500,
// Fixed pause after clicking "more" (DOM method).
MORE_BUTTON_CLICK: 800,
// Timeout for `waitForLoadState` calls.
PAGE_LOAD: 10000,
// Fixed pause that lets dynamic content render after the load state is reached.
NETWORK_SETTLE: 2000,
// Timeout while waiting for an `article` element to appear.
ARTICLE_SELECTOR: 5000,
// Fixed pause that gives the GraphQL response listener a chance to fire.
GRAPHQL_WAIT: 1000,
// Presumably the page navigation timeout — used outside this part of the file; confirm.
PAGE_NAVIGATION: 30000,
// Presumably the bounds of a randomized anti-detection delay — used outside this part of the file; confirm.
ANTI_DETECTION_MIN: 1000,
ANTI_DETECTION_MAX: 3000
} as const;
/**
 * Best-effort expansion of a truncated caption for the HTML Section method.
 *
 * Finds the first `article`, `main` or `[role="main"]` container, then tries a
 * list of "more" button patterns in order. The first pattern whose first match
 * is visible gets clicked, after which the function waits for the caption to
 * expand and stops. Every failure is logged and swallowed; this never throws.
 *
 * @param page The Playwright page showing the post.
 */
async function tryExpandCaptionInHTMLSection(page: Page): Promise<void> {
  console.log('[Extractor] Looking for "more" button in primary post container...');

  try {
    await page.waitForTimeout(TIMEOUTS.CONTENT_LOAD);

    const postRoot = page.locator('article, main, [role="main"]').first();
    if ((await postRoot.count()) === 0) {
      console.log('[Extractor] No main container found');
      return;
    }
    console.log('[Extractor] Found main post container, searching for "more" button...');

    // Ordered from the most specific pattern to the most generic one.
    const candidates = [
      {
        locator: postRoot.locator('span').filter({ hasText: /\.\.\.\s*more/i }),
        desc: "span with '...more'"
      },
      {
        locator: postRoot.locator('span').filter({ hasText: /…\s*more/i }),
        desc: "span with '… more'"
      },
      {
        locator: postRoot.locator('div[role="button"]').filter({ hasText: /more/i }),
        desc: "button with 'more'"
      },
      {
        locator: postRoot.locator('span[role="button"]').filter({ hasText: /more/i }),
        desc: "span button with 'more'"
      }
    ];

    for (const { locator, desc } of candidates) {
      const matches = await locator.count();
      console.log(`[Extractor] Checking ${desc}: found ${matches}`);

      if (matches > 0) {
        const target = locator.first();
        try {
          const visible = await target.isVisible({ timeout: TIMEOUTS.MORE_BUTTON_VISIBILITY });
          if (visible) {
            const label = await target.textContent();
            console.log(`[Extractor] Found visible "more": "${label}"`);
            await target.click();
            console.log('[Extractor] Clicked "more" - waiting for expansion...');
            await page.waitForTimeout(TIMEOUTS.CAPTION_EXPANSION);
            console.log('[Extractor] Caption expansion complete');
            break;
          }
        } catch (e) {
          // A pattern that cannot be clicked is not fatal; move on to the next one.
          console.log(`[Extractor] ${desc} not clickable: ${e}`);
        }
      }
    }

    console.log('[Extractor] Finished "more" button expansion attempt');
  } catch (e) {
    console.log(`[Extractor] Error while trying to expand caption: ${e}`);
  }
}
/**
 * Best-effort expansion of a truncated caption for the DOM method.
 *
 * Runs up to three rounds. In each round the selectors are tried in order and
 * the first visible match is clicked; the function stops as soon as a round
 * clicks nothing. A selector that fails for any reason is skipped silently.
 *
 * @param page The Playwright page showing the post.
 */
async function tryExpandCaptionInDOM(page: Page): Promise<void> {
  const selectors = [
    'article button:has-text("more")',
    'article button:has-text("More")',
    'article button:has-text("… more")',
    'article span[role="button"]:has-text("more")',
    'article [role="button"]:has-text("more")',
    'article div[role="button"]:has-text("more")',
    'xpath=//article//span[contains(text(), "more")]/..',
    'xpath=//article//button[contains(., "more")]'
  ];
  const maxRounds = 3;

  // Clicks the first visible "more" button; resolves to whether anything was clicked.
  const clickFirstVisible = async (): Promise<boolean> => {
    for (const selector of selectors) {
      try {
        const button = page.locator(selector).first();
        const visible = await button.isVisible({ timeout: TIMEOUTS.MORE_BUTTON_VISIBILITY_DOM });
        if (!visible) {
          continue;
        }
        await button.click();
        await page.waitForTimeout(TIMEOUTS.MORE_BUTTON_CLICK);
        console.log(`[Extractor] Clicked "more" button with selector: ${selector}`);
        return true;
      } catch {
        // Try next selector
      }
    }
    return false;
  };

  for (let round = 0; round < maxRounds; round++) {
    const clicked = await clickFirstVisible();
    if (!clicked) {
      break;
    }
  }
}
/**
 * Clean up extracted caption text.
 *
 * Steps, in order:
 * 1. `<br>` tags become newlines; all other HTML tags are stripped.
 * 2. Common HTML entities are decoded in a single pass.
 * 3. Known Instagram UI strings ("More posts from…", "View all N comments",
 *    "Add a comment...", "Liked by…") are removed.
 * 4. Whitespace is normalised while keeping intentional line breaks.
 * 5. A trailing run of hashtags is removed.
 *
 * Entities are decoded in one pass so each is decoded exactly once. Decoding
 * `&amp;` before the others would turn `&amp;lt;` — the serialised form of the
 * literal text `&lt;` — into `<`.
 *
 * @param text Raw caption text or HTML.
 * @returns The cleaned, trimmed text.
 */
export function cleanText(text: string): string {
  const entityMap: Record<string, string> = {
    '&amp;': '&',
    '&lt;': '<',
    '&gt;': '>',
    '&quot;': '"',
    '&#039;': "'",
    '&#39;': "'",
    '&#x27;': "'",
    '&apos;': "'",
    '&nbsp;': ' '
  };

  // Convert <br> tags to newlines first so line breaks survive tag stripping.
  let cleaned = text.replace(/<br\s*\/?>/gi, '\n');

  // Strip all other HTML tags while keeping the text content.
  cleaned = cleaned.replace(/<[^>]+>/g, '');

  // Decode HTML entities, each exactly once.
  cleaned = cleaned.replace(
    /&(?:amp|lt|gt|quot|apos|nbsp|#0?39|#x27);/g,
    (entity) => entityMap[entity] ?? entity
  );

  // Remove common UI text patterns.
  const uiPatterns = [
    /More posts from.+/gi,
    /View all \d+ comments/gi,
    /Add a comment\.\.\./gi,
    /Liked by.+?(?=\n|$)/gi
  ];
  for (const pattern of uiPatterns) {
    cleaned = cleaned.replace(pattern, '');
  }

  // Clean up whitespace while preserving intentional line breaks.
  cleaned = cleaned.replace(/[ \t]+$/gm, ''); // trailing spaces on each line
  cleaned = cleaned.replace(/^[ \t]+/gm, ''); // leading spaces on each line
  // Collapse runs of blank lines to a single blank line.
  cleaned = cleaned.replace(/\n\s*\n\s*\n+/g, '\n\n');
  // Remove spaces around newlines.
  cleaned = cleaned.replace(/ *\n */g, '\n');
  // Normalize multiple spaces to a single space within lines.
  cleaned = cleaned.replace(/ {2,}/g, ' ');

  // Remove hashtags from the end of the text (supports international characters).
  cleaned = cleaned.replace(/(#[\w\u00C0-\u024F\u1E00-\u1EFF\u0400-\u04FF]+\s*)+$/gi, '').trim();

  return cleaned.trim();
}
/**
 * Decode the body of a JSON string literal (the text between the quotes).
 *
 * `JSON.parse` is tried first because it handles every escape correctly and
 * exactly once. If the fragment is not a valid JSON string, a single-pass
 * unescape is used instead, so a sequence such as `\\n` (escaped backslash
 * followed by `n`) is never mistaken for a newline.
 */
function decodeJsonStringLiteral(raw: string): string {
  try {
    const decoded: unknown = JSON.parse(`"${raw}"`);
    if (typeof decoded === 'string') {
      return decoded;
    }
  } catch {
    // Not a valid JSON string body; fall back to manual unescaping below.
  }
  return raw.replace(
    /\\(?:u([0-9a-fA-F]{4})|([\s\S]))/g,
    (_match: string, hex: string | undefined, ch: string | undefined) => {
      if (hex) {
        return String.fromCharCode(parseInt(hex, 16));
      }
      switch (ch) {
        case 'n':
          return '\n';
        case 'r':
          return '\r';
        case 't':
          return '\t';
        default:
          return ch ?? '';
      }
    }
  );
}

/**
 * Strategy 1: Extract from embedded JSON data in script tags.
 *
 * Reads the text of every `<script>` element and, for each one in document
 * order, tries:
 * 1. the `window._sharedData = {...};` assignment,
 * 2. the `window.__additionalDataLoaded(path, {...});` call,
 * 3. for scripts longer than 10000 characters that mention `"caption"` or
 *    `"text"`: parsing the whole script as JSON (deep search, then standard
 *    parsing), and — if that parse throws — regex extraction of the caption
 *    string.
 * For the third path, captions of 130 characters or fewer are treated as
 * truncated and ignored.
 *
 * @param page The Playwright page showing the post.
 * @param progressCallback Optional progress listener, forwarded to thumbnail extraction.
 * @returns The caption and thumbnail, or `null` when no script yields a usable
 *          caption or an unexpected error occurs (the error is logged).
 */
async function extractFromEmbeddedJSON(
  page: Page,
  progressCallback?: ProgressCallback
): Promise<ExtractedContent | null> {
  try {
    // Extract all script tag contents (this callback runs inside the page).
    const scriptInfo = await page.evaluate(() => {
      const scripts = Array.from(document.querySelectorAll('script'));
      const scriptData = scripts.map((script) => ({
        type: script.getAttribute('type') || 'no-type',
        hasContent: !!script.textContent,
        length: script.textContent?.length || 0,
        preview: script.textContent?.substring(0, 100) || ''
      }));
      console.log(`[Extractor] Found ${scripts.length} script tags`);
      return {
        contents: scripts.map((script) => script.textContent || ''),
        info: scriptData
      };
    });
    console.log(`[Extractor] Script tags summary:`, scriptInfo.info);

    // Look for embedded data patterns.
    for (const [i, content] of scriptInfo.contents.entries()) {
      // Try window._sharedData pattern
      const sharedDataMatch = content.match(/window\._sharedData\s*=\s*(\{.+?\});/s);
      if (sharedDataMatch) {
        console.log(`[Extractor] Found _sharedData in script ${i}`);
        try {
          const data: InstagramEmbeddedData = JSON.parse(sharedDataMatch[1]);
          const result = parseInstagramData(data);
          if (result) {
            const thumbnail = await extractThumbnailStealth(page, progressCallback);
            return { ...result, thumbnail };
          }
        } catch (e) {
          logError('[Extractor] Failed to parse _sharedData', e);
        }
      }

      // Try __additionalDataLoaded pattern
      const additionalDataMatch = content.match(
        /window\.__additionalDataLoaded\([^,]+,\s*(\{.+?\})\);/s
      );
      if (additionalDataMatch) {
        console.log(`[Extractor] Found __additionalDataLoaded in script ${i}`);
        try {
          const data = JSON.parse(additionalDataMatch[1]);
          const result = parseInstagramData(data);
          if (result) {
            const thumbnail = await extractThumbnailStealth(page, progressCallback);
            return { ...result, thumbnail };
          }
        } catch (e) {
          logError('[Extractor] Failed to parse __additionalDataLoaded', e);
        }
      }

      // Try to find any large JSON with caption data (new Instagram format)
      if ((content.includes('"caption"') || content.includes('"text"')) && content.length > 10000) {
        console.log(
          `[Extractor] Attempting to extract from large JSON in script ${i} (length: ${content.length})`
        );
        try {
          // Try to parse as direct JSON
          const jsonData = JSON.parse(content);

          // Try deep search first
          const deepResult = deepSearchForCaption(jsonData);
          if (deepResult && deepResult.bodyText && deepResult.bodyText.length > 130) {
            console.log(
              `[Extractor] Deep search in JSON found caption: ${deepResult.bodyText.length} chars`
            );
            const thumbnail = await extractThumbnailStealth(page, progressCallback);
            return { ...deepResult, thumbnail };
          }

          // Try standard parsing
          const result = parseInstagramData(jsonData);
          if (result && result.bodyText && result.bodyText.length > 130) {
            console.log(
              `[Extractor] Successfully extracted from JSON, text length: ${result.bodyText.length}`
            );
            const thumbnail = await extractThumbnailStealth(page, progressCallback);
            return { ...result, thumbnail };
          }
        } catch (e) {
          // Not direct JSON or parsing failed, try to find caption fields with regex
          console.log(`[Extractor] JSON parse failed, trying regex extraction...`);

          // Try multiple patterns for different Instagram JSON structures
          const patterns = [
            /"caption"\s*:\s*\{\s*"text"\s*:\s*"([^"\\]*(\\.[^"\\]*)*)"/, // Escaped quotes
            /"text"\s*:\s*"([^"\\]*(\\.[^"\\]*)*)"\s*,?\s*"pk"/, // text field near pk
            /"edge_media_to_caption"\s*:\s*\{\s*"edges"\s*:\s*\[\s*\{\s*"node"\s*:\s*\{\s*"text"\s*:\s*"([^"\\]*(\\.[^"\\]*)*)"/
          ];
          for (const pattern of patterns) {
            const captionMatch = content.match(pattern);
            if (captionMatch) {
              // Group 1 is the still-escaped body of the JSON string.
              const captionText = decodeJsonStringLiteral(captionMatch[1] || '');
              if (captionText.length > 130) {
                console.log(
                  `[Extractor] Extracted caption from regex pattern, length: ${captionText.length}`
                );
                const thumbnail = await extractThumbnailStealth(page, progressCallback);
                return { bodyText: cleanText(captionText), thumbnail };
              }
            }
          }
        }
      }
    }
    return null;
  } catch (error) {
    logError('[Extractor] Failed to extract from embedded JSON', error);
    return null;
  }
}
/**
 * Pull the caption out of an Instagram data payload.
 *
 * The `entry_data.PostPage[0].graphql.shortcode_media` layout is tried first;
 * its caption edges are joined with newlines. When that node is absent, the
 * payload's `items` (or `data.shortcode_media`) is handed to
 * `extractFromAlternativeStructure`.
 *
 * @param data Parsed payload of unknown shape.
 * @returns The cleaned caption, or `null` when none is found or parsing fails
 *          (the failure is logged).
 */
function parseInstagramData(data: any): Omit<ExtractedContent, 'thumbnail'> | null {
  try {
    const media = data?.entry_data?.PostPage?.[0]?.graphql?.shortcode_media;

    if (media) {
      const edges = media.edge_media_to_caption?.edges || [];
      const joined = edges.map((edge: any) => edge.node.text).join('\n');
      return joined ? { bodyText: cleanText(joined) } : null;
    }

    // No classic layout: fall back to the alternative structures.
    const alternative = data?.items || data?.data?.shortcode_media;
    return alternative ? extractFromAlternativeStructure(alternative) : null;
  } catch (error) {
    logError('[Extractor] Failed to parse Instagram data structure', error);
    return null;
  }
}
/**
 * Pull the caption out of the alternative Instagram payload layouts.
 *
 * Accepts a media item or an array of items (only the first is used) and
 * reads `caption.text`, falling back to the first `edge_media_to_caption` edge.
 *
 * @param items A media item, or an array whose first element is one.
 * @returns The cleaned caption, or `null` when none is found or parsing fails
 *          (the failure is logged).
 */
function extractFromAlternativeStructure(items: any): Omit<ExtractedContent, 'thumbnail'> | null {
  try {
    const item = Array.isArray(items) ? items[0] : items;
    const caption = item?.caption?.text || item?.edge_media_to_caption?.edges?.[0]?.node?.text;
    return caption ? { bodyText: cleanText(caption) } : null;
  } catch (error) {
    logError('[Extractor] Failed to parse alternative structure', error);
    return null;
  }
}
/**
 * Strategy 2.5: Extract the caption by finding the span with recipe content characteristics.
 *
 * Instagram uses obfuscated class names, so the caption span is identified by scoring
 * every candidate `<span>` on these signals:
 * - number of `<br>` tags (the dominant signal),
 * - presence of recipe keywords,
 * - number of `<a>` tags (mentions and hashtags),
 * - text length,
 * - a `line-height` style attribute,
 * with penalties for text that looks like UI labels or audio credits.
 *
 * Flow: verify the page URL matches the target post, wait for the DOM, try to expand
 * the caption via the "more" button, score the spans inside the page, then clean the
 * winning span's `innerHTML`.
 *
 * @param page The Playwright page showing the post.
 * @param progressCallback Optional progress listener, forwarded to thumbnail extraction.
 * @param targetUrl URL of the post that is expected to be open; when given, its shortcode
 *                  must match the current page's shortcode.
 * @returns The cleaned caption and thumbnail, or `null` when the page shows a different
 *          post, no span qualifies, or an error occurs (the error is logged).
 */
export async function extractFromHTMLSection(
page: Page,
progressCallback?: ProgressCallback,
targetUrl?: string
): Promise<ExtractedContent | null> {
try {
console.log('[Extractor] Waiting for page content to load...');
// Validate we're on the correct page
const currentUrl = page.url();
const targetShortcode = targetUrl ? extractShortcode(targetUrl) : null;
const currentShortcode = extractShortcode(currentUrl);
console.log(`[Extractor] Current page URL: ${currentUrl}`);
console.log(
`[Extractor] Target shortcode: ${targetShortcode}, Current shortcode: ${currentShortcode}`
);
// The mismatch check only applies when a shortcode could be derived from targetUrl.
if (targetShortcode && currentShortcode !== targetShortcode) {
console.log(`[Extractor] URL mismatch: expected ${targetShortcode}, got ${currentShortcode}`);
return null;
}
console.log(`[Extractor] Confirmed on correct post: ${currentShortcode}`);
// Wait for the DOM to be parsed, then pause a fixed interval so dynamic content can render
await page.waitForLoadState('domcontentloaded', { timeout: TIMEOUTS.PAGE_LOAD });
await page.waitForTimeout(TIMEOUTS.NETWORK_SETTLE);
// Try to expand truncated caption by clicking "more" button
// STRATEGY: Since we're already on the correct page (URL validated above),
// the FIRST article/main post container should be our target post.
await tryExpandCaptionInHTMLSection(page);
console.log('[Extractor] Extracting caption using intelligent span detection...');
// Everything inside this callback runs in the page, so it cannot reference module-level
// values; the console.log calls in it write to the page's console.
const result = await page.evaluate((shortcode) => {
// Strategy: Find the caption span that belongs to the correct post
// Instagram loads multiple posts, so we need to find the span associated
// with our target shortcode
const recipeKeywords = [
'ingredienti',
'procedimento',
'preparazione',
'ricetta',
'recipe',
'instructions'
];
// First, try to find links pointing to our target post
// NOTE(review): when the current URL has no shortcode, `shortcode` is undefined and this
// selector looks for "/undefined"; it will normally match nothing and fall through to the
// whole-document search below.
const postLinks = document.querySelectorAll(`a[href*="/${shortcode}"]`);
console.log(`[Extractor] Found ${postLinks.length} links to target post ${shortcode}`);
// If we found links to the post, search for spans within those link ancestors
const searchRoots: Element[] = [];
if (postLinks.length > 0) {
postLinks.forEach((link) => {
// Get the article or section container for this post
let container =
link.closest('article') || link.closest('section') || link.closest('[role="main"]');
if (container && !searchRoots.includes(container)) {
searchRoots.push(container);
console.log(`[Extractor] Found container for target post`);
}
});
}
// If no specific containers found, search the whole document (fallback)
if (searchRoots.length === 0) {
console.log(`[Extractor] No specific container found, searching whole document`);
searchRoots.push(document.body);
}
const spans: HTMLElement[] = [];
searchRoots.forEach((root) => {
root.querySelectorAll('span').forEach((span) => spans.push(span as HTMLElement));
});
console.log(`[Extractor] Searching ${spans.length} spans for recipe content`);
let bestCandidate: CaptionCandidate | null = null;
// Search all spans for the best caption candidate
// PRIMARY CRITERIA: Most <br> tags (recipe formatting indicator)
spans.forEach((span, spanIdx) => {
// Lower-cased copy used for keyword and pattern matching only.
const text = (span.textContent || '').toLowerCase();
const innerHTML = span.innerHTML || '';
// Skip empty or very short spans
if (text.length < 30) return;
// Count <br> tags - this is the MOST reliable indicator for recipes
const brCount = (innerHTML.match(/<br\s*\/?>/gi) || []).length;
// No minimum br count - take what we can get
// Calculate a score based on recipe characteristics
let score = 0;
// <br> tags are the PRIMARY signal
score += brCount * 100; // Massive weight for line breaks
// Check for recipe keywords (strong indicator)
const hasKeywords = recipeKeywords.some((keyword) => text.includes(keyword));
if (hasKeywords) {
score += 500; // Huge boost for recipe keywords
}
// Count <a> tags - captions have hashtags/mentions
const linkCount = span.querySelectorAll('a').length;
if (linkCount > 2) {
score += linkCount * 10;
}
// Text length (longer is better for recipes); the bonus is capped at 200
score += Math.min(text.length / 5, 200);
// Check for line-height style (caption formatting)
const style = span.getAttribute('style') || '';
if (style.includes('line-height')) {
score += 30;
}
// Penalize UI elements
if (text.match(/^(follow|following|liked by|view all|more posts|comments)/i)) {
score -= 500;
}
// Penalize audio/music credits
// NOTE(review): the track names in this pattern are hard-coded examples.
if (text.match(/·|papaoutai|afro soul/i) && text.length < 100) {
score -= 200;
}
// Update best candidate (ties keep the earlier span; non-positive scores never win)
if (score > 0 && (!bestCandidate || score > bestCandidate.score)) {
console.log(
`[Extractor] New best: score=${score}, len=${text.length}, br=${brCount}, links=${linkCount}, preview="${text.substring(0, 80)}..."`
);
bestCandidate = {
element: span,
text: span.textContent || '',
score: score,
innerHTML: innerHTML,
brCount: brCount
};
}
});
if (!bestCandidate) {
return {
success: false,
error: 'No suitable caption span found',
text: ''
};
}
// Explicit type assertion (safe after null guard)
const candidate: CaptionCandidate = bestCandidate;
console.log(
`[Extractor] Final caption candidate: score=${candidate.score}, length=${candidate.text.length}`
);
// Extract text from the best candidate
// Use innerHTML to preserve <br> tags, which will be converted to newlines in cleanText
let captionText = candidate.innerHTML;
return {
success: true,
text: captionText,
score: candidate.score,
length: captionText.length,
htmlPreview: candidate.innerHTML.substring(0, 500)
};
}, currentShortcode);
console.log(`[Extractor] HTML Section result:`, {
success: result.success,
textLength: result.length,
score: result.score
});
if (result.htmlPreview) {
console.log('[Extractor] HTML preview (first 500 chars):');
console.log(result.htmlPreview);
}
if (!result.success) {
console.log(`[Extractor] ${result.error}`);
return null;
}
const captionText = result.text;
if (!captionText || captionText.length === 0) {
console.log('[Extractor] No text extracted from HTML section');
return null;
}
const thumbnail = await extractThumbnailStealth(page, progressCallback);
// cleanText converts the <br> tags to newlines and strips the remaining markup.
return {
bodyText: cleanText(captionText),
thumbnail
};
} catch (error) {
logError('[Extractor] Failed to extract from HTML section', error);
return null;
}
}
/**
 * Strategy 3: Extract from DOM using specific selectors.
 *
 * Flow:
 * 1. Wait for the page to settle (network idle, an `article` element, a fixed pause).
 * 2. Listen briefly for GraphQL / `api/v1` responses; if one yields a caption
 *    longer than 130 characters, return it.
 * 3. Otherwise click "more" buttons and take the longest text found in the
 *    article, in data attributes or in hidden elements (more than 100
 *    characters), falling back to the `og:description` meta tag.
 *
 * The response listener is always detached before returning, so repeated calls
 * on the same page do not accumulate listeners that keep parsing every later
 * response.
 *
 * NOTE(review): an intercepted GraphQL caption still takes precedence over the
 * DOM text here. If Instagram's API can return truncated captions, confirm
 * whether this shortcut should stay.
 *
 * @param page The Playwright page showing the post.
 * @param progressCallback Optional progress listener, forwarded to thumbnail extraction.
 * @returns The cleaned caption and thumbnail, or `null` when nothing usable is
 *          found or an error occurs (the error is logged).
 */
export async function extractFromDOM(
  page: Page,
  progressCallback?: ProgressCallback
): Promise<ExtractedContent | null> {
  // Holder written by the response listener and read by the main flow.
  const intercepted: { caption: string | null } = { caption: null };

  const onResponse = async (response: {
    url(): string;
    json(): Promise<unknown>;
  }): Promise<void> => {
    const url = response.url();
    if (!url.includes('graphql') && !url.includes('api/v1')) {
      return;
    }
    try {
      const json = await response.json();
      const captionData = extractCaptionFromGraphQL(json);
      if (captionData && captionData.length > 130) {
        intercepted.caption = captionData;
        console.log(`[Extractor] Intercepted GraphQL response with ${captionData.length} chars`);
      }
    } catch (e) {
      // Not JSON or parsing failed
    }
  };

  try {
    // Give Instagram more time to load dynamic content
    console.log('[Extractor] Waiting for network idle...');
    await page.waitForLoadState('networkidle', { timeout: TIMEOUTS.PAGE_LOAD }).catch(() => {
      console.log('[Extractor] Network idle timeout, continuing anyway');
    });

    // Try to wait for article content
    await page.waitForSelector('article', { timeout: TIMEOUTS.ARTICLE_SELECTOR }).catch(() => {});

    // Additional wait for dynamic content
    await page.waitForTimeout(TIMEOUTS.NETWORK_SETTLE);

    // Try to intercept GraphQL responses
    page.on('response', onResponse);
    await page.waitForTimeout(TIMEOUTS.GRAPHQL_WAIT);

    if (intercepted.caption) {
      const thumbnail = await extractThumbnailStealth(page, progressCallback);
      return { bodyText: cleanText(intercepted.caption), thumbnail };
    }

    // Try to expand truncated captions by clicking "more" button
    await tryExpandCaptionInDOM(page);

    // This callback runs inside the page; its console.log calls write to the page's console.
    const captionText = await page.evaluate(() => {
      // First check og:description for comparison
      const metaDesc = document.querySelector('meta[property="og:description"]');
      const ogContent = metaDesc?.getAttribute('content') || '';
      console.log(`[Extractor] og:description length: ${ogContent.length}`);
      if (ogContent.length > 200) {
        console.log(`[Extractor] og:description preview: ${ogContent.substring(0, 200)}...`);
      }

      // SMART APPROACH: Find the truncated text first, then look for full version nearby
      // Look for text that ends with "..." or "… more"
      const allSpans = Array.from(
        document.querySelectorAll('article span, article div, article h1')
      );
      let longestText = '';
      let matchedElement: Element | null = null;

      // Strategy 1: Find elements with substantial text
      for (const element of allSpans) {
        const text = element.textContent?.trim() || '';
        // Skip UI elements
        if (text.match(/^(follow|like|comment|share|view all|load more|add a comment)$/i)) {
          continue;
        }
        // Look for text that seems like content
        if (text.length > longestText.length) {
          longestText = text;
          matchedElement = element;
        }
      }

      // Strategy 2: Look in data attributes
      const elementsWithData = Array.from(
        document.querySelectorAll('[data-caption], [data-text], [data-content]')
      );
      for (const el of elementsWithData) {
        const dataCaption =
          el.getAttribute('data-caption') ||
          el.getAttribute('data-text') ||
          el.getAttribute('data-content');
        if (dataCaption && dataCaption.length > longestText.length) {
          longestText = dataCaption;
          console.log(`[Extractor] Found data attribute with ${dataCaption.length} chars`);
        }
      }

      // Strategy 3: Look for hidden/collapsed content
      const hiddenElements = Array.from(
        document.querySelectorAll(
          '[style*="display: none"], [style*="display:none"], .collapsed, [aria-hidden="true"]'
        )
      );
      for (const el of hiddenElements) {
        const text = el.textContent?.trim() || '';
        if (text.length > longestText.length && text.length > 200) {
          longestText = text;
          console.log(`[Extractor] Found hidden element with ${text.length} chars`);
        }
      }

      // Strategy 4: Find parent of truncated text
      if (matchedElement && longestText.endsWith('...')) {
        // Look at siblings and parent
        const parent = matchedElement.parentElement;
        if (parent) {
          const parentText = parent.textContent?.trim() || '';
          if (parentText.length > longestText.length) {
            longestText = parentText;
            console.log(
              `[Extractor] Found fuller text in parent element: ${parentText.length} chars`
            );
          }
        }
        // Check next siblings (at most five)
        let sibling = matchedElement.nextElementSibling;
        let siblingCount = 0;
        while (sibling && siblingCount < 5) {
          const siblingText = sibling.textContent?.trim() || '';
          if (siblingText.length > 50) {
            longestText = longestText + ' ' + siblingText;
            console.log(`[Extractor] Found continuation in sibling: ${siblingText.length} chars`);
          }
          sibling = sibling.nextElementSibling;
          siblingCount++;
        }
      }

      if (longestText && longestText.length > 100) {
        console.log(`[Extractor] Best extraction: ${longestText.length} chars`);
        return longestText;
      }

      // Fallback to og:description
      if (metaDesc) {
        const content = ogContent;
        // Strip the "<likes> likes, <n> comments - <user> on <date>:" prefix.
        const cleanedContent = content.replace(
          /^\d+K?\s+likes,\s+\d+\s+comments\s+-\s+[\w.]+\s+on\s+[^:]+:\s*["']?/,
          ''
        );
        console.log('[Extractor] DOM selector fallback: og:description (with metadata cleanup)');
        return cleanedContent;
      }
      return null;
    });

    if (!captionText) {
      return null;
    }

    // Extract thumbnail using existing logic
    const thumbnail = await extractThumbnailStealth(page, progressCallback);
    return {
      bodyText: cleanText(captionText),
      thumbnail
    };
  } catch (error) {
    logError('[Extractor] Failed to extract from DOM', error);
    return null;
  } finally {
    // Always detach the listener; removing one that was never attached is a no-op.
    page.off('response', onResponse);
  }
}
/**
 * Strategy: Extract via the GraphQL API.
 *
 * Opens a temporary page in `context`, POSTs the post's shortcode to
 * Instagram's GraphQL endpoint and reads the first caption edge from the
 * response. The temporary page is closed on every exit path, including when
 * the request or JSON parsing throws, so failed attempts do not leak pages.
 *
 * @param url URL of the Instagram post.
 * @param context Browser context used to open the temporary page.
 * @returns The cleaned caption with a `null` thumbnail, or `null` when the URL
 *          has no shortcode, the request fails, the response has no caption,
 *          or an error occurs (the error is logged).
 */
async function extractViaGraphQL(
  url: string,
  context: BrowserContext
): Promise<ExtractedContent | null> {
  const shortcode = extractShortcode(url);
  if (!shortcode) {
    console.warn('Could not extract shortcode from URL:', url);
    return null;
  }

  try {
    const page = await context.newPage();
    try {
      // Make GraphQL request
      const response = await page.request.post('https://www.instagram.com/graphql/query/', {
        form: {
          variables: JSON.stringify({ shortcode }),
          doc_id: '7950326061742207' // May need periodic updates
        }
      });
      if (!response.ok()) {
        console.warn(`GraphQL request failed: ${response.status()}`);
        return null;
      }

      // Parse GraphQL response
      const data = await response.json();
      const media = data?.data?.shortcode_media;
      if (!media) {
        return null;
      }

      const bodyText = media.edge_media_to_caption?.edges?.[0]?.node?.text || '';
      if (!bodyText) {
        return null;
      }

      return {
        bodyText: cleanText(bodyText),
        thumbnail: null // GraphQL doesn't easily provide thumbnail, would need page context
      };
    } finally {
      // Runs on success, early return and exception alike.
      await page.close();
    }
  } catch (error) {
    logError('[Extractor] GraphQL extraction failed', error);
    return null;
  }
}
/**
 * Strategy 4: Legacy extraction method (fallback).
 *
 * Takes the page's visible text, drops its first six lines, keeps only what
 * precedes "More posts from", trims it, and finally strips `@mentions` and
 * `#hashtags`.
 *
 * @param page The Playwright page showing the post.
 * @returns The remaining text.
 */
async function extractCleanTextLegacy(page: Page): Promise<string> {
  const visibleText = await page.evaluate(() => document.body.innerText);

  const withoutHeaderLines = visibleText.replace(/^(?:.*\n){6}/, ''); // Remove first 6 lines
  const beforeMorePosts = withoutHeaderLines.split('More posts from')[0]; // Cut at "More posts from"
  const trimmed = beforeMorePosts.trim();

  // Remove mentions and hashtags
  const withoutMentions = trimmed.replace(/@\w+/g, '');
  return withoutMentions.replace(/#\w+/g, '');
}
/**
 * Strategy 5: Extract from Instagram's internal state/cache.
 *
 * Checks a list of well-known globals on `window`. The first one that is set
 * and serialises to at most 500,000 characters is sent back and searched for a
 * caption, first with `parseInstagramData`, then with `deepSearchForCaption`.
 * Captions of 130 characters or fewer are treated as truncated and rejected.
 *
 * Oversized or unserialisable state is skipped rather than truncated: a
 * truncated JSON string can never be parsed, so cutting it would guarantee
 * failure and stop the remaining globals from being tried.
 *
 * @param page The Playwright page showing the post.
 * @param progressCallback Optional progress listener, forwarded to thumbnail extraction.
 * @returns The caption and thumbnail, or `null` when no usable state is found
 *          or an error occurs (the error is logged).
 */
async function extractFromInternalState(
  page: Page,
  progressCallback?: ProgressCallback
): Promise<ExtractedContent | null> {
  try {
    // This callback runs inside the page, so everything it needs is declared within it.
    const stateData = await page.evaluate(() => {
      // Upper bound on the serialised state sent back from the page (about 500KB).
      const maxStateChars = 500000;
      // Try to access Instagram's internal React/Apollo cache
      const possibleKeys = [
        '_sharedData',
        '__PRIVATE_STATE__',
        '__additionalData',
        '__initialData',
        '__RELAY_STORE__'
      ];
      for (const key of possibleKeys) {
        const data = (window as any)[key];
        if (!data) {
          continue;
        }
        console.log(`[Extractor] Found internal state: ${key}`);

        let serialized: string | undefined;
        try {
          serialized = JSON.stringify(data);
        } catch (e) {
          // For example a circular structure; try the next global.
          console.log(`[Extractor] Could not serialize ${key}: ${e}`);
          continue;
        }
        if (typeof serialized !== 'string') {
          continue;
        }
        if (serialized.length > maxStateChars) {
          console.log(
            `[Extractor] Skipping ${key}: serialized state too large (${serialized.length} chars)`
          );
          continue;
        }
        return { key, data: serialized };
      }
      return null;
    });

    if (stateData) {
      console.log(`[Extractor] Parsing internal state from ${stateData.key}`);
      try {
        const parsed = JSON.parse(stateData.data);

        // Try multiple parsing strategies
        let result = parseInstagramData(parsed);
        console.log(`[Extractor] Standard parsing result: ${result?.bodyText?.length || 0} chars`);

        // Debug: log structure
        if (parsed.entry_data) {
          console.log(`[Extractor] Found entry_data with keys:`, Object.keys(parsed.entry_data));
        }
        if (parsed.config) {
          console.log(`[Extractor] Found config`);
        }

        // If standard parsing failed, try deep search for caption text
        if (!result || !result.bodyText || result.bodyText.length <= 130) {
          console.log(`[Extractor] Attempting deep search in ${stateData.key}...`);
          result = deepSearchForCaption(parsed);
          if (result) {
            console.log(`[Extractor] Deep search found: ${result.bodyText.length} chars`);
          } else {
            console.log(`[Extractor] Deep search found no caption`);
          }
        }

        if (result && result.bodyText && result.bodyText.length > 130) {
          console.log(
            `[Extractor] Successfully extracted from ${stateData.key}, length: ${result.bodyText.length}`
          );
          const thumbnail = await extractThumbnailStealth(page, progressCallback);
          return { ...result, thumbnail };
        } else if (result?.bodyText) {
          console.log(
            `[Extractor] Found text in ${stateData.key} but it's truncated (${result.bodyText.length} chars)`
          );
        }
      } catch (e) {
        console.log(`[Extractor] Failed to parse ${stateData.key}:`, e);
      }
    }
    return null;
  } catch (error) {
    logError('[Extractor] Failed to extract from internal state', error);
    return null;
  }
}
/**
 * Deep search for caption text in any nested object structure.
 *
 * At each level three shapes are checked, in order: `caption.text`,
 * `edge_media_to_caption.edges[0].node.text`, and a direct `text` string.
 * Only strings longer than 130 characters count. If none match, the own
 * enumerable properties are searched depth-first.
 *
 * Keys are enumerated with `Object.keys`, not `obj.hasOwnProperty(...)`: the
 * input is untrusted data, and an object with its own `hasOwnProperty` key or
 * a null prototype would make that method call throw.
 *
 * @param obj Value to search; anything that is not a non-null object yields `null`.
 * @param maxDepth Maximum nesting depth to descend into.
 * @param currentDepth Depth of `obj` (used by the recursion).
 * @returns The cleaned caption, or `null` when none is found.
 */
function deepSearchForCaption(
  obj: any,
  maxDepth = 10,
  currentDepth = 0
): Omit<ExtractedContent, 'thumbnail'> | null {
  if (currentDepth > maxDepth || !obj || typeof obj !== 'object') {
    return null;
  }

  // Look for caption/text fields
  if (obj.caption && typeof obj.caption === 'object' && obj.caption.text) {
    const text = obj.caption.text;
    if (typeof text === 'string' && text.length > 130) {
      return { bodyText: cleanText(text) };
    }
  }

  // Look for edge_media_to_caption pattern
  if (obj.edge_media_to_caption?.edges?.[0]?.node?.text) {
    const text = obj.edge_media_to_caption.edges[0].node.text;
    if (typeof text === 'string' && text.length > 130) {
      return { bodyText: cleanText(text) };
    }
  }

  // Look for direct text field in media items
  if (obj.text && typeof obj.text === 'string' && obj.text.length > 130) {
    // Make sure it's not just a UI label
    if (!obj.text.match(/^(more|less|follow|like|comment|share)$/i)) {
      return { bodyText: cleanText(obj.text) };
    }
  }

  // Recursively search all own enumerable properties (array indices included).
  for (const key of Object.keys(obj)) {
    const result = deepSearchForCaption(obj[key], maxDepth, currentDepth + 1);
    // Cleaning can shorten the text, so re-check the length of nested matches.
    if (result && result.bodyText.length > 130) {
      return result;
    }
  }
  return null;
}
/**
 * Pull a caption out of an intercepted GraphQL/API response.
 *
 * When `expectedShortcode` is given, the response is only considered if its
 * serialized JSON contains that shortcode somewhere; responses that do not
 * mention it are assumed to describe other content and are ignored.
 *
 * @param data - Parsed JSON body of the response
 * @param expectedShortcode - Shortcode of the post being extracted, if known
 * @returns The caption text found by the deep search, or `null`
 */
function extractCaptionFromGraphQL(data: any, expectedShortcode?: string): string | null {
	// Without a shortcode there is nothing to validate against.
	const mentionsExpectedPost =
		!expectedShortcode || JSON.stringify(data).includes(expectedShortcode);
	if (!mentionsExpectedPost) {
		return null;
	}

	const found = deepSearchForCaption(data);
	return found?.bodyText || null;
}
/**
 * Run the extraction strategies one after another and stop at the first one
 * that yields non-empty body text.
 *
 * Each attempt is announced through `onProgress` (type `method`), and the
 * winning attempt is announced again (type `status`). A strategy that throws
 * is logged and skipped so the remaining strategies still get their turn.
 *
 * @param url - URL of the post being extracted
 * @param page - Page that already has the post loaded
 * @param context - Browser context the page belongs to
 * @param onProgress - Optional progress listener
 * @returns A success result carrying the method and data, or a failure result
 */
async function extractWithStrategies(
	url: string,
	page: Page,
	context: BrowserContext,
	onProgress?: ProgressCallback
): Promise<ExtractionResult> {
	type Attempt = () => Promise<ExtractedContent | null>;

	// Last resort: legacy text extraction paired with a thumbnail lookup.
	const legacyAttempt: Attempt = async () => {
		const bodyText = await extractCleanTextLegacy(page);
		const thumbnail = await extractThumbnailStealth(page, onProgress);
		return { bodyText, thumbnail };
	};

	// Order matters: strategies are tried top to bottom.
	const pipeline: ReadonlyArray<readonly [ExtractionMethod, Attempt]> = [
		['embedded-json', () => extractFromEmbeddedJSON(page, onProgress)],
		['internal-state', () => extractFromInternalState(page, onProgress)],
		['html-section', () => extractFromHTMLSection(page, onProgress, url)],
		['dom-selector', () => extractFromDOM(page, onProgress)],
		['graphql-api', () => extractViaGraphQL(url, context)],
		['legacy', legacyAttempt]
	];

	// Log a message and forward it to the progress listener in one step.
	const announce = (type: ProgressEventType, method: ExtractionMethod, message: string): void => {
		console.log(`[Extractor] ${message}`);
		onProgress?.({
			type,
			message,
			method,
			timestamp: new Date().toISOString()
		});
	};

	for (const [method, attempt] of pipeline) {
		try {
			announce('method', method, `Trying extraction method: ${getMethodDisplayName(method)}`);

			const content = await attempt();
			if (content?.bodyText) {
				announce('status', method, `✓ Success with method: ${getMethodDisplayName(method)}`);
				return { success: true, method, data: content };
			}
		} catch (error) {
			// A failing strategy must not prevent the remaining ones from running.
			logError(`[Extractor] Method ${method} failed`, error);
		}
	}

	return { success: false, error: 'All extraction methods failed' };
}
/**
 * Extract text content and thumbnail from a URL using a Playwright browser.
 *
 * The whole attempt (browser context, page load, extraction) runs inside
 * `withRetry`. While the page loads, API responses are watched for a caption
 * that mentions the target shortcode; that caption is only used as an
 * emergency fallback when every strategy in `extractWithStrategies` fails.
 *
 * The page and browser context are always closed, even when page creation or
 * extraction throws; close failures are logged and never replace the outcome
 * of the extraction itself.
 *
 * @param url - The URL to extract from
 * @param onProgress - Optional callback to receive progress updates
 * @returns Extracted text and thumbnail
 * @throws Error when all strategies fail and no intercepted caption is available
 */
export async function extractTextAndThumbnail(
	url: string,
	onProgress?: ProgressCallback
): Promise<ExtractedContent> {
	onProgress?.({
		type: 'status',
		message: 'Starting extraction...',
		timestamp: new Date().toISOString()
	});

	return withRetry(
		async () => {
			const authPath = resolveAuthPath();
			const context = await createBrowserContext(authPath);
			let page: Page | null = null;

			try {
				// Created inside the try so the context is still closed if this throws.
				page = await context.newPage();

				// Extract shortcode for validation
				const expectedShortcode = extractShortcode(url);
				console.log(`[Extractor] Target shortcode: ${expectedShortcode || 'unknown'}`);

				page.setDefaultTimeout(30000);

				// Set up response interception BEFORE loading the page so the
				// initial network requests are seen. A holder object is used
				// because the value is written from the listener and read later.
				const intercepted: { caption: string | null } = { caption: null };
				page.on('response', async (response) => {
					try {
						const responseUrl = response.url();
						const looksLikeApiCall =
							responseUrl.includes('graphql') ||
							responseUrl.includes('api/v1') ||
							responseUrl.includes('/web/');
						if (!looksLikeApiCall) {
							return;
						}

						const json = await response.json();
						const captionData = extractCaptionFromGraphQL(json, expectedShortcode ?? undefined);
						if (captionData && captionData.length > 130) {
							intercepted.caption = captionData;
							console.log(
								`[Extractor] ✓ Intercepted GraphQL with full caption: ${captionData.length} chars (shortcode verified)`
							);
						}
					} catch {
						// Not JSON, body unavailable, or parse error: skip this response.
					}
				});

				onProgress?.({
					type: 'status',
					message: 'Loading Instagram page...',
					timestamp: new Date().toISOString()
				});
				await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30000 });

				// Add small human-like delay
				await page.waitForTimeout(1000 + Math.random() * 2000);

				// Scroll down in two steps, then back to the top, to trigger
				// additional requests.
				console.log('[Extractor] Scrolling to trigger lazy loading...');
				for (const scrollStep of [300, 300]) {
					await page.evaluate((dy) => {
						window.scrollBy(0, dy);
					}, scrollStep);
					await page.waitForTimeout(1500);
				}
				await page.evaluate(() => {
					window.scrollTo(0, 0);
				});
				await page.waitForTimeout(1000);

				// Always use DOM extraction (HTML Section): it clicks "… more" in
				// the browser and gets the fully expanded caption. The GraphQL
				// interception is unreliable: Instagram often truncates captions
				// in API responses without any "…." marker, so the intercepted
				// text cannot be trusted to be complete.
				if (intercepted.caption) {
					console.log(
						`[Extractor] Intercepted GraphQL caption (${intercepted.caption.length} chars) — always using DOM extraction for full text`
					);
				}

				const result = await extractWithStrategies(url, page, context, onProgress);

				if (!result.success || !result.data) {
					// Read the holder now rather than before the strategies ran, so a
					// caption intercepted while they were running is also usable.
					const fallbackCaption = intercepted.caption;
					if (fallbackCaption) {
						console.log(
							'[Extractor] DOM extraction failed — using intercepted GraphQL caption as fallback'
						);
						const thumbnail = await extractThumbnailStealth(page, onProgress);
						// NOTE(review): unlike the main path, this path returns without
						// emitting a 'complete' progress event; confirm that is intended.
						return { bodyText: cleanText(fallbackCaption), thumbnail };
					}
					throw new Error(result.error || 'Extraction failed');
				}

				// Debug output is best-effort: a read-only or full disk must not
				// turn a successful extraction into a failure.
				try {
					fs.writeFileSync(
						path.resolve('debug_page.txt'),
						`Method: ${result.method}\n\n${result.data.bodyText}`
					);
				} catch (debugError) {
					logError('[Extractor] Failed to write debug output', debugError);
				}

				onProgress?.({
					type: 'complete',
					message: 'Extraction completed successfully',
					method: result.method,
					timestamp: new Date().toISOString()
				});

				return result.data;
			} finally {
				// Close each resource independently: a failure closing the page must
				// not skip closing the context, and neither may mask the real result.
				if (page) {
					try {
						await page.close();
					} catch (closeError) {
						logError('[Extractor] Failed to close page', closeError);
					}
				}
				try {
					await context.close();
				} catch (closeError) {
					logError('[Extractor] Failed to close browser context', closeError);
				}
			}
		},
		DEFAULT_RETRY_CONFIG,
		onProgress
	);
}
/**
 * Screenshot-based thumbnail extraction (fallback method).
 *
 * Captures the part of the first `<video>` element that is inside the
 * viewport. When there is no video, or none of it is inside the viewport, an
 * unclipped screenshot of the page is taken instead.
 *
 * @param page - Playwright page instance
 * @returns A `data:image/jpeg;base64,...` URI of the screenshot
 */
async function extractThumbnailScreenshot(page: Page): Promise<string | null> {
	const videoClip = await page.evaluate(() => {
		const video = document.querySelector('video');
		if (!video) return null;

		const rect = video.getBoundingClientRect();

		// Intersect the video rectangle with the viewport. Clamping only the
		// origin (and not the far edges) would keep the full width/height even
		// when the video starts off-screen, capturing pixels beside the video.
		const left = Math.max(0, rect.left);
		const top = Math.max(0, rect.top);
		const right = Math.min(window.innerWidth, rect.left + rect.width);
		const bottom = Math.min(window.innerHeight, rect.top + rect.height);

		return {
			x: left,
			y: top,
			width: right - left,
			height: bottom - top
		};
	});

	let screenshotBuffer: Buffer;
	if (videoClip && videoClip.width > 0 && videoClip.height > 0) {
		screenshotBuffer = await page.screenshot({
			type: 'jpeg',
			quality: 85,
			clip: videoClip
		});
	} else {
		// An empty or negative intersection means the video is not visible at all.
		console.warn('[Thumbnail] Video element not found or has no size, taking full page screenshot');
		screenshotBuffer = await page.screenshot({ type: 'jpeg', quality: 85 });
	}

	return `data:image/jpeg;base64,${screenshotBuffer.toString('base64')}`;
}
/**
 * Helper: fetch an image from a URL and convert it to a base64 data URI.
 *
 * **Validation criteria:**
 * - HTTP status must be exactly 200 (not any 2xx, only 200)
 * - Content-Type must start with `image/` (e.g. image/jpeg, image/png, image/webp)
 * - The whole request, including the body download, must finish within
 *   10 seconds
 *
 * **Failure scenarios** (all return `null` and report through the callback):
 * - Non-200 status: the status code is reported
 * - Invalid content-type: the content-type is reported
 * - Timeout: a timeout message is reported
 * - Network error: the error message is reported
 *
 * A `null` result lets the caller move on to its next thumbnail source.
 *
 * @param imageUrl - The image URL to fetch
 * @param progressCallback - Optional callback for progress reporting
 * @returns Base64 data URI (`data:image/*;base64,...`) or `null` if fetching or validation fails
 *
 * @example
 * ```typescript
 * const thumbnail = await fetchImageAsBase64(
 *   'https://instagram.com/image.jpg',
 *   (event) => console.log(event.message)
 * );
 *
 * if (thumbnail) {
 *   console.log(thumbnail.substring(0, 50)); // "data:image/jpeg;base64,/9j/4AAQSkZJRg..."
 * } else {
 *   // URL validation failed, try next method
 * }
 * ```
 */
async function fetchImageAsBase64(
	imageUrl: string,
	progressCallback?: ProgressCallback
): Promise<string | null> {
	const reportStatus = (message: string): void => {
		progressCallback?.({
			type: 'status',
			message,
			timestamp: new Date().toISOString()
		});
	};

	// The abort timer stays armed until the body has been read (or the request
	// has failed); it is cleared in `finally` so it never outlives this call.
	const controller = new AbortController();
	const timeoutId = setTimeout(() => controller.abort(), 10000); // 10s timeout

	try {
		console.log(`[Thumbnail] Validating URL: ${imageUrl}`);
		const response = await fetch(imageUrl, {
			signal: controller.signal
		});

		// Strict status validation: must be exactly 200
		if (response.status !== 200) {
			console.warn(`[Thumbnail] URL validation failed: HTTP ${response.status} for ${imageUrl}`);
			reportStatus(`Thumbnail URL returned HTTP ${response.status}, trying next method...`);
			return null;
		}

		// Validate content-type
		const contentType = response.headers.get('content-type') || '';
		if (!contentType.startsWith('image/')) {
			console.warn(
				`[Thumbnail] URL validation failed: Invalid content-type '${contentType}' for ${imageUrl}`
			);
			reportStatus(
				`Thumbnail URL returned non-image content (${contentType}), trying next method...`
			);
			return null;
		}

		console.log(`[Thumbnail] URL validation successful: ${imageUrl} (${contentType})`);

		// Still covered by the abort signal: a stalled body download is cut off too.
		const arrayBuffer = await response.arrayBuffer();
		const buffer = Buffer.from(arrayBuffer);
		const base64Data = `data:${contentType};base64,${buffer.toString('base64')}`;

		reportStatus('Thumbnail fetched and validated from URL');
		return base64Data;
	} catch (e) {
		if (e instanceof Error) {
			if (e.name === 'AbortError') {
				console.error(`[Thumbnail] URL fetch timeout: ${imageUrl}`);
				reportStatus('Thumbnail URL fetch timeout, trying next method...');
			} else {
				console.error(`[Thumbnail] Failed to fetch image from ${imageUrl}:`, e.message);
				reportStatus(`Thumbnail URL fetch failed (${e.message}), trying next method...`);
			}
		} else {
			logError('[Thumbnail] Failed to fetch image', e);
		}
		return null;
	} finally {
		clearTimeout(timeoutId);
	}
}
/**
 * Extract a thumbnail for the loaded post, trying the least intrusive sources
 * first.
 *
 * Sources, in order:
 * 1. Meta tags (`og:image`, then `twitter:image`)
 * 2. The `poster` attribute of the first `<video>` element
 * 3. `window.__additionalDataLoaded` (`display_url` / `thumbnail_src`)
 * 4. Screenshot fallback
 *
 * For sources 1-3 the discovered URL is downloaded and validated with
 * `fetchImageAsBase64`; a URL that fails validation is skipped and the next
 * source is tried. Every successful path therefore returns a base64 data URI
 * (`data:image/...;base64,...`), never a plain HTTPS URL.
 *
 * Attributes are read without waiting for the element to appear, so a post
 * that lacks a meta tag or a video does not stall on the page's default
 * timeout. A failure in one source never prevents the following ones from
 * running, and a failing screenshot yields `null` instead of throwing.
 *
 * @param page - Playwright page instance
 * @param progressCallback - Optional progress callback; receives a `thumbnail` event on success
 * @returns Base64 data URI of the thumbnail, or `null` if all methods fail
 */
async function extractThumbnailStealth(
	page: Page,
	progressCallback?: ProgressCallback
): Promise<string | null> {
	console.log('[Thumbnail] Starting stealth extraction');

	// page.getAttribute(selector, ...) waits for the selector to match; page.$
	// resolves to null immediately when nothing matches.
	const readAttributeIfPresent = async (
		selector: string,
		attribute: string
	): Promise<string | null> => {
		const handle = await page.$(selector);
		if (!handle) return null;
		try {
			return await handle.getAttribute(attribute);
		} finally {
			await handle.dispose();
		}
	};

	const reportThumbnail = (message: string, thumbnail: string): void => {
		progressCallback?.({
			type: 'thumbnail',
			message,
			data: { thumbnail },
			timestamp: new Date().toISOString()
		});
	};

	// Looks through Instagram's in-page data for a display or thumbnail URL.
	const findUrlInPageData = (): Promise<unknown> =>
		page.evaluate(() => {
			const data = (window as any).__additionalDataLoaded;
			if (!data) return null;
			for (const item of Object.values<any>(data)) {
				const media = item?.graphql?.shortcode_media;
				if (media?.display_url) return media.display_url;
				if (media?.thumbnail_src) return media.thumbnail_src;
			}
			return null;
		});

	const urlSources: Array<{
		foundLog: string;
		successMessage: string;
		failureLog: string;
		findUrl: () => Promise<unknown>;
	}> = [
		{
			foundLog: '[Thumbnail] Found og:image meta tag',
			successMessage: 'Thumbnail extracted from meta tags',
			failureLog: '[Thumbnail] Meta tag method failed',
			findUrl: () => readAttributeIfPresent('meta[property="og:image"]', 'content')
		},
		{
			foundLog: '[Thumbnail] Found twitter:image meta tag',
			successMessage: 'Thumbnail extracted from meta tags',
			failureLog: '[Thumbnail] Meta tag method failed',
			findUrl: () => readAttributeIfPresent('meta[name="twitter:image"]', 'content')
		},
		{
			foundLog: '[Thumbnail] Found video poster attribute',
			successMessage: 'Thumbnail extracted from video poster',
			failureLog: '[Thumbnail] Video poster method failed',
			findUrl: () => readAttributeIfPresent('video', 'poster')
		},
		{
			foundLog: '[Thumbnail] Found thumbnail in Instagram data structures',
			successMessage: 'Thumbnail extracted from Instagram data',
			failureLog: '[Thumbnail] Instagram data method failed',
			findUrl: findUrlInPageData
		}
	];

	for (const source of urlSources) {
		try {
			const candidateUrl = await source.findUrl();
			if (typeof candidateUrl !== 'string' || !candidateUrl) {
				continue;
			}
			console.log(source.foundLog);

			const dataUri = await fetchImageAsBase64(candidateUrl, progressCallback);
			if (dataUri) {
				reportThumbnail(source.successMessage, dataUri);
				return dataUri;
			}
		} catch (e) {
			// Each source is isolated so one failure cannot skip the others.
			logError(source.failureLog, e);
		}
	}

	// Method 4: Screenshot fallback
	console.log('[Thumbnail] Falling back to screenshot method');
	let screenshotThumbnail: string | null;
	try {
		screenshotThumbnail = await extractThumbnailScreenshot(page);
	} catch (e) {
		// A missing thumbnail must not abort the text extraction that called us.
		logError('[Thumbnail] Screenshot method failed', e);
		return null;
	}

	if (screenshotThumbnail) {
		reportThumbnail('Thumbnail extracted via screenshot', screenshotThumbnail);
	}
	return screenshotThumbnail;
}