/*
 * Commit notes (residue from the repository web view):
 *
 * - Add instagram-extractor.ts: yt-dlp subprocess backend for Instagram caption
 *   extraction. No in-process browser state, maintained against Instagram
 *   frontend churn, supports cookies.txt for auth-walled reels.
 * - Add feature flag EXTRACTOR_BACKEND (ytdlp|playwright) in QueueProcessor so
 *   the old Playwright path remains available as fallback.
 * - Add 9 unit tests and 2 live-network integration tests for the new extractor.
 * - Dockerfile: install yt-dlp via pip3 alongside existing Chromium deps.
 * - docker-compose: expose EXTRACTOR_BACKEND env var (default: ytdlp).
 *
 * Also in this commit:
 *
 * - LLM: configurable per-request timeout via LLM_REQUEST_TIMEOUT_MS (default
 *   120s); set maxRetries=0 to surface errors immediately; llama-swap /running
 *   health probe.
 * - QueueProcessor: thread progress callback through parser phase.
 * - LlmHealthIndicator: surface llama-swap loaded-model name.
 * - Logging: improve error serialization in queue-processor tests.
 * - .env.example: document llama-swap endpoint and model options.
 *
 * Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
 *
 * File view metadata: 1726 lines, 50 KiB, TypeScript
 */
import { createBrowserContext } from './browser';
import { logError } from './utils/logger';
import fs from 'fs';
import path from 'path';
import type { Page, BrowserContext } from 'playwright';

/**
 * Content pulled from a single Instagram post by one of the extraction
 * strategies in this module.
 */
export interface ExtractedContent {
  /** Caption text of the post (strategies pass it through `cleanText`). */
  bodyText: string;
  /** Thumbnail value for the post, or `null` when none is available. */
  thumbnail: string | null;
}

/**
 * Identifier of an extraction strategy. Used to tag progress events and
 * extraction results with the strategy that produced them.
 */
export type ExtractionMethod =
  | 'embedded-json'
  | 'internal-state'
  | 'html-section'
  | 'dom-selector'
  | 'graphql-api'
  | 'graphql-intercept'
  | 'legacy';

/**
 * A DOM element considered as the possible caption of a post, together with
 * the data used to rank it against other candidates.
 */
type CaptionCandidate = {
  /** The element the candidate was read from. */
  element: Element;
  /** Plain text content of the element. */
  text: string;
  /** Heuristic ranking score; the highest-scoring candidate wins. */
  score: number;
  /** Raw inner HTML of the element (keeps `<br>` line breaks). */
  innerHTML: string;
  /** Number of `<br>` tags found in `innerHTML`. */
  brCount: number;
};

/** Kind of progress notification carried by a `ProgressEvent`. */
export type ProgressEventType =
  | 'status'
  | 'method'
  | 'retry'
  | 'error'
  | 'thumbnail'
  | 'complete'
  | 'model_loading';

/**
 * Progress notification delivered to a `ProgressCallback` while an extraction
 * is running.
 */
export interface ProgressEvent {
  /** What kind of notification this is. */
  type: ProgressEventType;
  /** Human-readable description of the event. */
  message: string;
  /** Extraction strategy the event relates to, when applicable. */
  method?: ExtractionMethod;
  /** 1-based number of the attempt that just ran (set on retry events). */
  attemptNumber?: number;
  /** Total number of attempts allowed (set on retry events). */
  maxAttempts?: number;
  /** Free-form payload; its shape depends on the emitter. */
  data?: any;
  /** Time of the event; emitters in this module use an ISO-8601 string. */
  timestamp?: string;
}

/** Listener invoked with each `ProgressEvent`; its return value is ignored. */
export type ProgressCallback = (event: ProgressEvent) => void;

/** Outcome of running an extraction strategy. */
interface ExtractionResult {
  /** Whether the strategy produced usable content. */
  success: boolean;
  /** Strategy that produced this result. */
  method?: ExtractionMethod;
  /** Extracted content, when available. */
  data?: ExtractedContent;
  /** Failure description, when available. */
  error?: string;
}

/**
 * Shape of the JSON object parsed from the `window._sharedData` script tag.
 * Every level is optional, so consumers must navigate it defensively.
 */
interface InstagramEmbeddedData {
  entry_data?: {
    PostPage?: Array<{
      graphql?: {
        shortcode_media?: {
          /** Caption edges; each node carries one block of caption text. */
          edge_media_to_caption?: {
            edges?: Array<{ node: { text: string } }>;
          };
          display_url?: string;
          video_url?: string;
          owner?: {
            username: string;
            profile_pic_url: string;
          };
        };
      };
    }>;
  };
}

/** Tuning knobs for `withRetry`'s exponential backoff. */
interface RetryConfig {
  /** Maximum number of times the operation is invoked. */
  maxAttempts: number;
  /** Delay before the second attempt, in milliseconds. */
  initialDelayMs: number;
  /** Upper bound for the delay between attempts, in milliseconds. */
  maxDelayMs: number;
  /** Factor the delay is multiplied by after each failed attempt. */
  backoffMultiplier: number;
}

/**
 * Retry policy used when a caller does not supply one: up to 3 attempts,
 * waiting 1s and then 2s between them (delays are capped at 10s).
 */
const DEFAULT_RETRY_CONFIG: RetryConfig = {
  maxAttempts: 3,
  initialDelayMs: 1000,
  maxDelayMs: 10000,
  backoffMultiplier: 2
};

/**
 * Resolve the authentication storage file.
 *
 * Candidates are checked in priority order: the Docker mount path first, then
 * the path relative to the current working directory.
 *
 * @returns The first candidate path that exists, or `undefined` if none does.
 */
function resolveAuthPath(): string | undefined {
  const candidates = ['/app/secrets/auth.json', './secrets/auth.json'];
  return candidates.find((candidate) => fs.existsSync(candidate));
}

/**
 * Pause for the given number of milliseconds (used between retry attempts).
 *
 * @param ms - Delay in milliseconds, passed straight to `setTimeout`.
 * @returns A promise that resolves (with no value) once the timer fires.
 */
async function sleep(ms: number): Promise<void> {
  return new Promise<void>((resolve) => {
    setTimeout(resolve, ms);
  });
}

/**
 * Decide whether an error makes further retries pointless.
 *
 * Authentication failures and invalid URLs will not succeed on a second
 * attempt. Matching is case-insensitive so that messages such as Node's own
 * `"Invalid URL"` or `"Authentication failed"` are recognised as well.
 *
 * @param error - The value caught from the failed operation.
 * @returns `true` when `error` is an `Error` whose message marks it as
 *   non-retriable; `false` otherwise (including for non-`Error` values).
 */
function isNonRetriableError(error: unknown): boolean {
  if (!(error instanceof Error)) {
    return false;
  }

  const nonRetriableMarkers = ['authentication', 'login required', 'invalid url'];
  const message = error.message.toLowerCase();
  return nonRetriableMarkers.some((marker) => message.includes(marker));
}

/**
 * Get the human-readable display name for an extraction method.
 *
 * The lookup table is typed as `Record<ExtractionMethod, string>`, so adding
 * a new method without a display name is a compile error.
 *
 * @param method - Extraction method identifier.
 * @returns The label to show for `method`.
 */
function getMethodDisplayName(method: ExtractionMethod): string {
  const displayNames: Record<ExtractionMethod, string> = {
    'embedded-json': 'Embedded JSON',
    'internal-state': 'Internal State',
    'html-section': 'HTML Section',
    'dom-selector': 'DOM Selector',
    'graphql-api': 'GraphQL API',
    'graphql-intercept': 'GraphQL Intercept',
    legacy: 'Legacy Parser'
  };
  return displayNames[method];
}

/**
 * Run an async operation, retrying with exponential backoff when it fails.
 *
 * - Errors classified by `isNonRetriableError` are reported through
 *   `onProgress` (type `'error'`) and rethrown immediately.
 * - Other failures are logged, reported (type `'retry'`), and retried after a
 *   delay that starts at `config.initialDelayMs`, is multiplied by
 *   `config.backoffMultiplier` after each failure, and is capped at
 *   `config.maxDelayMs`.
 * - After the last attempt fails, the last thrown value is rethrown unchanged.
 *
 * @param fn - Operation to run; invoked at most `config.maxAttempts` times.
 * @param config - Retry policy; defaults to `DEFAULT_RETRY_CONFIG`.
 * @param onProgress - Optional listener for retry and error notifications.
 * @returns The value resolved by the first successful call to `fn`.
 * @throws The last value thrown by `fn`, or an `Error` if `fn` was never
 *   invoked (e.g. `maxAttempts` is 0).
 */
async function withRetry<T>(
  fn: () => Promise<T>,
  config: RetryConfig = DEFAULT_RETRY_CONFIG,
  onProgress?: ProgressCallback
): Promise<T> {
  let lastError: unknown = null;
  let delay = config.initialDelayMs;

  for (let attempt = 1; attempt <= config.maxAttempts; attempt++) {
    try {
      return await fn();
    } catch (error) {
      lastError = error;

      if (isNonRetriableError(error)) {
        // Anything thrown can land here, so do not assume an Error instance
        // when building the message.
        const reason = error instanceof Error ? error.message : String(error);
        onProgress?.({
          type: 'error',
          message: `Non-retriable error: ${reason}`,
          timestamp: new Date().toISOString()
        });
        throw error;
      }

      // The final failure falls through to the throw below without waiting.
      if (attempt < config.maxAttempts) {
        const message = `Attempt ${attempt}/${config.maxAttempts} failed. Retrying in ${delay}ms...`;
        logError(`[Retry] ${message}`, error);

        onProgress?.({
          type: 'retry',
          message,
          attemptNumber: attempt,
          maxAttempts: config.maxAttempts,
          timestamp: new Date().toISOString()
        });

        await sleep(delay);
        delay = Math.min(delay * config.backoffMultiplier, config.maxDelayMs);
      }
    }
  }

  throw lastError || new Error('Max retry attempts exceeded');
}

/**
 * Extract the post shortcode from an Instagram URL.
 *
 * Recognises `/p/`, `/reel/`, `/reels/` and `/tv/` paths; the shortcode is the
 * run of letters, digits, `_` and `-` that follows.
 *
 * @param url - Instagram URL (query string and trailing path are ignored).
 * @returns The shortcode, or `undefined` when the URL has no recognised path.
 */
function extractShortcode(url: string): string | undefined {
  const match = url.match(/\/(p|reel|reels|tv)\/([A-Za-z0-9_-]+)/);
  return match?.[2];
}

/**
 * Lower-case keywords (Italian and English) whose presence marks a piece of
 * text as likely recipe content. Used for caption scoring.
 */
const RECIPE_KEYWORDS = [
  'ingredienti',
  'procedimento',
  'preparazione',
  'ricetta',
  'recipe',
  'instructions'
];

/**
 * Timeout and wait durations used throughout the extractor, in milliseconds.
 */
const TIMEOUTS = {
  CONTENT_LOAD: 1500,
  MORE_BUTTON_VISIBILITY: 1000,
  CAPTION_EXPANSION: 3000,
  MORE_BUTTON_VISIBILITY_DOM: 500,
  MORE_BUTTON_CLICK: 800,
  PAGE_LOAD: 10000,
  NETWORK_SETTLE: 2000,
  ARTICLE_SELECTOR: 5000,
  GRAPHQL_WAIT: 1000,
  PAGE_NAVIGATION: 30000,
  ANTI_DETECTION_MIN: 1000,
  ANTI_DETECTION_MAX: 3000
} as const;

/**
 * Try to expand a truncated caption by clicking its "more" control
 * (HTML-section strategy).
 *
 * Looks inside the first `article` / `main` / `[role="main"]` container and
 * tries a list of locator patterns in order; the first visible match is
 * clicked and the search stops. This is best-effort: every failure is logged
 * and swallowed so that extraction can continue with the truncated caption.
 *
 * @param page - Playwright page already showing the target post.
 */
async function tryExpandCaptionInHTMLSection(page: Page): Promise<void> {
  console.log('[Extractor] Looking for "more" button in primary post container...');
  try {
    await page.waitForTimeout(TIMEOUTS.CONTENT_LOAD);

    const mainContainer = page.locator('article, main, [role="main"]').first();
    if ((await mainContainer.count()) === 0) {
      console.log('[Extractor] No main container found');
      return;
    }

    console.log('[Extractor] Found main post container, searching for "more" button...');

    // Ordered from most to least specific; the first visible match wins.
    const morePatterns = [
      {
        locator: mainContainer.locator('span').filter({ hasText: /\.\.\.\s*more/i }),
        desc: "span with '...more'"
      },
      {
        locator: mainContainer.locator('span').filter({ hasText: /…\s*more/i }),
        desc: "span with '… more'"
      },
      {
        locator: mainContainer.locator('div[role="button"]').filter({ hasText: /more/i }),
        desc: "button with 'more'"
      },
      {
        locator: mainContainer.locator('span[role="button"]').filter({ hasText: /more/i }),
        desc: "span button with 'more'"
      }
    ];

    for (const pattern of morePatterns) {
      const count = await pattern.locator.count();
      console.log(`[Extractor] Checking ${pattern.desc}: found ${count}`);

      if (count === 0) continue;

      const firstMore = pattern.locator.first();
      try {
        // NOTE(review): Playwright documents the `timeout` option of
        // isVisible() as ignored (the check does not wait) — confirm whether
        // an explicit waitFor() is wanted here.
        if (await firstMore.isVisible({ timeout: TIMEOUTS.MORE_BUTTON_VISIBILITY })) {
          const text = await firstMore.textContent();
          console.log(`[Extractor] Found visible "more": "${text}"`);
          await firstMore.click();
          console.log('[Extractor] Clicked "more" - waiting for expansion...');
          await page.waitForTimeout(TIMEOUTS.CAPTION_EXPANSION);
          console.log('[Extractor] Caption expansion complete');
          break;
        }
      } catch (e) {
        // A pattern that cannot be clicked is not fatal; try the next one.
        console.log(`[Extractor] ${pattern.desc} not clickable: ${e}`);
      }
    }

    console.log('[Extractor] Finished "more" button expansion attempt');
  } catch (e) {
    console.log(`[Extractor] Error while trying to expand caption: ${e}`);
  }
}

/**
 * Try to expand a truncated caption by clicking "more" controls
 * (DOM-selector strategy).
 *
 * Each round walks the selector list and clicks the first visible match.
 * Rounds repeat (up to three successful clicks) because a click may reveal
 * another "more" control; the loop stops as soon as a round clicks nothing.
 * Best-effort: selector failures are ignored and never thrown to the caller.
 *
 * @param page - Playwright page already showing the target post.
 */
async function tryExpandCaptionInDOM(page: Page): Promise<void> {
  const moreButtonSelectors = [
    'article button:has-text("more")',
    'article button:has-text("More")',
    'article button:has-text("… more")',
    'article span[role="button"]:has-text("more")',
    'article [role="button"]:has-text("more")',
    'article div[role="button"]:has-text("more")',
    'xpath=//article//span[contains(text(), "more")]/..',
    'xpath=//article//button[contains(., "more")]'
  ];

  const maxExpandAttempts = 3;

  /** Click the first visible "more" control; resolves to whether one was clicked. */
  const clickFirstVisibleMoreButton = async (): Promise<boolean> => {
    for (const selector of moreButtonSelectors) {
      try {
        const button = page.locator(selector).first();
        if (await button.isVisible({ timeout: TIMEOUTS.MORE_BUTTON_VISIBILITY_DOM })) {
          await button.click();
          await page.waitForTimeout(TIMEOUTS.MORE_BUTTON_CLICK);
          console.log(`[Extractor] Clicked "more" button with selector: ${selector}`);
          return true;
        }
      } catch (e) {
        // Deliberately ignored: a selector that is invalid for this page or
        // whose click fails simply means the next selector should be tried.
      }
    }
    return false;
  };

  for (let expandAttempts = 0; expandAttempts < maxExpandAttempts; expandAttempts++) {
    const clicked = await clickFirstVisibleMoreButton();
    if (!clicked) break;
  }
}

/**
 * Clean up extracted caption text.
 *
 * Steps, in order:
 * 1. `<br>` tags become newlines; all other HTML tags are stripped.
 * 2. Common HTML entities are decoded.
 * 3. Instagram UI phrases ("View all N comments", "Liked by …", …) are removed.
 * 4. Whitespace is normalised while keeping intentional line breaks
 *    (at most one blank line in a row).
 * 5. A trailing run of hashtags is removed.
 *
 * @param text - Raw caption text or inner HTML.
 * @returns The cleaned, trimmed caption.
 */
export function cleanText(text: string): string {
  let cleaned = text;

  // Convert <br> to newlines first so line breaks survive tag stripping.
  cleaned = cleaned.replace(/<br\s*\/?>/gi, '\n');

  // Strip all other HTML tags while keeping their text content.
  cleaned = cleaned.replace(/<[^>]+>/g, '');

  // Decode HTML entities. `&amp;` is decoded last so that already-escaped
  // text such as `&amp;lt;` becomes `&lt;` rather than being decoded twice.
  cleaned = cleaned
    .replace(/&lt;/g, '<')
    .replace(/&gt;/g, '>')
    .replace(/&quot;/g, '"')
    .replace(/&#39;|&#x27;|&apos;/gi, "'")
    .replace(/&nbsp;/g, ' ')
    .replace(/&amp;/g, '&');

  // Remove common UI text patterns.
  const uiPatterns = [
    /More posts from.+/gi,
    /View all \d+ comments/gi,
    /Add a comment\.\.\./gi,
    /Liked by.+?(?=\n|$)/gi
  ];
  for (const pattern of uiPatterns) {
    cleaned = cleaned.replace(pattern, '');
  }

  // Trim spaces/tabs at the end and start of every line.
  cleaned = cleaned.replace(/[ \t]+$/gm, '');
  cleaned = cleaned.replace(/^[ \t]+/gm, '');

  // Collapse three or more consecutive line breaks into one blank line.
  cleaned = cleaned.replace(/\n\s*\n\s*\n+/g, '\n\n');

  // Remove spaces around newlines.
  cleaned = cleaned.replace(/ *\n */g, '\n');

  // Normalise runs of spaces within a line to a single space.
  cleaned = cleaned.replace(/ {2,}/g, ' ');

  // Remove a trailing run of hashtags (supports Latin-extended and Cyrillic).
  cleaned = cleaned.replace(/(#[\w\u00C0-\u024F\u1E00-\u1EFF\u0400-\u04FF]+\s*)+$/gi, '');

  return cleaned.trim();
}

/**
 * Decode the body of a JSON string literal (the text between the quotes).
 *
 * `JSON.parse` handles every escape sequence correctly (`\\`, `\/`, `\t`,
 * `\uXXXX`, …). If the captured text is not a valid JSON string body, the
 * simple replacement chain is used as a fallback.
 */
function decodeJsonStringBody(raw: string): string {
  try {
    const decoded: unknown = JSON.parse(`"${raw}"`);
    if (typeof decoded === 'string') {
      return decoded;
    }
  } catch {
    // Not a valid JSON string body; fall back to manual unescaping below.
  }

  return raw
    .replace(/\\n/g, '\n')
    .replace(/\\"/g, '"')
    .replace(/\\u([0-9a-fA-F]{4})/g, (_, code: string) => String.fromCharCode(parseInt(code, 16)))
    .replace(/\\\\/g, '\\');
}

/**
 * Strategy 1: Extract the caption from JSON embedded in `<script>` tags.
 *
 * Every script tag is inspected, in document order, for:
 * 1. a `window._sharedData = {...};` assignment;
 * 2. a `window.__additionalDataLoaded(..., {...});` call;
 * 3. a large (> 10000 chars) script mentioning `"caption"` or `"text"`, which
 *    is parsed as JSON or, failing that, searched with caption regexes.
 *    Captions of 130 characters or fewer are rejected in this third path.
 *
 * The first script that yields a caption wins; the thumbnail is then fetched
 * with `extractThumbnailStealth`.
 *
 * @param page - Playwright page showing the post.
 * @param progressCallback - Optional progress listener, forwarded to the
 *   thumbnail extraction.
 * @returns The extracted content, or `null` when nothing was found or an
 *   error occurred (errors are logged, never thrown).
 */
async function extractFromEmbeddedJSON(
  page: Page,
  progressCallback?: ProgressCallback
): Promise<ExtractedContent | null> {
  try {
    // Collect all script tag contents plus a small summary for logging.
    const scriptInfo = await page.evaluate(() => {
      const scripts = Array.from(document.querySelectorAll('script'));
      const scriptData = scripts.map((script) => ({
        type: script.getAttribute('type') || 'no-type',
        hasContent: !!script.textContent,
        length: script.textContent?.length || 0,
        preview: script.textContent?.substring(0, 100) || ''
      }));
      console.log(`[Extractor] Found ${scripts.length} script tags`);
      return {
        contents: scripts.map((script) => script.textContent || ''),
        info: scriptData
      };
    });

    console.log(`[Extractor] Script tags summary:`, scriptInfo.info);

    for (let i = 0; i < scriptInfo.contents.length; i++) {
      const content = scriptInfo.contents[i] ?? '';

      // Pattern 1: window._sharedData
      const sharedDataMatch = content.match(/window\._sharedData\s*=\s*(\{.+?\});/s);
      if (sharedDataMatch) {
        console.log(`[Extractor] Found _sharedData in script ${i}`);
        try {
          const data: InstagramEmbeddedData = JSON.parse(sharedDataMatch[1] ?? '');
          const result = parseInstagramData(data);
          if (result) {
            const thumbnail = await extractThumbnailStealth(page, progressCallback);
            return { ...result, thumbnail };
          }
        } catch (e) {
          logError('[Extractor] Failed to parse _sharedData', e);
        }
      }

      // Pattern 2: window.__additionalDataLoaded
      const additionalDataMatch = content.match(
        /window\.__additionalDataLoaded\([^,]+,\s*(\{.+?\})\);/s
      );
      if (additionalDataMatch) {
        console.log(`[Extractor] Found __additionalDataLoaded in script ${i}`);
        try {
          const data = JSON.parse(additionalDataMatch[1] ?? '');
          const result = parseInstagramData(data);
          if (result) {
            const thumbnail = await extractThumbnailStealth(page, progressCallback);
            return { ...result, thumbnail };
          }
        } catch (e) {
          logError('[Extractor] Failed to parse __additionalDataLoaded', e);
        }
      }

      // Pattern 3: any large JSON blob with caption data (new Instagram format)
      const mentionsCaption = content.includes('"caption"') || content.includes('"text"');
      if (!mentionsCaption || content.length <= 10000) {
        continue;
      }

      console.log(
        `[Extractor] Attempting to extract from large JSON in script ${i} (length: ${content.length})`
      );
      try {
        // Try to parse the whole script as JSON.
        const jsonData = JSON.parse(content);

        // Deep search first.
        const deepResult = deepSearchForCaption(jsonData);
        if (deepResult && deepResult.bodyText && deepResult.bodyText.length > 130) {
          console.log(
            `[Extractor] Deep search in JSON found caption: ${deepResult.bodyText.length} chars`
          );
          const thumbnail = await extractThumbnailStealth(page, progressCallback);
          return { ...deepResult, thumbnail };
        }

        // Then the standard structure.
        const result = parseInstagramData(jsonData);
        if (result && result.bodyText && result.bodyText.length > 130) {
          console.log(
            `[Extractor] Successfully extracted from JSON, text length: ${result.bodyText.length}`
          );
          const thumbnail = await extractThumbnailStealth(page, progressCallback);
          return { ...result, thumbnail };
        }
      } catch (e) {
        // Not direct JSON (or processing failed): look for caption fields
        // with regexes covering different Instagram JSON structures.
        console.log(`[Extractor] JSON parse failed, trying regex extraction...`);
        const patterns = [
          /"caption"\s*:\s*\{\s*"text"\s*:\s*"([^"\\]*(\\.[^"\\]*)*)"/, // caption.text
          /"text"\s*:\s*"([^"\\]*(\\.[^"\\]*)*)"\s*,?\s*"pk"/, // text field near pk
          /"edge_media_to_caption"\s*:\s*\{\s*"edges"\s*:\s*\[\s*\{\s*"node"\s*:\s*\{\s*"text"\s*:\s*"([^"\\]*(\\.[^"\\]*)*)"/
        ];

        for (const pattern of patterns) {
          const captionMatch = content.match(pattern);
          if (!captionMatch) {
            continue;
          }

          // Group 1 is the still-escaped body of the JSON string literal.
          const captionText = decodeJsonStringBody(captionMatch[1] || '');

          if (captionText.length > 130) {
            console.log(
              `[Extractor] Extracted caption from regex pattern, length: ${captionText.length}`
            );
            const thumbnail = await extractThumbnailStealth(page, progressCallback);
            return { bodyText: cleanText(captionText), thumbnail };
          }
        }
      }
    }

    return null;
  } catch (error) {
    logError('[Extractor] Failed to extract from embedded JSON', error);
    return null;
  }
}

/**
 * Parse a caption out of an Instagram data object.
 *
 * Looks for the classic `entry_data.PostPage[0].graphql.shortcode_media`
 * structure first. When that is absent, `data.items` or
 * `data.data.shortcode_media` is handed to `extractFromAlternativeStructure`.
 *
 * @param data - Parsed JSON of unknown shape.
 * @returns The cleaned caption, or `null` when no caption text is present or
 *   parsing fails (failures are logged, never thrown).
 */
function parseInstagramData(data: any): Omit<ExtractedContent, 'thumbnail'> | null {
  try {
    const media = data?.entry_data?.PostPage?.[0]?.graphql?.shortcode_media;

    if (!media) {
      // Try alternative structures.
      const items = data?.items || data?.data?.shortcode_media;
      return items ? extractFromAlternativeStructure(items) : null;
    }

    // Join the text of every caption edge. Malformed edges are skipped so a
    // single bad entry does not discard the whole caption.
    const captionEdges: unknown = media.edge_media_to_caption?.edges;
    const bodyText = (Array.isArray(captionEdges) ? captionEdges : [])
      .map((edge) => edge?.node?.text)
      .filter((text): text is string => typeof text === 'string')
      .join('\n');

    if (!bodyText) {
      return null;
    }

    return { bodyText: cleanText(bodyText) };
  } catch (error) {
    logError('[Extractor] Failed to parse Instagram data structure', error);
    return null;
  }
}

/**
 * Parse a caption out of alternative Instagram data structures.
 *
 * Accepts either a single media object or an array of them (only the first
 * element is used). The caption is read from `caption.text` or, failing that,
 * from the first `edge_media_to_caption` edge.
 *
 * @param items - Media object, or array of media objects, of unknown shape.
 * @returns The cleaned caption, or `null` when none is present or parsing
 *   fails (failures are logged, never thrown).
 */
function extractFromAlternativeStructure(items: any): Omit<ExtractedContent, 'thumbnail'> | null {
  try {
    const item = Array.isArray(items) ? items[0] : items;

    const caption = item?.caption?.text || item?.edge_media_to_caption?.edges?.[0]?.node?.text;

    return caption ? { bodyText: cleanText(caption) } : null;
  } catch (error) {
    logError('[Extractor] Failed to parse alternative structure', error);
    return null;
  }
}

/**
 * Strategy 2.5: Extract the caption by finding the span that looks most like
 * recipe content.
 *
 * Instagram uses obfuscated class names, so the caption span is identified by
 * scoring every candidate span on:
 * - number of `<br>` tags (primary signal, 100 points each);
 * - presence of a recipe keyword from `RECIPE_KEYWORDS` (+500);
 * - more than two `<a>` tags for mentions/hashtags (10 points each);
 * - text length (up to +200);
 * - a `line-height` style attribute (+30);
 * with penalties for UI text and short audio/music credits.
 *
 * @param page - Playwright page showing the post.
 * @param progressCallback - Optional progress listener, forwarded to the
 *   thumbnail extraction.
 * @param targetUrl - URL of the post that should be on screen. When given, the
 *   page URL must carry the same shortcode or extraction is abandoned.
 * @returns The extracted content, or `null` when the page is not the target
 *   post, no caption span is found, or an error occurs (errors are logged).
 */
export async function extractFromHTMLSection(
  page: Page,
  progressCallback?: ProgressCallback,
  targetUrl?: string
): Promise<ExtractedContent | null> {
  try {
    console.log('[Extractor] Waiting for page content to load...');

    // Validate we're on the correct page.
    const currentUrl = page.url();
    const targetShortcode = targetUrl ? extractShortcode(targetUrl) : null;
    const currentShortcode = extractShortcode(currentUrl);

    console.log(`[Extractor] Current page URL: ${currentUrl}`);
    console.log(
      `[Extractor] Target shortcode: ${targetShortcode}, Current shortcode: ${currentShortcode}`
    );

    if (targetShortcode && currentShortcode !== targetShortcode) {
      console.log(`[Extractor] URL mismatch: expected ${targetShortcode}, got ${currentShortcode}`);
      return null;
    }

    console.log(`[Extractor] Confirmed on correct post: ${currentShortcode}`);

    // Wait for the DOM to be parsed, then give late-loading content a moment.
    await page.waitForLoadState('domcontentloaded', { timeout: TIMEOUTS.PAGE_LOAD });
    await page.waitForTimeout(TIMEOUTS.NETWORK_SETTLE);

    // Since the URL was validated above, the first article/main container
    // should be the target post; try to expand its truncated caption.
    await tryExpandCaptionInHTMLSection(page);

    console.log('[Extractor] Extracting caption using intelligent span detection...');

    const result = await page.evaluate(
      ({ shortcode, recipeKeywords }) => {
        // Instagram loads multiple posts, so prefer spans that live in a
        // container linked to the target shortcode.
        const postLinks = shortcode
          ? Array.from(document.querySelectorAll(`a[href*="/${shortcode}"]`))
          : [];
        console.log(`[Extractor] Found ${postLinks.length} links to target post ${shortcode}`);

        const searchRoots: Element[] = [];
        for (const link of postLinks) {
          // The article or section container for this post.
          const container =
            link.closest('article') || link.closest('section') || link.closest('[role="main"]');
          if (container && !searchRoots.includes(container)) {
            searchRoots.push(container);
            console.log(`[Extractor] Found container for target post`);
          }
        }

        // Fallback: search the whole document.
        if (searchRoots.length === 0) {
          console.log(`[Extractor] No specific container found, searching whole document`);
          searchRoots.push(document.body);
        }

        const spans: HTMLElement[] = [];
        for (const root of searchRoots) {
          root.querySelectorAll('span').forEach((span) => spans.push(span));
        }

        console.log(`[Extractor] Searching ${spans.length} spans for recipe content`);

        let bestCandidate: CaptionCandidate | null = null;

        for (const span of spans) {
          const text = (span.textContent || '').toLowerCase();
          const innerHTML = span.innerHTML || '';

          // Skip empty or very short spans.
          if (text.length < 30) continue;

          // <br> tags are the most reliable indicator of recipe formatting.
          const brCount = (innerHTML.match(/<br\s*\/?>/gi) || []).length;

          let score = brCount * 100;

          // Recipe keywords are a strong indicator.
          if (recipeKeywords.some((keyword) => text.includes(keyword))) {
            score += 500;
          }

          // Captions contain links for hashtags/mentions.
          const linkCount = span.querySelectorAll('a').length;
          if (linkCount > 2) {
            score += linkCount * 10;
          }

          // Longer text is better, up to a cap.
          score += Math.min(text.length / 5, 200);

          // Caption spans usually carry a line-height style.
          const style = span.getAttribute('style') || '';
          if (style.includes('line-height')) {
            score += 30;
          }

          // Penalize UI elements.
          if (/^(follow|following|liked by|view all|more posts|comments)/i.test(text)) {
            score -= 500;
          }

          // Penalize short audio/music credits.
          if (/·|papaoutai|afro soul/i.test(text) && text.length < 100) {
            score -= 200;
          }

          if (score > 0 && (!bestCandidate || score > bestCandidate.score)) {
            console.log(
              `[Extractor] New best: score=${score}, len=${text.length}, br=${brCount}, links=${linkCount}, preview="${text.substring(0, 80)}..."`
            );
            bestCandidate = {
              element: span,
              text: span.textContent || '',
              score: score,
              innerHTML: innerHTML,
              brCount: brCount
            };
          }
        }

        if (!bestCandidate) {
          return {
            success: false,
            error: 'No suitable caption span found',
            text: ''
          };
        }

        console.log(
          `[Extractor] Final caption candidate: score=${bestCandidate.score}, length=${bestCandidate.text.length}`
        );

        // Return the innerHTML so <br> tags survive; cleanText turns them
        // into newlines.
        const captionHtml = bestCandidate.innerHTML;

        return {
          success: true,
          text: captionHtml,
          score: bestCandidate.score,
          length: captionHtml.length,
          htmlPreview: captionHtml.substring(0, 500)
        };
      },
      { shortcode: currentShortcode ?? null, recipeKeywords: RECIPE_KEYWORDS }
    );

    console.log(`[Extractor] HTML Section result:`, {
      success: result.success,
      textLength: result.length,
      score: result.score
    });

    if (result.htmlPreview) {
      console.log('[Extractor] HTML preview (first 500 chars):');
      console.log(result.htmlPreview);
    }

    if (!result.success) {
      console.log(`[Extractor] ${result.error}`);
      return null;
    }

    const captionText = result.text;

    if (!captionText || captionText.length === 0) {
      console.log('[Extractor] No text extracted from HTML section');
      return null;
    }

    const thumbnail = await extractThumbnailStealth(page, progressCallback);

    return {
      bodyText: cleanText(captionText),
      thumbnail
    };
  } catch (error) {
    logError('[Extractor] Failed to extract from HTML section', error);
    return null;
  }
}

/**
 * Strategy 3: Extract the caption from the DOM using specific selectors.
 *
 * Flow:
 * 1. Wait for the page to settle (network idle, `article` element, extra
 *    delay); timeouts here are tolerated.
 * 2. Listen briefly for GraphQL / API responses; a caption longer than 130
 *    characters found there is returned immediately.
 * 3. Otherwise expand the caption via "more" buttons and pick the longest
 *    text found in the article, data attributes, hidden elements, or around a
 *    truncated element.
 * 4. Fall back to the `og:description` meta tag with its
 *    "N likes, M comments - user on date:" prefix removed.
 *
 * @param page - Playwright page showing the post.
 * @param progressCallback - Optional progress listener, forwarded to the
 *   thumbnail extraction.
 * @returns The extracted content, or `null` when no caption is found or an
 *   error occurs (errors are logged, never thrown).
 */
export async function extractFromDOM(
  page: Page,
  progressCallback?: ProgressCallback
): Promise<ExtractedContent | null> {
  try {
    // Give Instagram more time to load dynamic content.
    console.log('[Extractor] Waiting for network idle...');
    await page.waitForLoadState('networkidle', { timeout: TIMEOUTS.PAGE_LOAD }).catch(() => {
      console.log('[Extractor] Network idle timeout, continuing anyway');
    });

    // Try to wait for article content; its absence is not fatal.
    await page.waitForSelector('article', { timeout: TIMEOUTS.ARTICLE_SELECTOR }).catch(() => {});

    // Additional wait for dynamic content.
    await page.waitForTimeout(TIMEOUTS.NETWORK_SETTLE);

    // Try to intercept GraphQL responses. The holder object lets the listener
    // publish its result to this scope.
    const intercepted: { caption: string | null } = { caption: null };
    const onResponse = async (response: {
      url(): string;
      // eslint-disable-next-line @typescript-eslint/no-explicit-any -- payload shape is unknown
      json(): Promise<any>;
    }): Promise<void> => {
      const url = response.url();
      if (!url.includes('graphql') && !url.includes('api/v1')) {
        return;
      }
      try {
        const json = await response.json();
        const captionData = extractCaptionFromGraphQL(json);
        if (captionData && captionData.length > 130) {
          intercepted.caption = captionData;
          console.log(`[Extractor] Intercepted GraphQL response with ${captionData.length} chars`);
        }
      } catch (e) {
        // Not JSON or parsing failed; this response is simply not usable.
      }
    };

    page.on('response', onResponse);
    try {
      await page.waitForTimeout(TIMEOUTS.GRAPHQL_WAIT);
    } finally {
      // Detach the listener so repeated calls on the same page do not
      // accumulate handlers.
      page.off('response', onResponse);
    }

    if (intercepted.caption) {
      const thumbnail = await extractThumbnailStealth(page, progressCallback);
      return { bodyText: cleanText(intercepted.caption), thumbnail };
    }

    // Try to expand truncated captions by clicking "more" buttons.
    await tryExpandCaptionInDOM(page);

    const captionText = await page.evaluate(() => {
      // Read og:description up front: logged for comparison, used as fallback.
      const metaDesc = document.querySelector('meta[property="og:description"]');
      const ogContent = metaDesc?.getAttribute('content') || '';
      console.log(`[Extractor] og:description length: ${ogContent.length}`);
      if (ogContent.length > 200) {
        console.log(`[Extractor] og:description preview: ${ogContent.substring(0, 200)}...`);
      }

      const articleElements = Array.from(
        document.querySelectorAll('article span, article div, article h1')
      );

      let longestText = '';
      let matchedElement: Element | null = null;

      // Strategy 1: longest text among article elements, skipping UI labels.
      for (const element of articleElements) {
        const text = element.textContent?.trim() || '';

        if (/^(follow|like|comment|share|view all|load more|add a comment)$/i.test(text)) {
          continue;
        }

        if (text.length > longestText.length) {
          longestText = text;
          matchedElement = element;
        }
      }

      // Strategy 2: caption stored in data attributes.
      const elementsWithData = Array.from(
        document.querySelectorAll('[data-caption], [data-text], [data-content]')
      );
      for (const el of elementsWithData) {
        const dataCaption =
          el.getAttribute('data-caption') ||
          el.getAttribute('data-text') ||
          el.getAttribute('data-content');
        if (dataCaption && dataCaption.length > longestText.length) {
          longestText = dataCaption;
          console.log(`[Extractor] Found data attribute with ${dataCaption.length} chars`);
        }
      }

      // Strategy 3: hidden/collapsed content.
      const hiddenElements = Array.from(
        document.querySelectorAll(
          '[style*="display: none"], [style*="display:none"], .collapsed, [aria-hidden="true"]'
        )
      );
      for (const el of hiddenElements) {
        const text = el.textContent?.trim() || '';
        if (text.length > longestText.length && text.length > 200) {
          longestText = text;
          console.log(`[Extractor] Found hidden element with ${text.length} chars`);
        }
      }

      // Strategy 4: the best text looks truncated, so look around the element
      // found in Strategy 1 for the full version.
      if (matchedElement && longestText.endsWith('...')) {
        const parent = matchedElement.parentElement;
        if (parent) {
          const parentText = parent.textContent?.trim() || '';
          if (parentText.length > longestText.length) {
            longestText = parentText;
            console.log(
              `[Extractor] Found fuller text in parent element: ${parentText.length} chars`
            );
          }
        }

        // Append substantial text from up to five following siblings.
        let sibling = matchedElement.nextElementSibling;
        let siblingCount = 0;
        while (sibling && siblingCount < 5) {
          const siblingText = sibling.textContent?.trim() || '';
          if (siblingText.length > 50) {
            longestText = longestText + ' ' + siblingText;
            console.log(`[Extractor] Found continuation in sibling: ${siblingText.length} chars`);
          }
          sibling = sibling.nextElementSibling;
          siblingCount++;
        }
      }

      if (longestText && longestText.length > 100) {
        console.log(`[Extractor] Best extraction: ${longestText.length} chars`);
        return longestText;
      }

      // Fallback to og:description, minus the likes/comments/author prefix.
      if (metaDesc) {
        const cleanedContent = ogContent.replace(
          /^\d+K?\s+likes,\s+\d+\s+comments\s+-\s+[\w.]+\s+on\s+[^:]+:\s*["']?/,
          ''
        );
        console.log('[Extractor] DOM selector fallback: og:description (with metadata cleanup)');
        return cleanedContent;
      }

      return null;
    });

    if (!captionText) {
      return null;
    }

    const thumbnail = await extractThumbnailStealth(page, progressCallback);

    return {
      bodyText: cleanText(captionText),
      thumbnail
    };
  } catch (error) {
    logError('[Extractor] Failed to extract from DOM', error);
    return null;
  }
}

/**
 * Extract the caption via Instagram's GraphQL API.
 *
 * Opens a temporary page in `context`, POSTs the post's shortcode to the
 * GraphQL query endpoint, and reads the first caption edge from the response.
 * The temporary page is always closed, including when the request or the
 * response parsing throws.
 *
 * @param url - Instagram post URL; must contain a recognisable shortcode.
 * @param context - Browser context used to create the temporary page.
 * @returns The cleaned caption with a `null` thumbnail, or `null` when the
 *   shortcode is missing, the request fails, the response has no caption, or
 *   an error occurs (errors are logged, never thrown).
 */
async function extractViaGraphQL(
  url: string,
  context: BrowserContext
): Promise<ExtractedContent | null> {
  const shortcode = extractShortcode(url);
  if (!shortcode) {
    console.warn('Could not extract shortcode from URL:', url);
    return null;
  }

  let page: Page | undefined;
  try {
    page = await context.newPage();

    const response = await page.request.post('https://www.instagram.com/graphql/query/', {
      form: {
        variables: JSON.stringify({ shortcode }),
        doc_id: '7950326061742207' // May need periodic updates
      }
    });

    if (!response.ok()) {
      console.warn(`GraphQL request failed: ${response.status()}`);
      return null;
    }

    const data = await response.json();

    const media = data?.data?.shortcode_media;
    if (!media) {
      return null;
    }

    const bodyText = media.edge_media_to_caption?.edges?.[0]?.node?.text || '';
    if (!bodyText) {
      return null;
    }

    return {
      bodyText: cleanText(bodyText),
      thumbnail: null // GraphQL doesn't easily provide thumbnail, would need page context
    };
  } catch (error) {
    logError('[Extractor] GraphQL extraction failed', error);
    return null;
  } finally {
    // Always release the temporary page; a failure to close must not mask
    // the result computed above.
    await page?.close().catch((closeError: unknown) => {
      logError('[Extractor] GraphQL extraction failed', closeError);
    });
  }
}

/**
 * Strategy 4: Legacy extraction method (fallback)
 *
 * Reads the page's visible text, drops the first six lines, cuts everything
 * from the "More posts from" marker onwards, then strips @mentions and
 * #hashtags.
 *
 * @param page - Playwright page whose `document.body.innerText` is read
 * @returns The remaining text (may be empty)
 */
async function extractCleanTextLegacy(page: Page): Promise<string> {
  const rawText = await page.evaluate(() => document.body.innerText);

  const visibleText = rawText
    .replace(/^(?:.*\n){6}/, '') // Remove first 6 lines
    .split('More posts from')[0] // Cut at "More posts from"
    .trim();

  // Remove mentions and hashtags.
  // Mentions may contain inner dots (@chef.anna); hashtags may contain
  // non-ASCII letters (#crêpes). Plain \w+ would leave ".anna" / "êpes" behind.
  return visibleText
    .replace(/@\w+(?:\.\w+)*/g, '')
    .replace(/#[\p{L}\p{M}\p{N}_]+/gu, '');
}
|
|
|
|
/**
 * Strategy 5: Extract from Instagram's internal state/cache
 *
 * Probes a fixed list of `window` globals, serializes each one that is present
 * inside the page, and then tries to find a caption in every candidate (first
 * with `parseInstagramData`, then with `deepSearchForCaption`).
 *
 * Globals whose JSON form exceeds 500,000 characters, or that cannot be
 * serialized at all, are skipped: a truncated JSON string can never be parsed,
 * so cutting it would only guarantee a failure.
 *
 * @param page - Playwright page that has already loaded the post
 * @param progressCallback - Optional progress callback forwarded to thumbnail extraction
 * @returns Caption (more than 130 chars) plus thumbnail, or null when no
 *   candidate yields a full caption or an error occurs
 */
async function extractFromInternalState(
  page: Page,
  progressCallback?: ProgressCallback
): Promise<ExtractedContent | null> {
  try {
    const stateCandidates = await page.evaluate(() => {
      // Try to access Instagram's internal React/Apollo cache
      const possibleKeys = [
        '_sharedData',
        '__PRIVATE_STATE__',
        '__additionalData',
        '__initialData',
        '__RELAY_STORE__'
      ];
      const maxSerializedLength = 500000; // Limit to 500KB per global

      const found: Array<{ key: string; data: string }> = [];

      for (const key of possibleKeys) {
        const data = (window as any)[key];
        if (!data) {
          continue;
        }

        try {
          const serialized: string | undefined = JSON.stringify(data);
          if (typeof serialized !== 'string' || serialized.length > maxSerializedLength) {
            // Not serializable, or too large to transfer whole; never truncate.
            continue;
          }
          console.log(`[Extractor] Found internal state: ${key}`);
          found.push({ key, data: serialized });
        } catch (e) {
          // e.g. circular structure: skip this global, keep probing the others
        }
      }

      return found;
    });

    for (const stateData of stateCandidates) {
      console.log(`[Extractor] Parsing internal state from ${stateData.key}`);
      try {
        const parsed = JSON.parse(stateData.data);

        // Try multiple parsing strategies
        let result = parseInstagramData(parsed);

        console.log(`[Extractor] Standard parsing result: ${result?.bodyText?.length || 0} chars`);

        // Debug: log structure
        if (parsed.entry_data) {
          console.log(`[Extractor] Found entry_data with keys:`, Object.keys(parsed.entry_data));
        }
        if (parsed.config) {
          console.log(`[Extractor] Found config`);
        }

        // If standard parsing failed, try deep search for caption text
        if (!result || !result.bodyText || result.bodyText.length <= 130) {
          console.log(`[Extractor] Attempting deep search in ${stateData.key}...`);
          result = deepSearchForCaption(parsed);
          if (result) {
            console.log(`[Extractor] Deep search found: ${result.bodyText.length} chars`);
          } else {
            console.log(`[Extractor] Deep search found no caption`);
          }
        }

        if (result && result.bodyText && result.bodyText.length > 130) {
          console.log(
            `[Extractor] Successfully extracted from ${stateData.key}, length: ${result.bodyText.length}`
          );
          const thumbnail = await extractThumbnailStealth(page, progressCallback);
          return { ...result, thumbnail };
        } else if (result?.bodyText) {
          console.log(
            `[Extractor] Found text in ${stateData.key} but it's truncated (${result.bodyText.length} chars)`
          );
        }
      } catch (e) {
        // A failure on one global must not prevent trying the next one
        console.log(`[Extractor] Failed to parse ${stateData.key}:`, e);
      }
    }

    return null;
  } catch (error) {
    logError('[Extractor] Failed to extract from internal state', error);
    return null;
  }
}
|
|
|
|
/**
 * Deep search for caption text in any nested object structure
 *
 * At each level three shapes are checked, in order: `caption.text`,
 * `edge_media_to_caption.edges[0].node.text`, and a direct `text` string.
 * Only strings longer than 130 characters are accepted. If none matches, the
 * own enumerable properties are searched depth-first.
 *
 * @param obj - Value to inspect; non-objects and null yield null
 * @param maxDepth - Maximum nesting depth to descend into
 * @param currentDepth - Depth of `obj` (used by the recursion)
 * @returns The first cleaned caption found, or null
 */
function deepSearchForCaption(
  obj: any,
  maxDepth = 10,
  currentDepth = 0
): Omit<ExtractedContent, 'thumbnail'> | null {
  if (currentDepth > maxDepth || !obj || typeof obj !== 'object') {
    return null;
  }

  // Look for caption/text fields
  if (obj.caption && typeof obj.caption === 'object' && obj.caption.text) {
    const text = obj.caption.text;
    if (typeof text === 'string' && text.length > 130) {
      return { bodyText: cleanText(text) };
    }
  }

  // Look for edge_media_to_caption pattern
  if (obj.edge_media_to_caption?.edges?.[0]?.node?.text) {
    const text = obj.edge_media_to_caption.edges[0].node.text;
    if (typeof text === 'string' && text.length > 130) {
      return { bodyText: cleanText(text) };
    }
  }

  // Look for direct text field in media items
  if (obj.text && typeof obj.text === 'string' && obj.text.length > 130) {
    // Make sure it's not just a UI label
    if (!obj.text.match(/^(more|less|follow|like|comment|share)$/i)) {
      return { bodyText: cleanText(obj.text) };
    }
  }

  // Recursively search in all own properties.
  // Object.keys never calls a method on `obj`, so it is safe for null-prototype
  // objects and for parsed JSON that has its own "hasOwnProperty" key.
  for (const key of Object.keys(obj)) {
    const result = deepSearchForCaption(obj[key], maxDepth, currentDepth + 1);
    if (result && result.bodyText.length > 130) {
      return result;
    }
  }

  return null;
}
|
|
|
|
/**
 * Extract caption from an intercepted GraphQL response, validating it matches
 * the expected shortcode
 *
 * When `expectedShortcode` is given, the response is accepted only if its JSON
 * serialization contains that shortcode; responses that cannot be serialized
 * (undefined, circular) are treated as non-matching.
 *
 * @param data - Parsed response body
 * @param expectedShortcode - Shortcode of the post being extracted, if known
 * @returns The caption found by `deepSearchForCaption`, or null
 */
function extractCaptionFromGraphQL(data: any, expectedShortcode?: string): string | null {
  // If we have an expected shortcode, verify this GraphQL response is for that content
  if (expectedShortcode) {
    let serialized: string | undefined;
    try {
      // JSON.stringify returns undefined for undefined input and throws on cycles
      serialized = JSON.stringify(data);
    } catch {
      serialized = undefined;
    }

    if (!serialized?.includes(expectedShortcode)) {
      // This GraphQL response is for different content, ignore it
      return null;
    }
  }

  const result = deepSearchForCaption(data);
  return result?.bodyText || null;
}
|
|
|
|
/**
 * Orchestrate extraction strategies
 *
 * Runs each strategy in a fixed order and returns the first one that yields a
 * non-empty `bodyText`. A strategy that throws is logged and skipped.
 *
 * @param url - Post URL
 * @param page - Page that has already loaded the post
 * @param context - Browser context (used by the GraphQL strategy)
 * @param onProgress - Optional progress callback
 * @returns A success result naming the winning method, or a failure result
 */
async function extractWithStrategies(
  url: string,
  page: Page,
  context: BrowserContext,
  onProgress?: ProgressCallback
): Promise<ExtractionResult> {
  type Strategy = {
    name: ExtractionMethod;
    fn: () => Promise<ExtractedContent | null>;
  };

  // Last resort: raw page text plus whatever thumbnail can be found
  const legacyFallback = async (): Promise<ExtractedContent | null> => {
    const text = await extractCleanTextLegacy(page);
    const thumbnail = await extractThumbnailStealth(page, onProgress);
    return { bodyText: text, thumbnail };
  };

  const pipeline: Strategy[] = [
    { name: 'embedded-json', fn: () => extractFromEmbeddedJSON(page, onProgress) },
    { name: 'internal-state', fn: () => extractFromInternalState(page, onProgress) },
    { name: 'html-section', fn: () => extractFromHTMLSection(page, onProgress, url) },
    { name: 'dom-selector', fn: () => extractFromDOM(page, onProgress) },
    { name: 'graphql-api', fn: () => extractViaGraphQL(url, context) },
    { name: 'legacy', fn: legacyFallback }
  ];

  for (const { name, fn } of pipeline) {
    try {
      const attemptMessage = `Trying extraction method: ${getMethodDisplayName(name)}`;
      console.log(`[Extractor] ${attemptMessage}`);
      onProgress?.({
        type: 'method',
        message: attemptMessage,
        method: name,
        timestamp: new Date().toISOString()
      });

      const extracted = await fn();
      if (!extracted || !extracted.bodyText) {
        // Nothing usable from this strategy; move on
        continue;
      }

      const doneMessage = `✓ Success with method: ${getMethodDisplayName(name)}`;
      console.log(`[Extractor] ${doneMessage}`);
      onProgress?.({
        type: 'status',
        message: doneMessage,
        method: name,
        timestamp: new Date().toISOString()
      });

      return { success: true, method: name, data: extracted };
    } catch (error) {
      // A failing strategy must not abort the chain
      logError(`[Extractor] Method ${name} failed`, error);
    }
  }

  return { success: false, error: 'All extraction methods failed' };
}
|
|
|
|
/**
 * Extract text content and thumbnail from a URL using Playwright browser
 * Uses multiple extraction strategies with fallback
 *
 * Flow of one attempt (the attempt is wrapped in `withRetry`):
 * 1. Open a browser context and page, and start listening for JSON responses
 *    whose URL looks like an Instagram API call.
 * 2. Load the URL, wait briefly and scroll to trigger lazy requests.
 * 3. If a caption longer than 130 chars was intercepted, return it.
 * 4. Otherwise run `extractWithStrategies` and return its result.
 *
 * The page and the context are always closed, even when opening the page or
 * closing it fails. Writing `debug_page.txt` is best-effort.
 *
 * @param url - The URL to extract from
 * @param onProgress - Optional callback to receive progress updates
 * @returns Extracted text and thumbnail
 * @throws Error when every extraction strategy fails (after retries)
 */
export async function extractTextAndThumbnail(
  url: string,
  onProgress?: ProgressCallback
): Promise<ExtractedContent> {
  onProgress?.({
    type: 'status',
    message: 'Starting extraction...',
    timestamp: new Date().toISOString()
  });

  return withRetry(
    async () => {
      const authPath = resolveAuthPath();
      const context = await createBrowserContext(authPath);

      try {
        const page = await context.newPage();

        try {
          // Extract shortcode for validation
          const expectedShortcode = extractShortcode(url);
          console.log(`[Extractor] Target shortcode: ${expectedShortcode || 'unknown'}`);

          // Set timeout
          page.setDefaultTimeout(30000);

          // Set up GraphQL response interception BEFORE loading the page
          // This is critical to catch initial network requests during page load.
          // A holder object is used because the value is written from a callback.
          const intercepted: { caption: string | null } = { caption: null };
          page.on('response', async (response) => {
            try {
              const responseUrl = response.url();
              const looksLikeApiCall =
                responseUrl.includes('graphql') ||
                responseUrl.includes('api/v1') ||
                responseUrl.includes('/web/');
              if (!looksLikeApiCall) {
                return;
              }

              const json = await response.json();
              const captionData = extractCaptionFromGraphQL(json, expectedShortcode ?? undefined);
              if (captionData && captionData.length > 130) {
                intercepted.caption = captionData;
                console.log(
                  `[Extractor] ✓ Intercepted GraphQL with full caption: ${captionData.length} chars (shortcode verified)`
                );
              }
            } catch {
              // Not JSON, parse error, or response no longer available: skip
            }
          });

          onProgress?.({
            type: 'status',
            message: 'Loading Instagram page...',
            timestamp: new Date().toISOString()
          });

          await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30000 });

          // Add small human-like delay
          await page.waitForTimeout(1000 + Math.random() * 2000);

          // Try scrolling and waiting to trigger additional GraphQL requests
          console.log('[Extractor] Scrolling to trigger lazy loading...');
          await page.evaluate(() => {
            window.scrollBy(0, 300);
          });
          await page.waitForTimeout(1500);

          await page.evaluate(() => {
            window.scrollBy(0, 300);
          });
          await page.waitForTimeout(1500);

          await page.evaluate(() => {
            window.scrollTo(0, 0);
          });
          await page.waitForTimeout(1000);

          // If we intercepted a full caption, use it immediately
          const interceptedCaption = intercepted.caption;
          if (interceptedCaption) {
            console.log('[Extractor] Using intercepted caption from network traffic');
            const thumbnail = await extractThumbnailStealth(page, onProgress);
            onProgress?.({
              type: 'complete',
              message: 'Extraction completed via GraphQL interception',
              method: 'graphql-intercept',
              timestamp: new Date().toISOString()
            });
            return { bodyText: cleanText(interceptedCaption), thumbnail };
          }

          const result = await extractWithStrategies(url, page, context, onProgress);

          if (!result.success || !result.data) {
            throw new Error(result.error || 'Extraction failed');
          }

          // Save debug content (best-effort: a read-only or missing working
          // directory must not turn a successful extraction into a failure)
          try {
            fs.writeFileSync(
              path.resolve('debug_page.txt'),
              `Method: ${result.method}\n\n${result.data.bodyText}`
            );
          } catch (writeError) {
            logError('[Extractor] Failed to write debug_page.txt', writeError);
          }

          onProgress?.({
            type: 'complete',
            message: 'Extraction completed successfully',
            method: result.method,
            timestamp: new Date().toISOString()
          });

          return result.data;
        } finally {
          await page.close();
        }
      } finally {
        // Runs even if newPage() or page.close() threw, so the context never leaks
        await context.close();
      }
    },
    DEFAULT_RETRY_CONFIG,
    onProgress
  );
}
|
|
|
|
/**
 * Screenshot-based thumbnail extraction (fallback method)
 *
 * Takes a JPEG screenshot of the visible part of the first `<video>` element,
 * or of the current viewport when there is no video or none of it is visible.
 *
 * @param page - Playwright page to capture
 * @returns The screenshot as a `data:image/jpeg;base64,...` URI
 */
async function extractThumbnailScreenshot(page: Page): Promise<string | null> {
  const videoBounds = await page.evaluate(() => {
    const video = document.querySelector('video');
    if (!video) return null;

    // Intersect the element's box with the viewport. Clamping the origin and
    // the size independently would shift or oversize the clip whenever the
    // video is partly off-screen.
    const rect = video.getBoundingClientRect();
    const left = Math.max(0, rect.left);
    const top = Math.max(0, rect.top);
    const right = Math.min(rect.right, window.innerWidth);
    const bottom = Math.min(rect.bottom, window.innerHeight);

    return {
      x: left,
      y: top,
      width: right - left, // <= 0 when the video is entirely off-screen
      height: bottom - top
    };
  });

  let screenshotBuffer: Buffer;

  if (videoBounds && videoBounds.width > 0 && videoBounds.height > 0) {
    screenshotBuffer = await page.screenshot({
      type: 'jpeg',
      quality: 85,
      clip: videoBounds
    });
  } else {
    console.warn('[Thumbnail] Video element not found or has no size, taking full page screenshot');
    screenshotBuffer = await page.screenshot({ type: 'jpeg', quality: 85 });
  }

  return `data:image/jpeg;base64,${screenshotBuffer.toString('base64')}`;
}
|
|
|
|
/**
 * Helper: Fetch image from URL and convert to base64 data URI
 *
 * **Validation Criteria:**
 * - HTTP status must be exactly 200 (not 2xx, only 200)
 * - Content-Type must start with 'image/' (e.g., image/jpeg, image/png, image/webp)
 * - Headers AND body must arrive within 10 seconds in total
 *
 * **Failure Scenarios:**
 * - Non-200 status → Returns null, reports status code via progress callback
 * - Invalid content-type → Returns null, reports content-type via progress callback
 * - Timeout → Returns null, reports timeout via progress callback
 * - Network error → Returns null, reports error message via progress callback
 *
 * The MIME type placed in the data URI is the Content-Type without parameters
 * (`image/png; charset=binary` becomes `image/png`).
 *
 * When this function returns null, the caller is expected to continue with
 * its next thumbnail source.
 *
 * @param imageUrl - The image URL to fetch (the scheme is not validated here)
 * @param progressCallback - Optional callback for progress reporting
 * @returns Base64 data URI (data:image/*;base64,...) or null if validation fails
 *
 * @example
 * ```typescript
 * const thumbnail = await fetchImageAsBase64(
 *   'https://instagram.com/image.jpg',
 *   (event) => console.log(event.message)
 * );
 *
 * if (thumbnail) {
 *   // thumbnail is a valid base64 data URI
 *   console.log(thumbnail.substring(0, 50)); // "data:image/jpeg;base64,/9j/4AAQSkZJRg..."
 * } else {
 *   // URL validation failed, try next method
 * }
 * ```
 */
async function fetchImageAsBase64(
  imageUrl: string,
  progressCallback?: ProgressCallback
): Promise<string | null> {
  // Create abort controller for timeout
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), 10000); // 10s timeout

  try {
    console.log(`[Thumbnail] Validating URL: ${imageUrl}`);

    const response = await fetch(imageUrl, {
      signal: controller.signal
    });

    // Strict status validation: must be exactly 200
    if (response.status !== 200) {
      console.warn(`[Thumbnail] URL validation failed: HTTP ${response.status} for ${imageUrl}`);
      progressCallback?.({
        type: 'status',
        message: `Thumbnail URL returned HTTP ${response.status}, trying next method...`,
        timestamp: new Date().toISOString()
      });
      return null;
    }

    // Validate content-type
    const contentType = response.headers.get('content-type') || '';
    if (!contentType.startsWith('image/')) {
      console.warn(
        `[Thumbnail] URL validation failed: Invalid content-type '${contentType}' for ${imageUrl}`
      );
      progressCallback?.({
        type: 'status',
        message: `Thumbnail URL returned non-image content (${contentType}), trying next method...`,
        timestamp: new Date().toISOString()
      });
      return null;
    }

    console.log(`[Thumbnail] URL validation successful: ${imageUrl} (${contentType})`);

    // The abort timer is still armed here, so a stalled body download is
    // bounded by the same 10s budget as the headers.
    const arrayBuffer = await response.arrayBuffer();
    const buffer = Buffer.from(arrayBuffer);

    // Keep only the media type; header parameters do not belong in the data URI
    const parameterStart = contentType.indexOf(';');
    const mimeType = (
      parameterStart === -1 ? contentType : contentType.slice(0, parameterStart)
    ).trim();

    const base64Data = `data:${mimeType};base64,${buffer.toString('base64')}`;

    progressCallback?.({
      type: 'status',
      message: 'Thumbnail fetched and validated from URL',
      timestamp: new Date().toISOString()
    });

    return base64Data;
  } catch (e) {
    if (e instanceof Error) {
      if (e.name === 'AbortError') {
        console.error(`[Thumbnail] URL fetch timeout: ${imageUrl}`);
        progressCallback?.({
          type: 'status',
          message: 'Thumbnail URL fetch timeout, trying next method...',
          timestamp: new Date().toISOString()
        });
      } else {
        console.error(`[Thumbnail] Failed to fetch image from ${imageUrl}:`, e.message);
        progressCallback?.({
          type: 'status',
          message: `Thumbnail URL fetch failed (${e.message}), trying next method...`,
          timestamp: new Date().toISOString()
        });
      }
    } else {
      logError('[Thumbnail] Failed to fetch image', e);
    }
    return null;
  } finally {
    // Always disarm the timer, including when fetch() itself rejected
    clearTimeout(timeoutId);
  }
}
|
|
|
|
/**
 * Read an attribute of the first element matching `selector` without waiting
 * for the element to appear; returns null when there is no such element or
 * attribute.
 */
async function readAttributeWithoutWaiting(
  page: Page,
  selector: string,
  attribute: string
): Promise<string | null> {
  return page.evaluate(
    ({ css, attr }) => document.querySelector(css)?.getAttribute(attr) ?? null,
    { css: selector, attr: attribute }
  );
}

/**
 * Fetch `imageUrl` as a base64 data URI and, on success, emit a 'thumbnail'
 * progress event carrying it; returns null when the fetch is rejected.
 */
async function fetchAndReportThumbnail(
  imageUrl: string,
  successMessage: string,
  progressCallback?: ProgressCallback
): Promise<string | null> {
  const thumbnail = await fetchImageAsBase64(imageUrl, progressCallback);
  if (thumbnail) {
    progressCallback?.({
      type: 'thumbnail',
      message: successMessage,
      data: { thumbnail },
      timestamp: new Date().toISOString()
    });
  }
  return thumbnail;
}

/**
 * Extract thumbnail from Instagram post using stealth techniques
 *
 * Tries multiple sources in order of stealth and returns the first that works:
 * 1. Meta tags (og:image, then twitter:image)
 * 2. Video poster attribute
 * 3. Instagram window data structures (display_url, thumbnail_src)
 * 4. Screenshot fallback
 *
 * Sources 1-3 yield an image URL that is downloaded with `fetchImageAsBase64`,
 * so every method returns a base64 data URI (`data:image/...;base64,...`),
 * never a bare HTTPS URL.
 *
 * Attributes are read without waiting: a post that lacks a tag or a video
 * falls through immediately instead of blocking for the page's default timeout.
 *
 * @param page - Playwright page instance
 * @param progressCallback - Optional progress callback for SSE updates
 * @returns Base64 data URI, or null if all methods fail
 */
async function extractThumbnailStealth(
  page: Page,
  progressCallback?: ProgressCallback
): Promise<string | null> {
  console.log('[Thumbnail] Starting stealth extraction');

  // Method 1: Try meta tags (most stealthy)
  try {
    const metaSources = [
      { selector: 'meta[property="og:image"]', label: 'og:image' },
      { selector: 'meta[name="twitter:image"]', label: 'twitter:image' }
    ];

    for (const { selector, label } of metaSources) {
      const imageUrl = await readAttributeWithoutWaiting(page, selector, 'content');
      if (!imageUrl) {
        continue;
      }

      console.log(`[Thumbnail] Found ${label} meta tag`);
      const thumbnail = await fetchAndReportThumbnail(
        imageUrl,
        'Thumbnail extracted from meta tags',
        progressCallback
      );
      if (thumbnail) {
        return thumbnail;
      }
    }
  } catch (e) {
    logError('[Thumbnail] Meta tag method failed', e);
  }

  // Method 2: Try video poster attribute
  try {
    const poster = await readAttributeWithoutWaiting(page, 'video', 'poster');
    if (poster) {
      console.log('[Thumbnail] Found video poster attribute');
      const thumbnail = await fetchAndReportThumbnail(
        poster,
        'Thumbnail extracted from video poster',
        progressCallback
      );
      if (thumbnail) {
        return thumbnail;
      }
    }
  } catch (e) {
    logError('[Thumbnail] Video poster method failed', e);
  }

  // Method 3: Try Instagram window data structures
  try {
    const thumbnailUrl = await page.evaluate(() => {
      // Check for Instagram's internal data structures
      const data = (window as any).__additionalDataLoaded;
      if (data) {
        // Navigate through Instagram's data structure
        for (const key in data) {
          const item = data[key];
          if (item?.graphql?.shortcode_media?.display_url) {
            return item.graphql.shortcode_media.display_url;
          }
          if (item?.graphql?.shortcode_media?.thumbnail_src) {
            return item.graphql.shortcode_media.thumbnail_src;
          }
        }
      }
      return null;
    });

    if (thumbnailUrl) {
      console.log('[Thumbnail] Found thumbnail in Instagram data structures');
      const thumbnail = await fetchAndReportThumbnail(
        thumbnailUrl,
        'Thumbnail extracted from Instagram data',
        progressCallback
      );
      if (thumbnail) {
        return thumbnail;
      }
    }
  } catch (e) {
    logError('[Thumbnail] Instagram data method failed', e);
  }

  // Method 4: Screenshot fallback (existing method).
  // Guarded like the other methods: a failed screenshot must not discard a
  // caption the caller has already extracted.
  console.log('[Thumbnail] Falling back to screenshot method');
  try {
    const screenshotThumbnail = await extractThumbnailScreenshot(page);
    if (screenshotThumbnail) {
      progressCallback?.({
        type: 'thumbnail',
        message: 'Thumbnail extracted via screenshot',
        data: { thumbnail: screenshotThumbnail },
        timestamp: new Date().toISOString()
      });
    }
    return screenshotThumbnail;
  } catch (e) {
    logError('[Thumbnail] Screenshot method failed', e);
    return null;
  }
}
|