/*
 * Repository page header captured with this file:
 *
 * CI status: some checks failed.
 * Build & Push Docker Image / test-and-build (push): failing after 33s.
 *
 * Commit message:
 * Instagram's GraphQL API silently truncates captions WITHOUT '….' markers.
 * Both DWWxiymssxE (393 chars full, 327 from API) and DXT73izCBoH (744+ chars
 * full, cut mid-sentence) were affected. Remove the GraphQL-interception
 * shortcut entirely. Always use DOM extraction (HTML Section), which clicks
 * '… more' to get the complete text. The intercepted GraphQL caption is kept
 * only as an emergency fallback if all DOM strategies fail.
 * Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
 *
 * File: 1733 lines, 50 KiB, TypeScript.
 */
import { createBrowserContext } from './browser';
|
|
import { logError } from './utils/logger';
|
|
import fs from 'fs';
|
|
import path from 'path';
|
|
import type { Page, BrowserContext } from 'playwright';
|
|
|
|
/**
 * Content produced by an extraction strategy for a single post.
 */
export interface ExtractedContent {
  /** Caption text of the post. */
  bodyText: string;
  /** Thumbnail for the post, or `null` when none was obtained. */
  thumbnail: string | null;
}
|
|
|
|
/**
 * Identifier of an extraction strategy.
 */
export type ExtractionMethod =
  | 'embedded-json'
  | 'internal-state'
  | 'html-section'
  | 'dom-selector'
  | 'graphql-api'
  | 'graphql-intercept'
  | 'legacy';
|
|
|
|
/**
 * A DOM element considered as the possible caption of a post, together with
 * the values used to rank it against other candidates.
 */
type CaptionCandidate = {
  /** The element the candidate was read from. */
  element: Element;
  /** Text content of the element. */
  text: string;
  /** Ranking score; a higher score wins. */
  score: number;
  /** Raw inner HTML of the element. */
  innerHTML: string;
  /** Number of `<br>` tags found in `innerHTML`. */
  brCount: number;
};
|
|
|
|
/**
 * Kind of progress notification delivered to a progress callback.
 */
export type ProgressEventType =
  | 'status'
  | 'method'
  | 'retry'
  | 'error'
  | 'thumbnail'
  | 'complete'
  | 'model_loading';
|
|
|
|
/**
 * A single progress notification emitted while an extraction is running.
 */
export interface ProgressEvent {
  /** What kind of event this is. */
  type: ProgressEventType;
  /** Human-readable description of the event. */
  message: string;
  /** Extraction strategy the event refers to, when applicable. */
  method?: ExtractionMethod;
  /** 1-based number of the attempt that triggered the event (set on retry events). */
  attemptNumber?: number;
  /** Total number of attempts allowed (set on retry events). */
  maxAttempts?: number;
  /** Optional event-specific payload. */
  data?: any;
  /** Time the event was created; emitters in this file use ISO-8601 strings. */
  timestamp?: string;
}
|
|
|
|
/**
 * Function invoked with each progress event; its return value is ignored.
 */
export type ProgressCallback = (event: ProgressEvent) => void;
|
|
|
|
/**
 * Outcome of running one extraction strategy.
 */
interface ExtractionResult {
  /** Whether the strategy produced content. */
  success: boolean;
  /** Strategy that was run. */
  method?: ExtractionMethod;
  /** Extracted content, when available. */
  data?: ExtractedContent;
  /** Failure description, when available. */
  error?: string;
}
|
|
|
|
/**
 * Shape of the embedded page data read when parsing `window._sharedData`.
 * Every level is optional because the payload is not guaranteed to contain it.
 */
interface InstagramEmbeddedData {
  entry_data?: {
    PostPage?: Array<{
      graphql?: {
        shortcode_media?: {
          /** Caption edges; each node carries one piece of caption text. */
          edge_media_to_caption?: {
            edges?: Array<{ node: { text: string } }>;
          };
          display_url?: string;
          video_url?: string;
          owner?: {
            username: string;
            profile_pic_url: string;
          };
        };
      };
    }>;
  };
}
|
|
|
|
/**
 * Settings for the retry-with-backoff wrapper.
 */
interface RetryConfig {
  /** Maximum number of times the operation is invoked. */
  maxAttempts: number;
  /** Delay before the first retry, in milliseconds. */
  initialDelayMs: number;
  /** Upper bound for the delay between attempts, in milliseconds. */
  maxDelayMs: number;
  /** Factor the delay is multiplied by after each failed attempt. */
  backoffMultiplier: number;
}

/**
 * Retry settings used when the caller does not supply its own:
 * three attempts, starting at 1s and doubling, capped at 10s.
 */
const DEFAULT_RETRY_CONFIG: RetryConfig = {
  maxAttempts: 3,
  initialDelayMs: 1000,
  maxDelayMs: 10000,
  backoffMultiplier: 2
};
|
|
|
|
/**
 * Resolve the path of the stored authentication file.
 *
 * The Docker path is checked first, then the local relative path.
 *
 * @returns The first candidate that exists on disk, or `undefined` when
 *   neither exists.
 */
function resolveAuthPath(): string | undefined {
  // Order matters: the Docker location takes precedence over the local one.
  const candidates = ['/app/secrets/auth.json', './secrets/auth.json'];
  return candidates.find((candidate) => fs.existsSync(candidate));
}
|
|
|
|
/**
 * Sleep utility for retry logic.
 *
 * @param ms Number of milliseconds to wait.
 * @returns A promise that resolves (with no value) once the delay has elapsed.
 */
async function sleep(ms: number): Promise<void> {
  await new Promise<void>((resolve) => {
    setTimeout(resolve, ms);
  });
}
|
|
|
|
/**
 * Check whether an error should not be retried.
 *
 * Authentication failures and invalid URLs are treated as permanent. The
 * message is compared case-insensitively so that variants such as
 * "Invalid URL" or "Login required" are recognised as well.
 *
 * @param error The value that was thrown.
 * @returns `true` when `error` is an `Error` whose message contains one of the
 *   non-retriable markers; `false` otherwise (including non-`Error` values).
 */
function isNonRetriableError(error: unknown): boolean {
  if (!(error instanceof Error)) {
    return false;
  }

  const message = error.message.toLowerCase();
  const nonRetriableMarkers = ['authentication', 'login required', 'invalid url'];
  return nonRetriableMarkers.some((marker) => message.includes(marker));
}
|
|
|
|
/**
 * Get the human-readable display name for an extraction method.
 *
 * @param method Extraction method identifier.
 * @returns The label associated with `method`.
 */
function getMethodDisplayName(method: ExtractionMethod): string {
  const displayNames: Record<ExtractionMethod, string> = {
    'embedded-json': 'Embedded JSON',
    'internal-state': 'Internal State',
    'html-section': 'HTML Section',
    'dom-selector': 'DOM Selector',
    'graphql-api': 'GraphQL API',
    'graphql-intercept': 'GraphQL Intercept',
    legacy: 'Legacy Parser'
  };

  return displayNames[method];
}
|
|
|
|
/**
 * Retry wrapper with exponential backoff.
 *
 * Calls `fn` up to `config.maxAttempts` times. After a failed attempt that is
 * not the last one, it logs the failure, emits a `retry` progress event,
 * waits, and multiplies the delay by `config.backoffMultiplier` (capped at
 * `config.maxDelayMs`). Errors classified as non-retriable are reported with
 * an `error` progress event and rethrown immediately.
 *
 * @param fn Operation to run.
 * @param config Retry settings; defaults to `DEFAULT_RETRY_CONFIG`.
 * @param onProgress Optional callback receiving `retry` / `error` events.
 * @returns The value `fn` resolved with.
 * @throws The non-retriable error as thrown, or the last error once all
 *   attempts are used up (a generic `Error` if there is no truthy last error).
 */
async function withRetry<T>(
  fn: () => Promise<T>,
  config: RetryConfig = DEFAULT_RETRY_CONFIG,
  onProgress?: ProgressCallback
): Promise<T> {
  let lastError: Error | null = null;
  let delay = config.initialDelayMs;

  for (let attempt = 1; attempt <= config.maxAttempts; attempt++) {
    try {
      return await fn();
    } catch (error) {
      lastError = error as Error;

      // Permanent failures are surfaced right away instead of being retried.
      if (isNonRetriableError(error)) {
        onProgress?.({
          type: 'error',
          message: `Non-retriable error: ${lastError.message}`,
          timestamp: new Date().toISOString()
        });
        throw error;
      }

      // The final attempt does not wait; the loop ends and the error is thrown below.
      if (attempt >= config.maxAttempts) {
        continue;
      }

      const message = `Attempt ${attempt}/${config.maxAttempts} failed. Retrying in ${delay}ms...`;
      logError(`[Retry] ${message}`, error);

      onProgress?.({
        type: 'retry',
        message,
        attemptNumber: attempt,
        maxAttempts: config.maxAttempts,
        timestamp: new Date().toISOString()
      });

      await sleep(delay);
      delay = Math.min(delay * config.backoffMultiplier, config.maxDelayMs);
    }
  }

  throw lastError || new Error('Max retry attempts exceeded');
}
|
|
|
|
/**
 * Extract the shortcode from an Instagram URL.
 *
 * Recognises `/p/`, `/reel/`, `/reels/` and `/tv/` paths.
 *
 * @param url URL (or path) to inspect.
 * @returns The shortcode that follows the path segment, or `undefined` when
 *   the URL contains none of the recognised segments.
 */
function extractShortcode(url: string): string | undefined {
  const match = /\/(?:p|reel|reels|tv)\/([A-Za-z0-9_-]+)/.exec(url);
  return match ? match[1] : undefined;
}
|
|
|
|
/**
 * Recipe keywords used for caption scoring (Italian and English, lowercase).
 */
const RECIPE_KEYWORDS = [
  'ingredienti',
  'procedimento',
  'preparazione',
  'ricetta',
  'recipe',
  'instructions'
];
|
|
|
|
/**
 * Timeout configuration constants (in milliseconds).
 */
const TIMEOUTS = {
  // Caption expansion in the HTML-section strategy
  CONTENT_LOAD: 1500,
  MORE_BUTTON_VISIBILITY: 1000,
  CAPTION_EXPANSION: 3000,

  // Caption expansion in the DOM strategy
  MORE_BUTTON_VISIBILITY_DOM: 500,
  MORE_BUTTON_CLICK: 800,

  // Page loading and settling
  PAGE_LOAD: 10000,
  NETWORK_SETTLE: 2000,
  ARTICLE_SELECTOR: 5000,
  GRAPHQL_WAIT: 1000,
  PAGE_NAVIGATION: 30000,

  // Bounds for the anti-detection delay
  ANTI_DETECTION_MIN: 1000,
  ANTI_DETECTION_MAX: 3000
} as const;
|
|
|
|
/**
 * Try to expand a truncated caption by clicking the "more" control inside the
 * first post container (HTML-section strategy).
 *
 * Best effort: every failure is logged and swallowed, so the function never
 * rejects. At most one control is clicked.
 *
 * @param page Playwright page currently showing the post.
 */
async function tryExpandCaptionInHTMLSection(page: Page): Promise<void> {
  console.log('[Extractor] Looking for "more" button in primary post container...');
  try {
    await page.waitForTimeout(TIMEOUTS.CONTENT_LOAD);

    const mainContainer = page.locator('article, main, [role="main"]').first();
    if ((await mainContainer.count()) === 0) {
      console.log('[Extractor] No main container found');
      return;
    }

    console.log('[Extractor] Found main post container, searching for "more" button...');

    // Ordered from most to least specific; the first visible match is clicked.
    const morePatterns = [
      {
        locator: mainContainer.locator('span').filter({ hasText: /\.\.\.\s*more/i }),
        desc: "span with '...more'"
      },
      {
        locator: mainContainer.locator('span').filter({ hasText: /…\s*more/i }),
        desc: "span with '… more'"
      },
      {
        locator: mainContainer.locator('div[role="button"]').filter({ hasText: /more/i }),
        desc: "button with 'more'"
      },
      {
        locator: mainContainer.locator('span[role="button"]').filter({ hasText: /more/i }),
        desc: "span button with 'more'"
      }
    ];

    for (const pattern of morePatterns) {
      const count = await pattern.locator.count();
      console.log(`[Extractor] Checking ${pattern.desc}: found ${count}`);

      if (count === 0) continue;

      const firstMore = pattern.locator.first();
      try {
        // locator.isVisible() ignores its `timeout` option and answers
        // immediately; waitFor() is the call that actually waits for visibility.
        await firstMore.waitFor({
          state: 'visible',
          timeout: TIMEOUTS.MORE_BUTTON_VISIBILITY
        });

        const text = await firstMore.textContent();
        console.log(`[Extractor] Found visible "more": "${text}"`);
        await firstMore.click();
        console.log('[Extractor] Clicked "more" - waiting for expansion...');
        await page.waitForTimeout(TIMEOUTS.CAPTION_EXPANSION);
        console.log('[Extractor] Caption expansion complete');
        break;
      } catch (e) {
        // Not visible in time or not clickable: fall through to the next pattern.
        console.log(`[Extractor] ${pattern.desc} not clickable: ${e}`);
      }
    }

    console.log('[Extractor] Finished "more" button expansion attempt');
  } catch (e) {
    console.log(`[Extractor] Error while trying to expand caption: ${e}`);
  }
}
|
|
|
|
/**
 * Try to expand a truncated caption by clicking "more" controls inside the
 * article (DOM strategy).
 *
 * Up to three controls are clicked, one per pass over the selector list; the
 * loop stops as soon as a pass clicks nothing. Failures for an individual
 * selector are ignored and the next selector is tried.
 *
 * @param page Playwright page currently showing the post.
 */
async function tryExpandCaptionInDOM(page: Page): Promise<void> {
  const moreButtonSelectors = [
    'article button:has-text("more")',
    'article button:has-text("More")',
    'article button:has-text("… more")',
    'article span[role="button"]:has-text("more")',
    'article [role="button"]:has-text("more")',
    'article div[role="button"]:has-text("more")',
    'xpath=//article//span[contains(text(), "more")]/..',
    'xpath=//article//button[contains(., "more")]'
  ];

  const maxExpandAttempts = 3;

  for (let expandAttempts = 0; expandAttempts < maxExpandAttempts; expandAttempts++) {
    let clicked = false;

    for (const selector of moreButtonSelectors) {
      try {
        const button = page.locator(selector).first();

        // Skip selectors that match nothing so no time is spent waiting on them.
        if ((await button.count()) === 0) continue;

        // locator.isVisible() ignores its `timeout` option and answers
        // immediately; waitFor() is the call that actually waits for visibility.
        await button.waitFor({
          state: 'visible',
          timeout: TIMEOUTS.MORE_BUTTON_VISIBILITY_DOM
        });

        await button.click();
        await page.waitForTimeout(TIMEOUTS.MORE_BUTTON_CLICK);
        console.log(`[Extractor] Clicked "more" button with selector: ${selector}`);
        clicked = true;
        break;
      } catch {
        // Not visible in time or not clickable: try the next selector.
      }
    }

    // Nothing left to expand.
    if (!clicked) break;
  }
}
|
|
|
|
/**
 * Clean up extracted text: convert `<br>` to newlines, strip remaining HTML
 * tags, decode common HTML entities, drop known Instagram UI phrases,
 * normalise whitespace and remove a trailing run of hashtags.
 *
 * @param text Raw caption text or HTML fragment.
 * @returns The cleaned, trimmed text.
 */
export function cleanText(text: string): string {
  // Convert <br> tags to newlines first so intentional line breaks survive tag stripping.
  let cleaned = text.replace(/<br\s*\/?>/gi, '\n');

  // Strip all other HTML tags while keeping their text content.
  cleaned = cleaned.replace(/<[^>]+>/g, '');

  // Decode HTML entities. `&amp;` goes last so that an escaped entity such as
  // `&amp;lt;` decodes to the literal text `&lt;` rather than to `<`.
  cleaned = cleaned
    .replace(/&lt;/g, '<')
    .replace(/&gt;/g, '>')
    .replace(/&quot;/g, '"')
    .replace(/&(?:#39|#x27|apos);/gi, "'")
    .replace(/&nbsp;/g, ' ')
    .replace(/&amp;/g, '&');

  // Remove common UI text patterns.
  const uiPatterns = [
    /More posts from.+/gi,
    /View all \d+ comments/gi,
    /Add a comment\.\.\./gi,
    /Liked by.+?(?=\n|$)/gi
  ];
  for (const pattern of uiPatterns) {
    cleaned = cleaned.replace(pattern, '');
  }

  // Trim spaces/tabs at the end and start of every line.
  cleaned = cleaned.replace(/[ \t]+$/gm, '');
  cleaned = cleaned.replace(/^[ \t]+/gm, '');

  // Collapse three or more consecutive line breaks into one blank line.
  cleaned = cleaned.replace(/\n\s*\n\s*\n+/g, '\n\n');

  // Remove spaces around newlines.
  cleaned = cleaned.replace(/ *\n */g, '\n');

  // Normalise runs of spaces within a line to a single space.
  cleaned = cleaned.replace(/ {2,}/g, ' ');

  // Remove the run of hashtags at the very end of the text
  // (supports Latin-extended and Cyrillic characters).
  cleaned = cleaned.replace(/(#[\w\u00C0-\u024F\u1E00-\u1EFF\u0400-\u04FF]+\s*)+$/gi, '').trim();

  return cleaned.trim();
}
|
|
|
|
/**
 * Decode the body of a JSON string literal (the text between the quotes).
 *
 * `JSON.parse` handles every JSON escape in the right order. If the captured
 * text is not a valid JSON string body, a manual replacement chain covering
 * `\n`, `\"`, `\uXXXX` and `\\` is used instead.
 */
function decodeEmbeddedJsonString(raw: string): string {
  try {
    const decoded: unknown = JSON.parse(`"${raw}"`);
    if (typeof decoded === 'string') {
      return decoded;
    }
  } catch {
    // Not a valid JSON string body: fall back to the manual decoding below.
  }

  return raw
    .replace(/\\n/g, '\n')
    .replace(/\\"/g, '"')
    .replace(/\\u([0-9a-fA-F]{4})/g, (_, code: string) => String.fromCharCode(parseInt(code, 16)))
    .replace(/\\\\/g, '\\');
}

/**
 * Strategy 1: extract the caption from JSON embedded in the page's script tags.
 *
 * Each script is checked, in document order, for:
 *  1. a `window._sharedData = {...};` assignment,
 *  2. a `window.__additionalDataLoaded(..., {...});` call,
 *  3. a large (> 10000 chars) script mentioning `"caption"` or `"text"`,
 *     parsed as JSON, or searched with regular expressions when that fails.
 * For case 3 only captions longer than 130 characters are accepted.
 *
 * @param page Playwright page currently showing the post.
 * @param progressCallback Optional progress callback, forwarded to the thumbnail extractor.
 * @returns The cleaned caption and thumbnail, or `null` when nothing usable
 *   was found or an error occurred (errors are logged, not thrown).
 */
async function extractFromEmbeddedJSON(
  page: Page,
  progressCallback?: ProgressCallback
): Promise<ExtractedContent | null> {
  try {
    // Collect the contents of every script tag plus a short summary for logging.
    const scriptInfo = await page.evaluate(() => {
      const scripts = Array.from(document.querySelectorAll('script'));
      const scriptData = scripts.map((script) => ({
        type: script.getAttribute('type') || 'no-type',
        hasContent: !!script.textContent,
        length: script.textContent?.length || 0,
        preview: script.textContent?.substring(0, 100) || ''
      }));
      console.log(`[Extractor] Found ${scripts.length} script tags`);
      return {
        contents: scripts.map((script) => script.textContent || ''),
        info: scriptData
      };
    });

    console.log(`[Extractor] Script tags summary:`, scriptInfo.info);

    for (let i = 0; i < scriptInfo.contents.length; i++) {
      const content = scriptInfo.contents[i];

      // Pattern 1: window._sharedData
      const sharedDataMatch = content.match(/window\._sharedData\s*=\s*(\{.+?\});/s);
      if (sharedDataMatch) {
        console.log(`[Extractor] Found _sharedData in script ${i}`);
        try {
          const data: InstagramEmbeddedData = JSON.parse(sharedDataMatch[1]);
          const result = parseInstagramData(data);
          if (result) {
            const thumbnail = await extractThumbnailStealth(page, progressCallback);
            return { ...result, thumbnail };
          }
        } catch (e) {
          logError('[Extractor] Failed to parse _sharedData', e);
        }
      }

      // Pattern 2: window.__additionalDataLoaded
      const additionalDataMatch = content.match(
        /window\.__additionalDataLoaded\([^,]+,\s*(\{.+?\})\);/s
      );
      if (additionalDataMatch) {
        console.log(`[Extractor] Found __additionalDataLoaded in script ${i}`);
        try {
          const data = JSON.parse(additionalDataMatch[1]);
          const result = parseInstagramData(data);
          if (result) {
            const thumbnail = await extractThumbnailStealth(page, progressCallback);
            return { ...result, thumbnail };
          }
        } catch (e) {
          logError('[Extractor] Failed to parse __additionalDataLoaded', e);
        }
      }

      // Pattern 3: any large JSON blob carrying caption data (newer page format)
      const looksLikeCaptionJson =
        (content.includes('"caption"') || content.includes('"text"')) && content.length > 10000;
      if (!looksLikeCaptionJson) {
        continue;
      }

      console.log(
        `[Extractor] Attempting to extract from large JSON in script ${i} (length: ${content.length})`
      );
      try {
        // Try to parse the script as direct JSON.
        const jsonData = JSON.parse(content);

        // Deep search first.
        const deepResult = deepSearchForCaption(jsonData);
        if (deepResult && deepResult.bodyText && deepResult.bodyText.length > 130) {
          console.log(
            `[Extractor] Deep search in JSON found caption: ${deepResult.bodyText.length} chars`
          );
          const thumbnail = await extractThumbnailStealth(page, progressCallback);
          return { ...deepResult, thumbnail };
        }

        // Then the standard structure.
        const result = parseInstagramData(jsonData);
        if (result && result.bodyText && result.bodyText.length > 130) {
          console.log(
            `[Extractor] Successfully extracted from JSON, text length: ${result.bodyText.length}`
          );
          const thumbnail = await extractThumbnailStealth(page, progressCallback);
          return { ...result, thumbnail };
        }
      } catch (e) {
        // Not direct JSON (or handling it failed): look for caption fields with regexes.
        console.log(`[Extractor] JSON parse failed, trying regex extraction...`);

        // Each pattern captures the body of a JSON string literal, escapes included.
        const patterns = [
          /"caption"\s*:\s*\{\s*"text"\s*:\s*"([^"\\]*(\\.[^"\\]*)*)"/, // Escaped quotes
          /"text"\s*:\s*"([^"\\]*(\\.[^"\\]*)*)"\s*,?\s*"pk"/, // text field near pk
          /"edge_media_to_caption"\s*:\s*\{\s*"edges"\s*:\s*\[\s*\{\s*"node"\s*:\s*\{\s*"text"\s*:\s*"([^"\\]*(\\.[^"\\]*)*)"/
        ];

        for (const pattern of patterns) {
          const captionMatch = content.match(pattern);
          if (!captionMatch) {
            continue;
          }

          const captionText = decodeEmbeddedJsonString(captionMatch[1] || '');
          if (captionText.length > 130) {
            console.log(
              `[Extractor] Extracted caption from regex pattern, length: ${captionText.length}`
            );
            const thumbnail = await extractThumbnailStealth(page, progressCallback);
            return { bodyText: cleanText(captionText), thumbnail };
          }
        }
      }
    }

    return null;
  } catch (error) {
    logError('[Extractor] Failed to extract from embedded JSON', error);
    return null;
  }
}
|
|
|
|
/**
 * Parse an Instagram data structure into caption text.
 *
 * Looks for `entry_data.PostPage[0].graphql.shortcode_media` first. When that
 * is missing, `items` or `data.shortcode_media` is handed to the
 * alternative-structure parser.
 *
 * @param data Parsed JSON of unknown shape.
 * @returns The cleaned caption, or `null` when no caption is present or
 *   parsing throws (the error is logged).
 */
function parseInstagramData(data: any): Omit<ExtractedContent, 'thumbnail'> | null {
  try {
    const media = data?.entry_data?.PostPage?.[0]?.graphql?.shortcode_media;

    if (!media) {
      // Alternative structures used by other payload formats.
      const items = data?.items || data?.data?.shortcode_media;
      return items ? extractFromAlternativeStructure(items) : null;
    }

    // The caption may be split over several edges; join them line by line.
    const captionEdges = media.edge_media_to_caption?.edges || [];
    const bodyText = captionEdges.map((edge: any) => edge.node.text).join('\n');

    return bodyText ? { bodyText: cleanText(bodyText) } : null;
  } catch (error) {
    logError('[Extractor] Failed to parse Instagram data structure', error);
    return null;
  }
}
|
|
|
|
/**
 * Parse alternative Instagram data structures.
 *
 * Accepts either a single media object or an array of them (only the first
 * element is used). The caption is read from `caption.text`, falling back to
 * `edge_media_to_caption.edges[0].node.text`.
 *
 * @param items Media object, or array of media objects.
 * @returns The cleaned caption, or `null` when none is found or parsing
 *   throws (the error is logged).
 */
function extractFromAlternativeStructure(items: any): Omit<ExtractedContent, 'thumbnail'> | null {
  try {
    const item = Array.isArray(items) ? items[0] : items;
    const caption = item?.caption?.text || item?.edge_media_to_caption?.edges?.[0]?.node?.text;

    return caption ? { bodyText: cleanText(caption) } : null;
  } catch (error) {
    logError('[Extractor] Failed to parse alternative structure', error);
    return null;
  }
}
|
|
|
|
/**
 * Strategy 2.5: extract the caption by finding the span that looks most like
 * recipe content.
 *
 * Instagram uses obfuscated class names, so the caption span is recognised by
 * its characteristics instead. Each span with at least 30 characters of text
 * is scored:
 *  - +100 per `<br>` tag (primary signal),
 *  - +500 when the text contains a recipe keyword,
 *  - +10 per `<a>` tag when there are more than two,
 *  - + text length / 5, capped at 200,
 *  - +30 when the inline style sets `line-height`,
 *  - -500 when the text starts with a known UI phrase,
 *  - -200 for short audio/music credit lines.
 * The highest-scoring span with a positive score wins; its inner HTML is
 * passed through `cleanText`.
 *
 * @param page Playwright page currently showing the post.
 * @param progressCallback Optional progress callback, forwarded to the thumbnail extractor.
 * @param targetUrl URL of the post being extracted; when it has a shortcode,
 *   the page URL must carry the same shortcode or `null` is returned.
 * @returns The cleaned caption and thumbnail, or `null` when the page does not
 *   match, no suitable span is found, or an error occurs (errors are logged).
 */
export async function extractFromHTMLSection(
  page: Page,
  progressCallback?: ProgressCallback,
  targetUrl?: string
): Promise<ExtractedContent | null> {
  try {
    console.log('[Extractor] Waiting for page content to load...');

    // Validate we're on the correct page.
    const currentUrl = page.url();
    const targetShortcode = targetUrl ? extractShortcode(targetUrl) : null;
    const currentShortcode = extractShortcode(currentUrl);

    console.log(`[Extractor] Current page URL: ${currentUrl}`);
    console.log(
      `[Extractor] Target shortcode: ${targetShortcode}, Current shortcode: ${currentShortcode}`
    );

    if (targetShortcode && currentShortcode !== targetShortcode) {
      console.log(`[Extractor] URL mismatch: expected ${targetShortcode}, got ${currentShortcode}`);
      return null;
    }

    console.log(`[Extractor] Confirmed on correct post: ${currentShortcode}`);

    // Wait for the DOM to be parsed, then give dynamic content a fixed grace period.
    await page.waitForLoadState('domcontentloaded', { timeout: TIMEOUTS.PAGE_LOAD });
    await page.waitForTimeout(TIMEOUTS.NETWORK_SETTLE);

    // The URL was validated above, so the first article/main container is
    // expected to be the target post; expand its caption if it is truncated.
    await tryExpandCaptionInHTMLSection(page);

    console.log('[Extractor] Extracting caption using intelligent span detection...');

    const result = await page.evaluate(
      ({ shortcode, recipeKeywords }) => {
        // Runs in the browser: nothing from module scope is available here,
        // which is why the keyword list is passed in as an argument.

        // Instagram loads several posts, so start from links pointing at our post.
        // Without a shortcode no link can be matched, so no selector is built.
        const postLinks: Element[] = shortcode
          ? Array.from(document.querySelectorAll(`a[href*="/${shortcode}"]`))
          : [];
        console.log(`[Extractor] Found ${postLinks.length} links to target post ${shortcode}`);

        // Containers (article / section / main) that hold a link to the post.
        const searchRoots: Element[] = [];
        for (const link of postLinks) {
          const container =
            link.closest('article') || link.closest('section') || link.closest('[role="main"]');
          if (container && !searchRoots.includes(container)) {
            searchRoots.push(container);
            console.log(`[Extractor] Found container for target post`);
          }
        }

        // Fallback: search the whole document.
        if (searchRoots.length === 0) {
          console.log(`[Extractor] No specific container found, searching whole document`);
          searchRoots.push(document.body);
        }

        const spans: HTMLElement[] = searchRoots.flatMap((root) =>
          Array.from(root.querySelectorAll('span'))
        );

        console.log(`[Extractor] Searching ${spans.length} spans for recipe content`);

        const brTagPattern = /<br\s*\/?>/gi;
        const uiPrefixPattern = /^(follow|following|liked by|view all|more posts|comments)/i;
        const audioCreditPattern = /·|papaoutai|afro soul/i;

        let bestCandidate: CaptionCandidate | null = null;

        for (const span of spans) {
          const text = (span.textContent || '').toLowerCase();
          const innerHTML = span.innerHTML || '';

          // Skip empty or very short spans.
          if (text.length < 30) continue;

          // <br> tags are the PRIMARY signal for recipe formatting.
          const brCount = (innerHTML.match(brTagPattern) || []).length;
          let score = brCount * 100;

          // Recipe keywords are a strong indicator.
          if (recipeKeywords.some((keyword) => text.includes(keyword))) {
            score += 500;
          }

          // Captions carry hashtag / mention links.
          const linkCount = span.querySelectorAll('a').length;
          if (linkCount > 2) {
            score += linkCount * 10;
          }

          // Longer text is better, up to a cap.
          score += Math.min(text.length / 5, 200);

          // Caption spans usually carry an inline line-height.
          if ((span.getAttribute('style') || '').includes('line-height')) {
            score += 30;
          }

          // Penalize UI elements.
          if (uiPrefixPattern.test(text)) {
            score -= 500;
          }

          // Penalize audio/music credits.
          if (audioCreditPattern.test(text) && text.length < 100) {
            score -= 200;
          }

          if (score > 0 && (!bestCandidate || score > bestCandidate.score)) {
            console.log(
              `[Extractor] New best: score=${score}, len=${text.length}, br=${brCount}, links=${linkCount}, preview="${text.substring(0, 80)}..."`
            );
            bestCandidate = {
              element: span,
              text: span.textContent || '',
              score,
              innerHTML,
              brCount
            };
          }
        }

        if (!bestCandidate) {
          return {
            success: false,
            error: 'No suitable caption span found',
            text: ''
          };
        }

        console.log(
          `[Extractor] Final caption candidate: score=${bestCandidate.score}, length=${bestCandidate.text.length}`
        );

        // Return the inner HTML so <br> tags survive; cleanText turns them into newlines.
        const captionHtml = bestCandidate.innerHTML;

        return {
          success: true,
          text: captionHtml,
          score: bestCandidate.score,
          length: captionHtml.length,
          htmlPreview: captionHtml.substring(0, 500)
        };
      },
      { shortcode: currentShortcode, recipeKeywords: RECIPE_KEYWORDS }
    );

    console.log(`[Extractor] HTML Section result:`, {
      success: result.success,
      textLength: result.length,
      score: result.score
    });

    if (result.htmlPreview) {
      console.log('[Extractor] HTML preview (first 500 chars):');
      console.log(result.htmlPreview);
    }

    if (!result.success) {
      console.log(`[Extractor] ${result.error}`);
      return null;
    }

    const captionText = result.text;

    if (!captionText || captionText.length === 0) {
      console.log('[Extractor] No text extracted from HTML section');
      return null;
    }

    const thumbnail = await extractThumbnailStealth(page, progressCallback);

    return {
      bodyText: cleanText(captionText),
      thumbnail
    };
  } catch (error) {
    logError('[Extractor] Failed to extract from HTML section', error);
    return null;
  }
}
|
|
|
|
/**
 * Strategy 3: extract the caption from the DOM using specific selectors.
 *
 * Steps:
 *  1. Wait for the page to settle.
 *  2. Listen for GraphQL / API responses for a short window; if one yields a
 *     caption longer than 130 characters, return it.
 *  3. Otherwise expand the caption and pick the longest text found in the
 *     article, in data attributes, or in hidden elements; when that text ends
 *     with "...", the parent and next siblings are consulted as well.
 *  4. Fall back to the `og:description` meta tag with its
 *     "N likes, M comments - user on date:" prefix removed.
 *
 * @param page Playwright page currently showing the post.
 * @param progressCallback Optional progress callback, forwarded to the thumbnail extractor.
 * @returns The cleaned caption and thumbnail, or `null` when nothing was found
 *   or an error occurred (errors are logged, not thrown).
 */
export async function extractFromDOM(
  page: Page,
  progressCallback?: ProgressCallback
): Promise<ExtractedContent | null> {
  try {
    // Give Instagram more time to load dynamic content.
    console.log('[Extractor] Waiting for network idle...');
    await page.waitForLoadState('networkidle', { timeout: TIMEOUTS.PAGE_LOAD }).catch(() => {
      console.log('[Extractor] Network idle timeout, continuing anyway');
    });

    // Try to wait for article content; absence is tolerated.
    await page.waitForSelector('article', { timeout: TIMEOUTS.ARTICLE_SELECTOR }).catch(() => {});

    // Additional wait for dynamic content.
    await page.waitForTimeout(TIMEOUTS.NETWORK_SETTLE);

    // Holder object: the listener writes to it asynchronously, the code below reads it.
    const intercepted: { caption: string | null } = { caption: null };

    const onResponse = async (response: {
      url(): string;
      json(): Promise<any>;
    }): Promise<void> => {
      const url = response.url();
      if (!(url.includes('graphql') || url.includes('api/v1'))) {
        return;
      }
      try {
        const json = await response.json();
        const captionData = extractCaptionFromGraphQL(json);
        if (captionData && captionData.length > 130) {
          intercepted.caption = captionData;
          console.log(`[Extractor] Intercepted GraphQL response with ${captionData.length} chars`);
        }
      } catch {
        // Not JSON or parsing failed: ignore this response.
      }
    };

    // Listen only for the duration of the wait and always detach afterwards;
    // otherwise every call would leave one more listener on the page.
    page.on('response', onResponse);
    try {
      await page.waitForTimeout(TIMEOUTS.GRAPHQL_WAIT);
    } finally {
      page.off('response', onResponse);
    }

    if (intercepted.caption) {
      const thumbnail = await extractThumbnailStealth(page, progressCallback);
      return { bodyText: cleanText(intercepted.caption), thumbnail };
    }

    // Try to expand truncated captions by clicking the "more" button.
    await tryExpandCaptionInDOM(page);

    const captionText = await page.evaluate(() => {
      // og:description is logged for comparison and used as the last fallback.
      const metaDesc = document.querySelector('meta[property="og:description"]');
      const ogContent = metaDesc?.getAttribute('content') || '';
      console.log(`[Extractor] og:description length: ${ogContent.length}`);
      if (ogContent.length > 200) {
        console.log(`[Extractor] og:description preview: ${ogContent.substring(0, 200)}...`);
      }

      const articleElements = Array.from(
        document.querySelectorAll('article span, article div, article h1')
      );

      let longestText = '';
      let matchedElement: Element | null = null;

      // Strategy 1: the article element with the most text.
      for (const element of articleElements) {
        const text = element.textContent?.trim() || '';

        // Skip elements whose whole text is a UI label.
        if (text.match(/^(follow|like|comment|share|view all|load more|add a comment)$/i)) {
          continue;
        }

        if (text.length > longestText.length) {
          longestText = text;
          matchedElement = element;
        }
      }

      // Strategy 2: data attributes.
      const elementsWithData = Array.from(
        document.querySelectorAll('[data-caption], [data-text], [data-content]')
      );
      for (const el of elementsWithData) {
        const dataCaption =
          el.getAttribute('data-caption') ||
          el.getAttribute('data-text') ||
          el.getAttribute('data-content');
        if (dataCaption && dataCaption.length > longestText.length) {
          longestText = dataCaption;
          console.log(`[Extractor] Found data attribute with ${dataCaption.length} chars`);
        }
      }

      // Strategy 3: hidden/collapsed content.
      const hiddenElements = Array.from(
        document.querySelectorAll(
          '[style*="display: none"], [style*="display:none"], .collapsed, [aria-hidden="true"]'
        )
      );
      for (const el of hiddenElements) {
        const text = el.textContent?.trim() || '';
        if (text.length > longestText.length && text.length > 200) {
          longestText = text;
          console.log(`[Extractor] Found hidden element with ${text.length} chars`);
        }
      }

      // Strategy 4: the text looks truncated, so look around the element found in strategy 1.
      if (matchedElement && longestText.endsWith('...')) {
        const parent = matchedElement.parentElement;
        if (parent) {
          const parentText = parent.textContent?.trim() || '';
          if (parentText.length > longestText.length) {
            longestText = parentText;
            console.log(
              `[Extractor] Found fuller text in parent element: ${parentText.length} chars`
            );
          }
        }

        // Append up to five following siblings that carry substantial text.
        let sibling = matchedElement.nextElementSibling;
        let siblingCount = 0;
        while (sibling && siblingCount < 5) {
          const siblingText = sibling.textContent?.trim() || '';
          if (siblingText.length > 50) {
            longestText = longestText + ' ' + siblingText;
            console.log(`[Extractor] Found continuation in sibling: ${siblingText.length} chars`);
          }
          sibling = sibling.nextElementSibling;
          siblingCount++;
        }
      }

      if (longestText && longestText.length > 100) {
        console.log(`[Extractor] Best extraction: ${longestText.length} chars`);
        return longestText;
      }

      // Fallback to og:description, stripping the leading likes/comments/user metadata.
      if (metaDesc) {
        const cleanedContent = ogContent.replace(
          /^\d+K?\s+likes,\s+\d+\s+comments\s+-\s+[\w.]+\s+on\s+[^:]+:\s*["']?/,
          ''
        );
        console.log('[Extractor] DOM selector fallback: og:description (with metadata cleanup)');
        return cleanedContent;
      }

      return null;
    });

    if (!captionText) {
      return null;
    }

    // Extract thumbnail using existing logic.
    const thumbnail = await extractThumbnailStealth(page, progressCallback);

    return {
      bodyText: cleanText(captionText),
      thumbnail
    };
  } catch (error) {
    logError('[Extractor] Failed to extract from DOM', error);
    return null;
  }
}
|
|
|
|
/**
 * Extract the caption through Instagram's GraphQL query endpoint.
 *
 * Opens a temporary page in `context`, posts the shortcode to
 * `/graphql/query/` and reads the first caption edge of the returned media.
 * The temporary page is always closed, including when the request or the
 * response parsing throws.
 *
 * @param url Post URL; must contain a shortcode.
 * @param context Browser context used to create the temporary page.
 * @returns The cleaned caption with a `null` thumbnail, or `null` when the URL
 *   has no shortcode, the request fails, no caption is present, or an error
 *   occurs (errors are logged, not thrown).
 */
async function extractViaGraphQL(
  url: string,
  context: BrowserContext
): Promise<ExtractedContent | null> {
  const shortcode = extractShortcode(url);
  if (!shortcode) {
    console.warn('Could not extract shortcode from URL:', url);
    return null;
  }

  try {
    const page = await context.newPage();

    try {
      // Make GraphQL request.
      const response = await page.request.post('https://www.instagram.com/graphql/query/', {
        form: {
          variables: JSON.stringify({ shortcode }),
          doc_id: '7950326061742207' // May need periodic updates
        }
      });

      if (!response.ok()) {
        console.warn(`GraphQL request failed: ${response.status()}`);
        return null;
      }

      const data = await response.json();

      // Parse GraphQL response.
      const media = data?.data?.shortcode_media;
      if (!media) {
        return null;
      }

      const bodyText = media.edge_media_to_caption?.edges?.[0]?.node?.text || '';
      if (!bodyText) {
        return null;
      }

      return {
        bodyText: cleanText(bodyText),
        thumbnail: null // GraphQL doesn't easily provide thumbnail, would need page context
      };
    } finally {
      // Runs on every path so the temporary page cannot leak.
      await page.close();
    }
  } catch (error) {
    logError('[Extractor] GraphQL extraction failed', error);
    return null;
  }
}
|
|
|
|
/**
 * Strategy 4: Legacy extraction method (fallback)
 *
 * Works on the page's visible text (`document.body.innerText`): drops the
 * first six lines, keeps only what precedes the first "More posts from",
 * trims the result and finally strips `@mentions` and `#hashtags`.
 *
 * @param page - Page whose body text is read
 * @returns The reduced text (may be an empty string)
 */
async function extractCleanTextLegacy(page: Page): Promise<string> {
  const visibleText = await page.evaluate(() => document.body.innerText);

  // Remove first 6 lines
  const withoutLeadingLines = visibleText.replace(/^(?:.*\n){6}/, '');

  // Cut at "More posts from"
  const beforeMorePosts = withoutLeadingLines.split('More posts from')[0];
  const trimmed = beforeMorePosts.trim();

  // Remove mentions first, then hashtags
  const withoutMentions = trimmed.replace(/@\w+/g, '');
  return withoutMentions.replace(/#\w+/g, '');
}
|
|
|
|
/**
 * Strategy 5: Extract from Instagram's internal state/cache
 *
 * Looks for a fixed list of well-known globals on `window`, serializes each
 * one that is present and tries to find a caption in it, first with
 * `parseInstagramData` and then with `deepSearchForCaption`.
 *
 * Every candidate global is tried in order. A global whose serialized form
 * exceeds the size limit, or that cannot be serialized at all, is skipped
 * rather than truncated: truncated JSON cannot be parsed, so truncation would
 * only turn a large state into a guaranteed failure.
 *
 * @param page - Page to read the globals from
 * @param progressCallback - Optional progress callback, forwarded to the
 *   thumbnail extraction
 * @returns Caption (longer than 130 chars) plus thumbnail, or `null` when no
 *   candidate yields such a caption or an error occurs
 */
async function extractFromInternalState(
  page: Page,
  progressCallback?: ProgressCallback
): Promise<ExtractedContent | null> {
  // Upper bound on the serialized size of a single state object (~500KB)
  const maxStateChars = 500000;

  try {
    const candidates = await page.evaluate((maxChars) => {
      // Try to access Instagram's internal React/Apollo cache
      const possibleKeys = [
        '_sharedData',
        '__PRIVATE_STATE__',
        '__additionalData',
        '__initialData',
        '__RELAY_STORE__'
      ];

      const found: Array<{ key: string; data: string }> = [];
      for (const key of possibleKeys) {
        const value = (window as any)[key];
        if (!value) {
          continue;
        }
        try {
          const serialized = JSON.stringify(value);
          // JSON.stringify yields undefined for functions/symbols
          if (typeof serialized !== 'string' || serialized.length > maxChars) {
            continue;
          }
          console.log(`[Extractor] Found internal state: ${key}`);
          found.push({ key, data: serialized });
        } catch (e) {
          // Circular or otherwise unserializable state: try the next key
        }
      }
      return found;
    }, maxStateChars);

    for (const stateData of candidates) {
      console.log(`[Extractor] Parsing internal state from ${stateData.key}`);
      try {
        const parsed = JSON.parse(stateData.data);

        // Try multiple parsing strategies
        let result = parseInstagramData(parsed);

        console.log(`[Extractor] Standard parsing result: ${result?.bodyText?.length || 0} chars`);

        // Debug: log structure
        if (parsed.entry_data) {
          console.log(`[Extractor] Found entry_data with keys:`, Object.keys(parsed.entry_data));
        }
        if (parsed.config) {
          console.log(`[Extractor] Found config`);
        }

        // If standard parsing failed, try deep search for caption text
        if (!result || !result.bodyText || result.bodyText.length <= 130) {
          console.log(`[Extractor] Attempting deep search in ${stateData.key}...`);
          result = deepSearchForCaption(parsed);
          if (result) {
            console.log(`[Extractor] Deep search found: ${result.bodyText.length} chars`);
          } else {
            console.log(`[Extractor] Deep search found no caption`);
          }
        }

        if (result && result.bodyText && result.bodyText.length > 130) {
          console.log(
            `[Extractor] Successfully extracted from ${stateData.key}, length: ${result.bodyText.length}`
          );
          const thumbnail = await extractThumbnailStealth(page, progressCallback);
          return { ...result, thumbnail };
        } else if (result?.bodyText) {
          console.log(
            `[Extractor] Found text in ${stateData.key} but it's truncated (${result.bodyText.length} chars)`
          );
        }
      } catch (e) {
        // A failure on one candidate must not prevent trying the others
        console.log(`[Extractor] Failed to parse ${stateData.key}:`, e);
      }
    }

    return null;
  } catch (error) {
    logError('[Extractor] Failed to extract from internal state', error);
    return null;
  }
}
|
|
|
|
/**
 * Deep search for caption text in any nested object structure
 *
 * At each object visited, three shapes are checked in order:
 * `caption.text`, `edge_media_to_caption.edges[0].node.text`, and a direct
 * `text` field. The first string longer than 130 characters wins and is
 * returned after `cleanText`. Otherwise the own enumerable properties
 * (object values or array elements) are searched depth-first.
 *
 * @param obj - Value to search; anything that is not a non-null object yields `null`
 * @param maxDepth - Deepest nesting level that is still inspected (root is level 0)
 * @param currentDepth - Nesting level of `obj`; callers normally omit it
 * @returns The first caption found whose cleaned text is longer than 130
 *   characters, or `null`
 */
function deepSearchForCaption(
  obj: any,
  maxDepth = 10,
  currentDepth = 0
): Omit<ExtractedContent, 'thumbnail'> | null {
  if (currentDepth > maxDepth || !obj || typeof obj !== 'object') {
    return null;
  }

  // Known caption locations, in priority order. No UI-label filter
  // ("more", "follow", ...) is needed: such labels are far shorter than the
  // 130-character minimum enforced below.
  const directCandidates: unknown[] = [
    obj.caption && typeof obj.caption === 'object' ? obj.caption.text : undefined,
    obj.edge_media_to_caption?.edges?.[0]?.node?.text,
    obj.text
  ];

  for (const candidate of directCandidates) {
    if (typeof candidate === 'string' && candidate.length > 130) {
      return { bodyText: cleanText(candidate) };
    }
  }

  // Recursively search own enumerable properties. Object.values is used
  // instead of for..in + obj.hasOwnProperty so that data containing a
  // "hasOwnProperty" key (possible in parsed JSON) cannot break the search.
  for (const value of Object.values(obj)) {
    const result = deepSearchForCaption(value, maxDepth, currentDepth + 1);
    // cleanText may shorten the text, so re-check the length
    if (result && result.bodyText.length > 130) {
      return result;
    }
  }

  return null;
}
|
|
|
|
/**
 * Extract caption from an intercepted GraphQL response.
 *
 * When `expectedShortcode` is supplied, the response is only considered if
 * its serialized JSON contains that shortcode anywhere; responses for other
 * content are ignored. The caption itself is located with
 * `deepSearchForCaption`.
 *
 * @param data - Parsed response body
 * @param expectedShortcode - Shortcode the response must mention, if known
 * @returns The caption text, or `null` when the shortcode does not match or
 *   no caption is found
 */
function extractCaptionFromGraphQL(data: any, expectedShortcode?: string): string | null {
  // Reject responses that never mention the target shortcode
  if (expectedShortcode && !JSON.stringify(data).includes(expectedShortcode)) {
    return null;
  }

  const match = deepSearchForCaption(data);
  if (!match) {
    return null;
  }
  return match.bodyText || null;
}
|
|
|
|
/**
 * Orchestrate extraction strategies
 *
 * Runs the strategies one after another in a fixed order (embedded JSON,
 * internal state, HTML section, DOM selector, GraphQL API, legacy) and stops
 * at the first one that yields a non-empty `bodyText`. A strategy that throws
 * is logged and treated like one that found nothing.
 *
 * @param url - Post URL, forwarded to the strategies that need it
 * @param page - Already loaded page
 * @param context - Browser context, used by the GraphQL API strategy
 * @param onProgress - Optional callback notified before each attempt and on success
 * @returns `{ success: true, method, data }` for the first successful
 *   strategy, otherwise `{ success: false, error }`
 */
async function extractWithStrategies(
  url: string,
  page: Page,
  context: BrowserContext,
  onProgress?: ProgressCallback
): Promise<ExtractionResult> {
  type StrategyRunner = () => Promise<ExtractedContent | null>;

  // Legacy fallback: plain body text plus a thumbnail
  const runLegacy: StrategyRunner = async () => {
    const bodyText = await extractCleanTextLegacy(page);
    const thumbnail = await extractThumbnailStealth(page, onProgress);
    return { bodyText, thumbnail };
  };

  // Ordered from most to least preferred
  const pipeline: Array<[ExtractionMethod, StrategyRunner]> = [
    ['embedded-json', () => extractFromEmbeddedJSON(page, onProgress)],
    ['internal-state', () => extractFromInternalState(page, onProgress)],
    ['html-section', () => extractFromHTMLSection(page, onProgress, url)],
    ['dom-selector', () => extractFromDOM(page, onProgress)],
    ['graphql-api', () => extractViaGraphQL(url, context)],
    ['legacy', runLegacy]
  ];

  // Log a message and mirror it to the progress callback
  const announce = (
    type: 'method' | 'status',
    method: ExtractionMethod,
    message: string
  ): void => {
    console.log(`[Extractor] ${message}`);
    onProgress?.({
      type,
      message,
      method,
      timestamp: new Date().toISOString()
    });
  };

  for (const [method, run] of pipeline) {
    try {
      announce('method', method, `Trying extraction method: ${getMethodDisplayName(method)}`);

      const extracted = await run();
      if (!extracted || !extracted.bodyText) {
        continue;
      }

      announce('status', method, `✓ Success with method: ${getMethodDisplayName(method)}`);
      return { success: true, method, data: extracted };
    } catch (error) {
      logError(`[Extractor] Method ${method} failed`, error);
      // Continue to next strategy
    }
  }

  return {
    success: false,
    error: 'All extraction methods failed'
  };
}
|
|
|
|
/**
 * Extract text content and thumbnail from a URL using Playwright browser
 * Uses multiple extraction strategies with fallback
 *
 * Each attempt (attempts are governed by `withRetry`) opens a fresh browser
 * context and page, loads the URL, scrolls a little to trigger lazy loading
 * and then runs `extractWithStrategies`. API responses seen while the page is
 * alive are inspected for a caption; that intercepted caption is used ONLY
 * when every strategy fails, because API captions may be silently truncated.
 *
 * The page and the context are always closed, also when page creation or
 * closing the page fails. Writing the `debug_page.txt` debug file is
 * best-effort and never fails the extraction.
 *
 * @param url - The URL to extract from
 * @param onProgress - Optional callback to receive progress updates
 * @returns Extracted text and thumbnail
 * @throws Error when neither the strategies nor the intercepted caption
 *   produce text (subject to the retry policy of `withRetry`)
 */
export async function extractTextAndThumbnail(
  url: string,
  onProgress?: ProgressCallback
): Promise<ExtractedContent> {
  onProgress?.({
    type: 'status',
    message: 'Starting extraction...',
    timestamp: new Date().toISOString()
  });

  return withRetry(
    async () => {
      const authPath = resolveAuthPath();
      const context = await createBrowserContext(authPath);

      try {
        const page = await context.newPage();

        try {
          // Extract shortcode for validation
          const expectedShortcode = extractShortcode(url);
          console.log(`[Extractor] Target shortcode: ${expectedShortcode || 'unknown'}`);

          // Set timeout
          page.setDefaultTimeout(30000);

          // Set up GraphQL response interception BEFORE loading the page
          // This is critical to catch initial network requests during page load.
          // A holder object is used because the value is written from the
          // listener and read later in this function.
          const intercepted: { caption: string | null } = { caption: null };
          page.on('response', async (response) => {
            try {
              const responseUrl = response.url();
              const isApiResponse =
                responseUrl.includes('graphql') ||
                responseUrl.includes('api/v1') ||
                responseUrl.includes('/web/');
              if (!isApiResponse) {
                return;
              }

              const json = await response.json();
              const captionData = extractCaptionFromGraphQL(json, expectedShortcode ?? undefined);
              if (!captionData || captionData.length <= 130) {
                return;
              }

              // Keep the longest caption seen: API captions may be truncated,
              // so a later, shorter one must not replace a longer one.
              if (!intercepted.caption || captionData.length > intercepted.caption.length) {
                intercepted.caption = captionData;
              }
              console.log(
                `[Extractor] ✓ Intercepted GraphQL with full caption: ${captionData.length} chars (shortcode verified)`
              );
            } catch (e) {
              // Not JSON, body unavailable, or parse error: skip this response
            }
          });

          onProgress?.({
            type: 'status',
            message: 'Loading Instagram page...',
            timestamp: new Date().toISOString()
          });

          await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30000 });

          // Add small human-like delay
          await page.waitForTimeout(1000 + Math.random() * 2000);

          // Try scrolling and waiting to trigger additional GraphQL requests
          console.log('[Extractor] Scrolling to trigger lazy loading...');
          await page.evaluate(() => {
            window.scrollBy(0, 300);
          });
          await page.waitForTimeout(1500);

          await page.evaluate(() => {
            window.scrollBy(0, 300);
          });
          await page.waitForTimeout(1500);

          await page.evaluate(() => {
            window.scrollTo(0, 0);
          });
          await page.waitForTimeout(1000);

          // Always use DOM extraction (HTML Section) — it clicks "… more" in
          // the browser and gets the fully expanded caption. The GraphQL
          // interception is unreliable: Instagram often truncates captions
          // in API responses without any "…." marker, so we cannot trust
          // the intercepted text to be complete.
          if (intercepted.caption) {
            console.log(
              `[Extractor] Intercepted GraphQL caption (${intercepted.caption.length} chars) — always using DOM extraction for full text`
            );
          }

          const result = await extractWithStrategies(url, page, context, onProgress);

          if (!result.success || !result.data) {
            // DOM extraction failed — fall back to intercepted caption if available.
            // Read it now (not before the strategies ran) so captions intercepted
            // during the strategies are taken into account as well.
            const fallbackCaption = intercepted.caption;
            if (fallbackCaption) {
              console.log(
                '[Extractor] DOM extraction failed — using intercepted GraphQL caption as fallback'
              );
              const thumbnail = await extractThumbnailStealth(page, onProgress);

              // Report completion like the regular success path does
              onProgress?.({
                type: 'complete',
                message: 'Extraction completed successfully',
                method: 'graphql-intercept',
                timestamp: new Date().toISOString()
              });

              return { bodyText: cleanText(fallbackCaption), thumbnail };
            }
            throw new Error(result.error || 'Extraction failed');
          }

          // Save debug content (best-effort: e.g. a read-only working
          // directory must not turn a successful extraction into a failure)
          try {
            fs.writeFileSync(
              path.resolve('debug_page.txt'),
              `Method: ${result.method}\n\n${result.data.bodyText}`
            );
          } catch (error) {
            logError('[Extractor] Failed to write debug_page.txt', error);
          }

          onProgress?.({
            type: 'complete',
            message: 'Extraction completed successfully',
            method: result.method,
            timestamp: new Date().toISOString()
          });

          return result.data;
        } finally {
          await page.close();
        }
      } finally {
        // Runs even when newPage() or page.close() throws
        await context.close();
      }
    },
    DEFAULT_RETRY_CONFIG,
    onProgress
  );
}
|
|
|
|
/**
 * Screenshot-based thumbnail extraction (fallback method)
 *
 * Captures a JPEG (quality 85) of the first `<video>` element's on-screen
 * rectangle. The rectangle's origin is clamped to be non-negative and its
 * size is capped at the window's inner dimensions. When there is no video
 * element, or the rectangle has no area, a regular page screenshot is taken
 * instead.
 *
 * @param page - Page to capture
 * @returns The screenshot as a `data:image/jpeg;base64,...` URI
 */
async function extractThumbnailScreenshot(page: Page): Promise<string | null> {
  const bounds = await page.evaluate(() => {
    const videoEl = document.querySelector('video');
    if (!videoEl) return null;

    const box = videoEl.getBoundingClientRect();
    return {
      x: Math.max(0, box.left),
      y: Math.max(0, box.top),
      width: Math.min(box.width, window.innerWidth),
      height: Math.min(box.height, window.innerHeight)
    };
  });

  // Only clip when the video occupies a non-empty area
  const clipRegion = bounds && bounds.width > 0 && bounds.height > 0 ? bounds : null;

  if (!clipRegion) {
    console.warn('[Thumbnail] Video element not found or has no size, taking full page screenshot');
  }

  const shot: Buffer = clipRegion
    ? await page.screenshot({ type: 'jpeg', quality: 85, clip: clipRegion })
    : await page.screenshot({ type: 'jpeg', quality: 85 });

  return `data:image/jpeg;base64,${shot.toString('base64')}`;
}
|
|
|
|
/**
 * Helper: Fetch image from URL and convert to base64 data URI
 *
 * **Validation Criteria:**
 * - HTTP status must be exactly 200 (not 2xx, only 200)
 * - Content-Type must start with 'image/' (e.g., image/jpeg, image/png, image/webp)
 * - The whole request, including the body download, must complete within
 *   10 seconds
 *
 * **Failure Scenarios:**
 * - Non-200 status → Returns null, reports status code via progress callback
 * - Invalid content-type → Returns null, reports content-type via progress callback
 * - Timeout → Returns null, reports timeout via progress callback
 * - Network error → Returns null, reports error message via progress callback
 *
 * This function never throws; callers treat `null` as "try the next
 * thumbnail source". The abort timer is cleared on every path so a failed
 * fetch does not leave a pending 10 second timer behind.
 *
 * @param imageUrl - The image URL to fetch (the scheme is not checked here)
 * @param progressCallback - Optional callback for progress reporting
 * @returns Base64 data URI (data:image/*;base64,...) or null if validation fails
 *
 * @example
 * ```typescript
 * const thumbnail = await fetchImageAsBase64(
 *   'https://instagram.com/image.jpg',
 *   (event) => console.log(event.message)
 * );
 *
 * if (thumbnail) {
 *   // thumbnail is a valid base64 data URI
 *   console.log(thumbnail.substring(0, 50)); // "data:image/jpeg;base64,/9j/4AAQSkZJRg..."
 * } else {
 *   // URL validation failed, try next method
 * }
 * ```
 */
async function fetchImageAsBase64(
  imageUrl: string,
  progressCallback?: ProgressCallback
): Promise<string | null> {
  // Create abort controller for timeout
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), 10000); // 10s timeout

  // All progress reports from this helper are plain status messages
  const reportStatus = (message: string): void => {
    progressCallback?.({
      type: 'status',
      message,
      timestamp: new Date().toISOString()
    });
  };

  try {
    console.log(`[Thumbnail] Validating URL: ${imageUrl}`);

    const response = await fetch(imageUrl, {
      signal: controller.signal
    });

    // Strict status validation: must be exactly 200
    if (response.status !== 200) {
      console.warn(`[Thumbnail] URL validation failed: HTTP ${response.status} for ${imageUrl}`);
      reportStatus(`Thumbnail URL returned HTTP ${response.status}, trying next method...`);
      return null;
    }

    // Validate content-type
    const contentType = response.headers.get('content-type') || '';
    if (!contentType.startsWith('image/')) {
      console.warn(
        `[Thumbnail] URL validation failed: Invalid content-type '${contentType}' for ${imageUrl}`
      );
      reportStatus(`Thumbnail URL returned non-image content (${contentType}), trying next method...`);
      return null;
    }

    console.log(`[Thumbnail] URL validation successful: ${imageUrl} (${contentType})`);

    // The timer is still armed here, so a stalled body download is aborted too
    const arrayBuffer = await response.arrayBuffer();
    const base64Data = `data:${contentType};base64,${Buffer.from(arrayBuffer).toString('base64')}`;

    reportStatus('Thumbnail fetched and validated from URL');

    return base64Data;
  } catch (e) {
    if (e instanceof Error) {
      if (e.name === 'AbortError') {
        console.error(`[Thumbnail] URL fetch timeout: ${imageUrl}`);
        reportStatus('Thumbnail URL fetch timeout, trying next method...');
      } else {
        console.error(`[Thumbnail] Failed to fetch image from ${imageUrl}:`, e.message);
        reportStatus(`Thumbnail URL fetch failed (${e.message}), trying next method...`);
      }
    } else {
      logError('[Thumbnail] Failed to fetch image', e);
    }
    return null;
  } finally {
    clearTimeout(timeoutId);
  }
}
|
|
|
|
/**
 * Extract thumbnail from Instagram post using stealth techniques
 *
 * Tries multiple sources in order of stealth:
 * 1. Meta tags (og:image, then twitter:image)
 * 2. Video poster attribute
 * 3. Instagram window data structures (display_url / thumbnail_src)
 * 4. Screenshot fallback
 *
 * For sources 1-3 the URL found in the page is downloaded with
 * `fetchImageAsBase64`, so every source yields a base64 data URI
 * (`data:image/...;base64,...`), never a plain HTTPS URL.
 *
 * Attributes are read from the current DOM without waiting: a missing
 * element simply means "try the next source" instead of blocking until the
 * page's default timeout expires. Each source is tried independently, so a
 * failure in one never skips another.
 *
 * @param page - Playwright page instance
 * @param progressCallback - Optional progress callback for SSE updates
 * @returns Base64 data URI of the thumbnail, or null if all methods fail
 */
async function extractThumbnailStealth(
  page: Page,
  progressCallback?: ProgressCallback
): Promise<string | null> {
  console.log('[Thumbnail] Starting stealth extraction');

  // Read an attribute of the first element matching `selector`, or null when
  // the element or the attribute is absent. Does not wait for the element.
  const readAttribute = (selector: string, attribute: string): Promise<string | null> =>
    page.evaluate(
      (query) => document.querySelector(query.selector)?.getAttribute(query.attribute) ?? null,
      { selector, attribute }
    );

  // Look for an image URL in Instagram's `__additionalDataLoaded` global
  const readInstagramDataUrl = (): Promise<string | null> =>
    page.evaluate(() => {
      // Check for Instagram's internal data structures
      const data = (window as any).__additionalDataLoaded;
      if (data) {
        // Navigate through Instagram's data structure
        for (const key in data) {
          const item = data[key];
          if (item?.graphql?.shortcode_media?.display_url) {
            return item.graphql.shortcode_media.display_url;
          }
          if (item?.graphql?.shortcode_media?.thumbnail_src) {
            return item.graphql.shortcode_media.thumbnail_src;
          }
        }
      }
      return null;
    });

  // URL-based sources, most stealthy first
  const sources: Array<{
    locate: () => Promise<string | null>;
    foundLog: string;
    successMessage: string;
    failureLog: string;
  }> = [
    {
      locate: () => readAttribute('meta[property="og:image"]', 'content'),
      foundLog: '[Thumbnail] Found og:image meta tag',
      successMessage: 'Thumbnail extracted from meta tags',
      failureLog: '[Thumbnail] Meta tag method failed'
    },
    {
      locate: () => readAttribute('meta[name="twitter:image"]', 'content'),
      foundLog: '[Thumbnail] Found twitter:image meta tag',
      successMessage: 'Thumbnail extracted from meta tags',
      failureLog: '[Thumbnail] Meta tag method failed'
    },
    {
      locate: () => readAttribute('video', 'poster'),
      foundLog: '[Thumbnail] Found video poster attribute',
      successMessage: 'Thumbnail extracted from video poster',
      failureLog: '[Thumbnail] Video poster method failed'
    },
    {
      locate: readInstagramDataUrl,
      foundLog: '[Thumbnail] Found thumbnail in Instagram data structures',
      successMessage: 'Thumbnail extracted from Instagram data',
      failureLog: '[Thumbnail] Instagram data method failed'
    }
  ];

  for (const source of sources) {
    try {
      const imageUrl = await source.locate();
      if (!imageUrl) {
        continue;
      }
      console.log(source.foundLog);

      const imageData = await fetchImageAsBase64(imageUrl, progressCallback);
      if (imageData) {
        progressCallback?.({
          type: 'thumbnail',
          message: source.successMessage,
          data: { thumbnail: imageData },
          timestamp: new Date().toISOString()
        });
        return imageData;
      }
    } catch (e) {
      logError(source.failureLog, e);
    }
  }

  // Method 4: Screenshot fallback (existing method)
  console.log('[Thumbnail] Falling back to screenshot method');
  let screenshotThumbnail: string | null;
  try {
    screenshotThumbnail = await extractThumbnailScreenshot(page);
  } catch (e) {
    // A missing thumbnail must not discard text that was already extracted
    logError('[Thumbnail] Screenshot method failed', e);
    return null;
  }

  if (screenshotThumbnail && progressCallback) {
    progressCallback({
      type: 'thumbnail',
      message: 'Thumbnail extracted via screenshot',
      data: { thumbnail: screenshotThumbnail },
      timestamp: new Date().toISOString()
    });
  }
  return screenshotThumbnail;
}
|