fix instagram extraction

This commit is contained in:
Giancarmine Salucci
2026-02-17 19:52:25 +01:00
parent 56d3aec3e2
commit ea535bd9dd
6 changed files with 1390 additions and 97 deletions

View File

@@ -1,6 +1,11 @@
import { chromium, type Browser, type BrowserContext } from 'playwright';
import { chromium } from 'playwright-extra';
import type { Browser, BrowserContext } from 'playwright';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import fs from 'fs';
// Apply stealth plugin with all evasion techniques
chromium.use(StealthPlugin());
let browser: Browser | null = null;
interface BrowserOptions {
@@ -16,8 +21,11 @@ export async function initializeBrowser(): Promise<Browser> {
}
console.log('Initializing Playwright browser...');
browser = await chromium.launch({
executablePath: '/usr/bin/chromium-browser',
// Use environment variable or let Playwright use its bundled browser
const executablePath = process.env.CHROMIUM_EXECUTABLE_PATH || '/usr/bin/google-chrome';
const launchOptions: Parameters<typeof chromium.launch>[0] = {
headless: true,
args: [
'--disable-blink-features=AutomationControlled',
@@ -26,7 +34,14 @@ export async function initializeBrowser(): Promise<Browser> {
'--disable-setuid-sandbox',
'--disable-gpu'
]
});
};
// In test environment, let Playwright use bundled browser
if (process.env.NODE_ENV !== 'test' && process.env.VITEST !== 'true') {
launchOptions.executablePath = executablePath;
}
browser = await chromium.launch(launchOptions);
console.log('Browser initialized successfully');
return browser;
@@ -85,25 +100,13 @@ export async function createBrowserContext(
context = await browserInstance.newContext(contextOptions);
// Mask automation indicators
await context.addInitScript(() => {
// Override navigator.webdriver
Object.defineProperty(navigator, 'webdriver', {
get: () => false
});
// Mock Chrome runtime
(window as any).chrome = {
runtime: {}
};
// Mock permissions
const originalQuery = window.navigator.permissions.query;
window.navigator.permissions.query = (parameters: any) =>
parameters.name === 'notifications'
? Promise.resolve({ state: 'denied' } as PermissionStatus)
: originalQuery(parameters);
});
// Note: Anti-detection scripts are now handled automatically by the stealth plugin
// The plugin applies 15+ evasion techniques including:
// - navigator.webdriver masking
// - chrome.runtime mocking
// - User-Agent override
// - WebGL fingerprinting evasion
// - And many more...
return context;
}

View File

@@ -9,7 +9,7 @@ export interface ExtractedContent {
thumbnail: string | null;
}
/**
 * Identifier of an extraction strategy. getMethodDisplayName keys its
 * display-name table by this union, so every member needs an entry there.
 */
export type ExtractionMethod =
  | 'embedded-json'
  | 'internal-state'
  | 'html-section'
  | 'dom-selector'
  | 'graphql-api'
  | 'legacy';
export type ProgressEventType = 'status' | 'method' | 'retry' | 'error' | 'thumbnail' | 'complete';
@@ -116,6 +116,8 @@ function isNonRetriableError(error: unknown): boolean {
function getMethodDisplayName(method: ExtractionMethod): string {
const names: Record<ExtractionMethod, string> = {
'embedded-json': 'Embedded JSON',
'internal-state': 'Internal State',
'html-section': 'HTML Section',
'dom-selector': 'DOM Selector',
'graphql-api': 'GraphQL API',
legacy: 'Legacy Parser'
@@ -175,8 +177,8 @@ async function withRetry<T>(
* Extract shortcode from Instagram URL
*/
function extractShortcode(url: string): string | null {
  // Accepts /p/, /reel/, /reels/ and /tv/ post URLs. The path kind is a
  // non-capturing group, so capture group 1 is the shortcode itself.
  const match = /\/(?:p|reels?|tv)\/([A-Za-z0-9_-]+)/.exec(url);
  return match?.[1] ?? null;
}
@@ -186,8 +188,22 @@ function extractShortcode(url: string): string | null {
export function cleanText(text: string): string {
let cleaned = text;
// Remove common UI text patterns BEFORE normalizing whitespace
// This way patterns like "Liked by..." and "View all..." can be matched across lines
// First, convert <br> tags to newlines to preserve line breaks
cleaned = cleaned.replace(/<br\s*\/?>/gi, '\n');
// Strip all other HTML tags while keeping the text content
cleaned = cleaned.replace(/<[^>]+>/g, '');
// Decode HTML entities
cleaned = cleaned
.replace(/&amp;/g, '&')
.replace(/&lt;/g, '<')
.replace(/&gt;/g, '>')
.replace(/&quot;/g, '"')
.replace(/&#039;/g, "'")
.replace(/&nbsp;/g, ' ');
// Remove common UI text patterns
const uiPatterns = [
/More posts from.+/gi,
/View all \d+ comments/gi,
@@ -199,8 +215,16 @@ export function cleanText(text: string): string {
cleaned = cleaned.replace(pattern, '');
});
// Remove excessive whitespace and normalize (after UI pattern removal)
cleaned = cleaned.replace(/\s+/g, ' ').trim();
// Clean up whitespace while preserving intentional line breaks
// Remove spaces at the beginning and end of lines
cleaned = cleaned.replace(/[ \t]+$/gm, ''); // trailing spaces on each line
cleaned = cleaned.replace(/^[ \t]+/gm, ''); // leading spaces on each line
// Replace multiple consecutive blank lines with max 2 newlines
cleaned = cleaned.replace(/\n\s*\n\s*\n+/g, '\n\n');
// Remove spaces around newlines
cleaned = cleaned.replace(/ *\n */g, '\n');
// Remove hashtags from end of text
// Pattern: #word #multiple_words (supports international characters)
@@ -218,16 +242,31 @@ async function extractFromEmbeddedJSON(
): Promise<ExtractedContent | null> {
try {
// Extract all script tag contents
const scriptContents = await page.evaluate(() => {
const scripts = Array.from(document.querySelectorAll('script[type="text/javascript"]'));
return scripts.map((script) => script.textContent || '');
const scriptInfo = await page.evaluate(() => {
const scripts = Array.from(document.querySelectorAll('script'));
const scriptData = scripts.map((script, idx) => ({
type: script.getAttribute('type') || 'no-type',
hasContent: !!script.textContent,
length: script.textContent?.length || 0,
preview: script.textContent?.substring(0, 100) || ''
}));
console.log(`[Extractor] Found ${scripts.length} script tags`);
return {
contents: scripts.map((script) => script.textContent || ''),
info: scriptData
};
});
console.log(`[Extractor] Script tags summary:`, scriptInfo.info);
// Look for embedded data patterns
for (const content of scriptContents) {
for (let i = 0; i < scriptInfo.contents.length; i++) {
const content = scriptInfo.contents[i];
// Try window._sharedData pattern
const sharedDataMatch = content.match(/window\._sharedData\s*=\s*(\{.+?\});/s);
if (sharedDataMatch) {
console.log(`[Extractor] Found _sharedData in script ${i}`);
try {
const data: InstagramEmbeddedData = JSON.parse(sharedDataMatch[1]);
const result = parseInstagramData(data);
@@ -243,6 +282,7 @@ async function extractFromEmbeddedJSON(
// Try __additionalDataLoaded pattern
const additionalDataMatch = content.match(/window\.__additionalDataLoaded\([^,]+,\s*(\{.+?\})\);/s);
if (additionalDataMatch) {
console.log(`[Extractor] Found __additionalDataLoaded in script ${i}`);
try {
const data = JSON.parse(additionalDataMatch[1]);
const result = parseInstagramData(data);
@@ -254,6 +294,59 @@ async function extractFromEmbeddedJSON(
logError('[Extractor] Failed to parse __additionalDataLoaded', e);
}
}
// Try to find any large JSON with caption data (new Instagram format)
if ((content.includes('"caption"') || content.includes('"text"')) && content.length > 10000) {
console.log(`[Extractor] Attempting to extract from large JSON in script ${i} (length: ${content.length})`);
try {
// Try to parse as direct JSON
const jsonData = JSON.parse(content);
// Try deep search first
const deepResult = deepSearchForCaption(jsonData);
if (deepResult && deepResult.bodyText && deepResult.bodyText.length > 130) {
console.log(`[Extractor] Deep search in JSON found caption: ${deepResult.bodyText.length} chars`);
const thumbnail = await extractThumbnailStealth(page, progressCallback);
return { ...deepResult, thumbnail };
}
// Try standard parsing
const result = parseInstagramData(jsonData);
if (result && result.bodyText && result.bodyText.length > 130) {
console.log(`[Extractor] Successfully extracted from JSON, text length: ${result.bodyText.length}`);
const thumbnail = await extractThumbnailStealth(page, progressCallback);
return { ...result, thumbnail };
}
} catch (e) {
// Not direct JSON or parsing failed, try to find caption fields with regex
console.log(`[Extractor] JSON parse failed, trying regex extraction...`);
// Try multiple patterns for different Instagram JSON structures
const patterns = [
/"caption"\s*:\s*\{\s*"text"\s*:\s*"([^"\\]*(\\.[^"\\]*)*)"/, // Escaped quotes
/"text"\s*:\s*"([^"\\]*(\\.[^"\\]*)*)"\s*,?\s*"pk"/, // text field near pk
/"edge_media_to_caption"\s*:\s*\{\s*"edges"\s*:\s*\[\s*\{\s*"node"\s*:\s*\{\s*"text"\s*:\s*"([^"\\]*(\\.[^"\\]*)*)"/,
];
for (const pattern of patterns) {
const captionMatch = content.match(pattern);
if (captionMatch) {
// Get the captured group (first non-undefined)
const rawText = captionMatch[1] || '';
const captionText = rawText
.replace(/\\n/g, '\n')
.replace(/\\"/g, '"')
.replace(/\\u([0-9a-fA-F]{4})/g, (_, code) => String.fromCharCode(parseInt(code, 16)))
.replace(/\\\\/g, '\\');
if (captionText.length > 130) {
console.log(`[Extractor] Extracted caption from regex pattern, length: ${captionText.length}`);
const thumbnail = await extractThumbnailStealth(page, progressCallback);
return { bodyText: cleanText(captionText), thumbnail };
}
}
}
}
}
}
return null;
@@ -322,37 +415,446 @@ function extractFromAlternativeStructure(items: any): Omit<ExtractedContent, 'th
}
/**
* Strategy 2: Extract from DOM using specific selectors
* Strategy 2.5: Extract caption by finding the span with recipe content characteristics
* Instagram uses obfuscated class names, but the caption span has identifiable patterns:
* - Contains substantial text (> 100 chars)
* - Has multiple <br> tags for formatting
* - Contains <a> tags for mentions and hashtags
* - Usually has a style attribute with line-height
*/
export async function extractFromHTMLSection(
  page: Page,
  progressCallback?: ProgressCallback,
  targetUrl?: string
): Promise<ExtractedContent | null> {
  // Flow: (1) confirm the page URL carries the target shortcode, (2) try to
  // expand a truncated caption, (3) score every candidate <span> in the page
  // and take the best one, (4) hand its innerHTML to cleanText.
  // Returns null on URL mismatch, when no span scores above zero, or on any error.
  try {
    console.log('[Extractor] Waiting for page content to load...');

    // Validate we're on the correct page
    const currentUrl = page.url();
    const targetShortcode = targetUrl ? extractShortcode(targetUrl) : null;
    const currentShortcode = extractShortcode(currentUrl);
    console.log(`[Extractor] Current page URL: ${currentUrl}`);
    console.log(`[Extractor] Target shortcode: ${targetShortcode}, Current shortcode: ${currentShortcode}`);
    if (targetShortcode && currentShortcode !== targetShortcode) {
      console.log(`[Extractor] URL mismatch: expected ${targetShortcode}, got ${currentShortcode}`);
      return null;
    }
    console.log(`[Extractor] Confirmed on correct post: ${currentShortcode}`);

    // Wait for the DOM to be parsed, then give client-side rendering a moment
    await page.waitForLoadState('domcontentloaded', { timeout: 10000 });
    await page.waitForTimeout(2000);

    await expandTruncatedCaption(page);

    console.log('[Extractor] Extracting caption using intelligent span detection...');
    const result = await page.evaluate((shortcode: string | null) => {
      // NOTE: this callback runs inside the browser page, so it cannot reference
      // anything from the enclosing module, and console.log goes to the page console.
      const recipeKeywords = [
        'ingredienti',
        'procedimento',
        'preparazione',
        'ricetta',
        'recipe',
        'instructions'
      ];

      // Prefer containers that hold a link to the target post. Without a
      // shortcode there is nothing to look for, so skip the query entirely
      // instead of matching the literal string "null".
      const searchRoots: Element[] = [];
      if (shortcode) {
        const postLinks = document.querySelectorAll(`a[href*="/${shortcode}"]`);
        console.log(`[Extractor] Found ${postLinks.length} links to target post ${shortcode}`);
        for (const link of Array.from(postLinks)) {
          const container =
            link.closest('article') || link.closest('section') || link.closest('[role="main"]');
          if (container && !searchRoots.includes(container)) {
            searchRoots.push(container);
            console.log(`[Extractor] Found container for target post`);
          }
        }
      }

      // If no specific containers found, search the whole document (fallback)
      if (searchRoots.length === 0) {
        console.log(`[Extractor] No specific container found, searching whole document`);
        searchRoots.push(document.body);
      }

      const spans: HTMLElement[] = [];
      for (const root of searchRoots) {
        for (const span of Array.from(root.querySelectorAll<HTMLElement>('span'))) {
          spans.push(span);
        }
      }
      console.log(`[Extractor] Searching ${spans.length} spans for recipe content`);

      type Candidate = { text: string; score: number; innerHTML: string; brCount: number };
      // A plain loop (not forEach) so the compiler can see the assignments and
      // does not narrow `best` to `never` after the null check below.
      let best: Candidate | null = null;

      for (const span of spans) {
        const text = (span.textContent || '').toLowerCase();
        const innerHTML = span.innerHTML || '';

        // Skip empty or very short spans
        if (text.length < 30) continue;

        // <br> tags are the primary signal: 100 points per line break
        const brCount = (innerHTML.match(/<br\s*\/?>/gi) || []).length;
        let score = brCount * 100;

        // Recipe keywords are a strong indicator
        if (recipeKeywords.some((keyword) => text.includes(keyword))) {
          score += 500;
        }

        // Captions carry hashtag/mention links
        const linkCount = span.querySelectorAll('a').length;
        if (linkCount > 2) {
          score += linkCount * 10;
        }

        // Longer text is better, capped at 200 points
        score += Math.min(text.length / 5, 200);

        // Inline line-height styling is treated as a caption hint
        const style = span.getAttribute('style') || '';
        if (style.includes('line-height')) {
          score += 30;
        }

        // Penalize UI elements
        if (text.match(/^(follow|following|liked by|view all|more posts|comments)/i)) {
          score -= 500;
        }

        // Penalize short audio/music credit lines.
        // NOTE(review): two of the alternatives are hard-coded track names — confirm
        // whether a generic audio-credit heuristic was intended.
        if (text.match(/·|papaoutai|afro soul/i) && text.length < 100) {
          score -= 200;
        }

        if (score > 0 && (!best || score > best.score)) {
          console.log(`[Extractor] New best: score=${score}, len=${text.length}, br=${brCount}, links=${linkCount}, preview="${text.substring(0, 80)}..."`);
          best = {
            text: span.textContent || '',
            score,
            innerHTML,
            brCount
          };
        }
      }

      if (!best) {
        return {
          success: false,
          error: 'No suitable caption span found',
          text: ''
        };
      }

      console.log(`[Extractor] Final caption candidate: score=${best.score}, length=${best.text.length}`);

      // Return innerHTML (not textContent) so <br> tags survive until cleanText
      // converts them to newlines.
      const captionHtml = best.innerHTML;
      return {
        success: true,
        text: captionHtml,
        score: best.score,
        length: captionHtml.length,
        htmlPreview: captionHtml.substring(0, 500)
      };
    }, currentShortcode);

    console.log(`[Extractor] HTML Section result:`, {
      success: result.success,
      textLength: result.length,
      score: result.score
    });
    if (result.htmlPreview) {
      console.log('[Extractor] HTML preview (first 500 chars):');
      console.log(result.htmlPreview);
    }
    if (!result.success) {
      console.log(`[Extractor] ${result.error}`);
      return null;
    }

    const captionText = result.text;
    if (!captionText || captionText.length === 0) {
      console.log('[Extractor] No text extracted from HTML section');
      return null;
    }

    const thumbnail = await extractThumbnailStealth(page, progressCallback);
    return {
      bodyText: cleanText(captionText),
      thumbnail
    };
  } catch (error) {
    logError('[Extractor] Failed to extract from HTML section', error);
    return null;
  }
}

/**
 * Best-effort click on the caption's "more" control inside the first
 * article/main container of the page. Never throws: every failure is logged
 * and swallowed so extraction can continue with the truncated caption.
 */
async function expandTruncatedCaption(page: Page): Promise<void> {
  console.log('[Extractor] Looking for "more" button in primary post container...');
  try {
    // Wait for content to load
    await page.waitForTimeout(1500);

    // The URL has already been validated by the caller, so the first
    // article/main container is taken to be the target post.
    const mainContainer = page.locator('article, main, [role="main"]').first();
    if ((await mainContainer.count()) === 0) {
      console.log('[Extractor] No main container found');
    } else {
      console.log('[Extractor] Found main post container, searching for "more" button...');

      // Candidate locators for the "more" control, tried in order
      const morePatterns = [
        { locator: mainContainer.locator('span').filter({ hasText: /\.\.\.\s*more/i }), desc: "span with '...more'" },
        { locator: mainContainer.locator('span').filter({ hasText: /…\s*more/i }), desc: "span with '… more'" },
        { locator: mainContainer.locator('div[role="button"]').filter({ hasText: /more/i }), desc: "button with 'more'" },
        { locator: mainContainer.locator('span[role="button"]').filter({ hasText: /more/i }), desc: "span button with 'more'" }
      ];

      for (const pattern of morePatterns) {
        const count = await pattern.locator.count();
        console.log(`[Extractor] Checking ${pattern.desc}: found ${count}`);
        if (count === 0) continue;

        const firstMore = pattern.locator.first();
        try {
          if (await firstMore.isVisible({ timeout: 1000 })) {
            const text = await firstMore.textContent();
            console.log(`[Extractor] Found visible "more": "${text}"`);
            await firstMore.click();
            console.log('[Extractor] Clicked "more" - waiting for expansion...');
            await page.waitForTimeout(3000);
            console.log('[Extractor] Caption expansion complete');
            break; // One successful click is enough
          }
        } catch (e) {
          console.log(`[Extractor] ${pattern.desc} not clickable: ${e}`);
        }
      }
    }
    console.log('[Extractor] Finished "more" button expansion attempt');
  } catch (e) {
    console.log(`[Extractor] Error while trying to expand caption: ${e}`);
  }
}
/**
* Strategy 3: Extract from DOM using specific selectors
*/
export async function extractFromDOM(
page: Page,
progressCallback?: ProgressCallback
): Promise<ExtractedContent | null> {
try {
const captionText = await page.evaluate(() => {
// Try multiple selectors in order of reliability
const selectors = [
'article h1', // Semantic title element
'article span[dir="auto"]', // Caption with dir attribute
'article div[role="button"] + span', // Caption after interactive element
'article span:not([aria-label])', // Non-labeled spans (likely caption)
];
// Give Instagram more time to load dynamic content
console.log('[Extractor] Waiting for network idle...');
await page.waitForLoadState('networkidle', { timeout: 10000 }).catch(() => {
console.log('[Extractor] Network idle timeout, continuing anyway');
});
// Try to wait for article content
await page.waitForSelector('article', { timeout: 5000 }).catch(() => {});
// Additional wait for dynamic content
await page.waitForTimeout(2000);
// Try to intercept GraphQL responses
let graphqlCaption: string | null = null;
page.on('response', async (response) => {
const url = response.url();
if (url.includes('graphql') || url.includes('api/v1')) {
try {
const json = await response.json();
// Try to find caption in the response
const captionData = extractCaptionFromGraphQL(json);
if (captionData && captionData.length > 130) {
graphqlCaption = captionData;
console.log(`[Extractor] Intercepted GraphQL response with ${captionData.length} chars`);
}
} catch (e) {
// Not JSON or parsing failed
}
}
});
// Wait a bit for any GraphQL requests to complete
await page.waitForTimeout(1000);
if (graphqlCaption) {
const thumbnail = await extractThumbnailStealth(page, progressCallback);
return { bodyText: cleanText(graphqlCaption), thumbnail };
}
// First, try to expand truncated captions by clicking "more" button
// Try multiple times with different selectors
let expandAttempts = 0;
const maxExpandAttempts = 3;
while (expandAttempts < maxExpandAttempts) {
try {
const moreButtonSelectors = [
'article button:has-text("more")',
'article button:has-text("More")',
'article button:has-text("… more")',
'article span[role="button"]:has-text("more")',
'article [role="button"]:has-text("more")',
'article div[role="button"]:has-text("more")',
'xpath=//article//span[contains(text(), "more")]/..',
'xpath=//article//button[contains(., "more")]'
];
let clicked = false;
for (const selector of moreButtonSelectors) {
try {
const button = page.locator(selector).first();
if (await button.isVisible({ timeout: 500 })) {
await button.click();
await page.waitForTimeout(800);
console.log(`[Extractor] Clicked "more" button with selector: ${selector}`);
clicked = true;
expandAttempts++;
break;
}
} catch (e) {
// Try next selector
}
}
if (!clicked) break; // No more buttons found
} catch (e) {
break;
}
}
for (const selector of selectors) {
const element = document.querySelector(selector);
if (element?.textContent && element.textContent.length > 100) {
// Only accept elements with substantial text (not UI labels)
console.log(`[Extractor] DOM selector matched: ${selector}`);
return element.textContent.trim();
const captionText = await page.evaluate(() => {
// First check og:description for comparison
const metaDesc = document.querySelector('meta[property="og:description"]');
const ogContent = metaDesc?.getAttribute('content') || '';
console.log(`[Extractor] og:description length: ${ogContent.length}`);
if (ogContent.length > 200) {
console.log(`[Extractor] og:description preview: ${ogContent.substring(0, 200)}...`);
}
// SMART APPROACH: Find the truncated text first, then look for full version nearby
// Look for text that ends with "..." or "… more"
const allSpans = Array.from(document.querySelectorAll('article span, article div, article h1'));
let longestText = '';
let matchedElement = null;
// Strategy 1: Find elements with substantial text
for (const element of allSpans) {
const text = element.textContent?.trim() || '';
// Skip UI elements
if (text.match(/^(follow|like|comment|share|view all|load more|add a comment)$/i)) {
continue;
}
// Look for text that seems like content
if (text.length > longestText.length) {
longestText = text;
matchedElement = element;
}
}
// Strategy 2: Look in data attributes
const elementsWithData = Array.from(document.querySelectorAll('[data-caption], [data-text], [data-content]'));
for (const el of elementsWithData) {
const dataCaption = el.getAttribute('data-caption') ||
el.getAttribute('data-text') ||
el.getAttribute('data-content');
if (dataCaption && dataCaption.length > longestText.length) {
longestText = dataCaption;
console.log(`[Extractor] Found data attribute with ${dataCaption.length} chars`);
}
}
// Strategy 3: Look for hidden/collapsed content
const hiddenElements = Array.from(document.querySelectorAll('[style*="display: none"], [style*="display:none"], .collapsed, [aria-hidden="true"]'));
for (const el of hiddenElements) {
const text = el.textContent?.trim() || '';
if (text.length > longestText.length && text.length > 200) {
longestText = text;
console.log(`[Extractor] Found hidden element with ${text.length} chars`);
}
}
// Strategy 4: Find parent of truncated text
if (matchedElement && longestText.endsWith('...')) {
// Look at siblings and parent
const parent = matchedElement.parentElement;
if (parent) {
const parentText = parent.textContent?.trim() || '';
if (parentText.length > longestText.length) {
longestText = parentText;
console.log(`[Extractor] Found fuller text in parent element: ${parentText.length} chars`);
}
}
// Check next siblings
let sibling = matchedElement.nextElementSibling;
let siblingCount = 0;
while (sibling && siblingCount < 5) {
const siblingText = sibling.textContent?.trim() || '';
if (siblingText.length > 50) {
longestText = longestText + ' ' + siblingText;
console.log(`[Extractor] Found continuation in sibling: ${siblingText.length} chars`);
}
sibling = sibling.nextElementSibling;
siblingCount++;
}
}
// Fallback to og:description ONLY if all other methods fail
// NOTE: This contains metadata prefix but better than nothing
const metaDesc = document.querySelector('meta[property="og:description"]');
if (longestText && longestText.length > 100) {
console.log(`[Extractor] Best extraction: ${longestText.length} chars`);
return longestText;
}
// Fallback to og:description
if (metaDesc) {
const content = metaDesc.getAttribute('content') || '';
// Try to strip metadata prefix pattern: "X likes, Y comments - username on date: "
const content = ogContent;
const cleanedContent = content.replace(/^\d+K?\s+likes,\s+\d+\s+comments\s+-\s+[\w.]+\s+on\s+[^:]+:\s*["']?/, '');
console.log('[Extractor] DOM selector fallback: og:description (with metadata cleanup)');
return cleanedContent;
@@ -451,6 +953,149 @@ async function extractCleanTextLegacy(page: Page): Promise<string> {
return text;
}
/**
 * Strategy 5: Extract from Instagram's internal state/cache.
 *
 * Inside the page, probes a fixed list of window globals and serialises every
 * one that is present. Back in Node, each serialised state is parsed and run
 * through parseInstagramData, falling back to deepSearchForCaption; the first
 * state that yields a caption longer than 130 characters wins.
 *
 * @param page - Page whose window globals are inspected.
 * @param progressCallback - Forwarded to extractThumbnailStealth.
 * @returns The caption plus thumbnail, or null when no state yields a full caption.
 */
async function extractFromInternalState(
  page: Page,
  progressCallback?: ProgressCallback
): Promise<ExtractedContent | null> {
  try {
    const candidates = await page.evaluate(() => {
      // Runs in the browser: console.log here goes to the page console.
      const possibleKeys = [
        '_sharedData',
        '__PRIVATE_STATE__',
        '__additionalData',
        '__initialData',
        '__RELAY_STORE__'
      ];
      // Upper bound on what is shipped back to Node for a single state.
      const maxSerializedLength = 500000;
      const found: { key: string; data: string }[] = [];

      for (const key of possibleKeys) {
        // eslint-disable-next-line @typescript-eslint/no-explicit-any -- probing untyped page globals
        const value: unknown = (window as any)[key];
        if (!value) continue;
        console.log(`[Extractor] Found internal state: ${key}`);

        try {
          const serialized = JSON.stringify(value);
          // JSON.stringify yields undefined for functions/symbols.
          if (typeof serialized !== 'string') continue;
          // Cutting the string would produce invalid JSON that can never be
          // parsed, so an oversized state is skipped rather than truncated.
          if (serialized.length > maxSerializedLength) {
            console.log(`[Extractor] Skipping ${key}: serialized size ${serialized.length} exceeds limit`);
            continue;
          }
          found.push({ key, data: serialized });
        } catch (e) {
          // e.g. circular structure — try the next global instead of aborting
          console.log(`[Extractor] Could not serialize ${key}: ${e}`);
        }
      }
      return found;
    });

    for (const { key, data } of candidates) {
      console.log(`[Extractor] Parsing internal state from ${key}`);
      try {
        const parsed = JSON.parse(data);

        // Try the structured parser first
        let result = parseInstagramData(parsed);
        console.log(`[Extractor] Standard parsing result: ${result?.bodyText?.length || 0} chars`);

        // Debug: log structure
        if (parsed.entry_data) {
          console.log(`[Extractor] Found entry_data with keys:`, Object.keys(parsed.entry_data));
        }
        if (parsed.config) {
          console.log(`[Extractor] Found config`);
        }

        // If standard parsing failed, try deep search for caption text
        if (!result || !result.bodyText || result.bodyText.length <= 130) {
          console.log(`[Extractor] Attempting deep search in ${key}...`);
          result = deepSearchForCaption(parsed);
          if (result) {
            console.log(`[Extractor] Deep search found: ${result.bodyText.length} chars`);
          } else {
            console.log(`[Extractor] Deep search found no caption`);
          }
        }

        if (result && result.bodyText && result.bodyText.length > 130) {
          console.log(`[Extractor] Successfully extracted from ${key}, length: ${result.bodyText.length}`);
          const thumbnail = await extractThumbnailStealth(page, progressCallback);
          return { ...result, thumbnail };
        } else if (result?.bodyText) {
          console.log(`[Extractor] Found text in ${key} but it's truncated (${result.bodyText.length} chars)`);
        }
      } catch (e) {
        console.log(`[Extractor] Failed to parse ${key}:`, e);
      }
    }

    return null;
  } catch (error) {
    logError('[Extractor] Failed to extract from internal state', error);
    return null;
  }
}
/**
 * Deep search for caption text in any nested object structure.
 *
 * At each level three shapes are checked, in order: `caption.text`,
 * `edge_media_to_caption.edges[0].node.text`, and a direct `text` string.
 * Only strings longer than 130 characters are accepted; the match is passed
 * through cleanText. If nothing matches, every own enumerable property
 * (including array elements) is searched depth-first.
 *
 * @param obj - Value to search; anything that is not a non-null object yields null.
 * @param maxDepth - Maximum nesting depth to descend into.
 * @param currentDepth - Depth of `obj`; callers normally leave this at 0.
 * @returns The cleaned caption, or null when none is found.
 */
function deepSearchForCaption(obj: any, maxDepth = 10, currentDepth = 0): Omit<ExtractedContent, 'thumbnail'> | null {
  if (currentDepth > maxDepth || !obj || typeof obj !== 'object') {
    return null;
  }

  // Look for caption/text fields
  if (obj.caption && typeof obj.caption === 'object' && obj.caption.text) {
    const text = obj.caption.text;
    if (typeof text === 'string' && text.length > 130) {
      return { bodyText: cleanText(text) };
    }
  }

  // Look for edge_media_to_caption pattern
  if (obj.edge_media_to_caption?.edges?.[0]?.node?.text) {
    const text = obj.edge_media_to_caption.edges[0].node.text;
    if (typeof text === 'string' && text.length > 130) {
      return { bodyText: cleanText(text) };
    }
  }

  // Look for direct text field in media items
  if (obj.text && typeof obj.text === 'string' && obj.text.length > 130) {
    // Make sure it's not just a UI label
    if (!obj.text.match(/^(more|less|follow|like|comment|share)$/i)) {
      return { bodyText: cleanText(obj.text) };
    }
  }

  // Recurse into own enumerable properties. Object.keys is used instead of
  // obj.hasOwnProperty(...) because parsed JSON may itself contain a
  // "hasOwnProperty" key, and null-prototype objects have no such method.
  for (const key of Object.keys(obj)) {
    const result = deepSearchForCaption(obj[key], maxDepth, currentDepth + 1);
    // Re-check the length: cleanText may have shortened the match
    if (result && result.bodyText.length > 130) {
      return result;
    }
  }

  return null;
}
/**
 * Extract caption from an intercepted GraphQL response, optionally validating
 * that the response belongs to the expected shortcode.
 *
 * @param data - Parsed response body.
 * @param expectedShortcode - When set, the serialised response must contain
 *   this string or it is ignored. null/undefined disables the check.
 * @returns The caption found by deepSearchForCaption, or null.
 */
function extractCaptionFromGraphQL(data: any, expectedShortcode?: string | null): string | null {
  if (expectedShortcode) {
    let serialized: string | undefined;
    try {
      // Yields undefined for undefined/function input and throws on cycles.
      serialized = JSON.stringify(data);
    } catch {
      return null;
    }
    if (!serialized?.includes(expectedShortcode)) {
      // This GraphQL response is for different content, ignore it
      return null;
    }
  }

  const result = deepSearchForCaption(data);
  return result?.bodyText || null;
}
/**
* Orchestrate extraction strategies
*/
@@ -468,6 +1113,14 @@ async function extractWithStrategies(
name: 'embedded-json',
fn: () => extractFromEmbeddedJSON(page, onProgress)
},
{
name: 'internal-state',
fn: () => extractFromInternalState(page, onProgress)
},
{
name: 'html-section',
fn: () => extractFromHTMLSection(page, onProgress, url)
},
{
name: 'dom-selector',
fn: () => extractFromDOM(page, onProgress)
@@ -550,11 +1203,38 @@ export async function extractTextAndThumbnail(
const authPath = resolveAuthPath();
const context = await createBrowserContext(authPath);
const page = await context.newPage();
// Extract shortcode for validation
const expectedShortcode = extractShortcode(url);
console.log(`[Extractor] Target shortcode: ${expectedShortcode || 'unknown'}`);
try {
// Set timeout
page.setDefaultTimeout(30000);
// Set up GraphQL response interception BEFORE loading the page
// This is critical to catch initial network requests during page load
let interceptedCaption: string | null = null;
page.on('response', async (response) => {
try {
const responseUrl = response.url();
if (responseUrl.includes('graphql') || responseUrl.includes('api/v1') || responseUrl.includes('/web/')) {
try {
const json = await response.json();
const captionData = extractCaptionFromGraphQL(json, expectedShortcode);
if (captionData && captionData.length > 130) {
interceptedCaption = captionData;
console.log(`[Extractor] ✓ Intercepted GraphQL with full caption: ${captionData.length} chars (shortcode verified)`);
}
} catch (e) {
// Not JSON or parse error, skip
}
}
} catch (e) {
// Ignore response errors
}
});
onProgress?.({
type: 'status',
message: 'Loading Instagram page...',
@@ -566,6 +1246,36 @@ export async function extractTextAndThumbnail(
// Add small human-like delay
await page.waitForTimeout(1000 + Math.random() * 2000);
// Try scrolling and waiting to trigger additional GraphQL requests
console.log('[Extractor] Scrolling to trigger lazy loading...');
await page.evaluate(() => {
window.scrollBy(0, 300);
});
await page.waitForTimeout(1500);
await page.evaluate(() => {
window.scrollBy(0, 300);
});
await page.waitForTimeout(1500);
await page.evaluate(() => {
window.scrollTo(0, 0);
});
await page.waitForTimeout(1000);
// If we intercepted a full caption, use it immediately
if (interceptedCaption) {
console.log('[Extractor] Using intercepted caption from network traffic');
const thumbnail = await extractThumbnailStealth(page, onProgress);
onProgress?.({
type: 'complete',
message: 'Extraction completed via GraphQL interception',
method: 'graphql-intercept',
timestamp: new Date().toISOString()
});
return { bodyText: cleanText(interceptedCaption), thumbnail };
}
const result = await extractWithStrategies(url, page, context, onProgress);
if (!result.success || !result.data) {