|
|
|
|
@@ -9,7 +9,15 @@ export interface ExtractedContent {
|
|
|
|
|
thumbnail: string | null;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
 * String-literal union of extraction method identifiers.
 * getMethodDisplayName maps each identifier to a human-readable label.
 *
 * NOTE(review): this alias is declared twice in this view; the second
 * declaration additionally includes 'graphql-intercept'. This looks like the
 * before/after lines of a diff -- confirm that only one declaration is kept.
 */
export type ExtractionMethod = 'embedded-json' | 'internal-state' | 'html-section' | 'dom-selector' | 'graphql-api' | 'legacy';
|
|
|
|
|
export type ExtractionMethod = 'embedded-json' | 'internal-state' | 'html-section' | 'dom-selector' | 'graphql-api' | 'graphql-intercept' | 'legacy';
|
|
|
|
|
|
|
|
|
|
/**
 * Shape of the "best caption candidate" record tracked by
 * extractFromHTMLSection while it searches spans for caption content.
 * Replaces the inline object type previously written on `bestCandidate`.
 */
type CaptionCandidate = {
|
|
|
|
|
element: Element; // DOM element the candidate was taken from
|
|
|
|
|
text: string; // candidate text; its length is logged for the final candidate
|
|
|
|
|
score: number; // ranking score; logged and returned for the final candidate
|
|
|
|
|
innerHTML: string; // raw markup; returned as the caption text so <br> tags are preserved
|
|
|
|
|
brCount: number; // presumably the number of <br> tags in the element -- TODO confirm against the scoring code
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
/**
 * String-literal union naming the kinds of progress event:
 * 'status', 'method', 'retry', 'error', 'thumbnail' and 'complete'.
 */
export type ProgressEventType = 'status' | 'method' | 'retry' | 'error' | 'thumbnail' | 'complete';
|
|
|
|
|
|
|
|
|
|
@@ -120,6 +128,7 @@ function getMethodDisplayName(method: ExtractionMethod): string {
|
|
|
|
|
'html-section': 'HTML Section',
|
|
|
|
|
'dom-selector': 'DOM Selector',
|
|
|
|
|
'graphql-api': 'GraphQL API',
|
|
|
|
|
'graphql-intercept': 'GraphQL Intercept',
|
|
|
|
|
legacy: 'Legacy Parser'
|
|
|
|
|
};
|
|
|
|
|
return names[method];
|
|
|
|
|
@@ -176,10 +185,10 @@ async function withRetry<T>(
|
|
|
|
|
/**
 * Extract shortcode from Instagram URL.
 *
 * Looks for the first `/p/`, `/reel/`, `/reels/` or `/tv/` path segment
 * anywhere in `url` and returns the run of `[A-Za-z0-9_-]` characters that
 * immediately follows it (capture group 2 of the pattern).
 *
 * @param url - URL string to search; the pattern is not anchored, so the
 *   segment may appear at any position in the string.
 * @returns The captured shortcode, or the "no match" sentinel when the
 *   pattern does not match.
 *
 * NOTE(review): the signature and the return statement each appear twice in
 * this view -- once with `null` and once with `undefined` as the "no match"
 * value. This looks like the before/after lines of a diff; confirm which pair
 * is live before relying on the sentinel.
 */
|
|
|
|
|
function extractShortcode(url: string): string | null {
|
|
|
|
|
function extractShortcode(url: string): string | undefined {
|
|
|
|
|
// Extract from /p/, /reel/, /reels/, /tv/ URLs
|
|
|
|
|
// Group 1 is the path kind (p|reel|reels|tv); group 2 is the shortcode itself.
const match = url.match(/\/(p|reel|reels|tv)\/([A-Za-z0-9_-]+)/);
|
|
|
|
|
return match ? match[2] : null;
|
|
|
|
|
return match ? match[2] : undefined;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
@@ -549,13 +558,7 @@ export async function extractFromHTMLSection(
|
|
|
|
|
|
|
|
|
|
console.log(`[Extractor] Searching ${spans.length} spans for recipe content`);
|
|
|
|
|
|
|
|
|
|
let bestCandidate: {
|
|
|
|
|
element: Element;
|
|
|
|
|
text: string;
|
|
|
|
|
score: number;
|
|
|
|
|
innerHTML: string;
|
|
|
|
|
brCount: number;
|
|
|
|
|
} | null = null;
|
|
|
|
|
let bestCandidate: CaptionCandidate | null = null;
|
|
|
|
|
|
|
|
|
|
// Search all spans for the best caption candidate
|
|
|
|
|
// PRIMARY CRITERIA: Most <br> tags (recipe formatting indicator)
|
|
|
|
|
@@ -629,18 +632,21 @@ export async function extractFromHTMLSection(
|
|
|
|
|
};
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
console.log(`[Extractor] Final caption candidate: score=${bestCandidate.score}, length=${bestCandidate.text.length}`);
|
|
|
|
|
// Explicit type annotation (safe after null guard)
|
|
|
|
|
const candidate: CaptionCandidate = bestCandidate;
|
|
|
|
|
|
|
|
|
|
console.log(`[Extractor] Final caption candidate: score=${candidate.score}, length=${candidate.text.length}`);
|
|
|
|
|
|
|
|
|
|
// Extract text from the best candidate
|
|
|
|
|
// Use innerHTML to preserve <br> tags, which will be converted to newlines in cleanText
|
|
|
|
|
let captionText = bestCandidate.innerHTML;
|
|
|
|
|
let captionText = candidate.innerHTML;
|
|
|
|
|
|
|
|
|
|
return {
|
|
|
|
|
success: true,
|
|
|
|
|
text: captionText,
|
|
|
|
|
score: bestCandidate.score,
|
|
|
|
|
score: candidate.score,
|
|
|
|
|
length: captionText.length,
|
|
|
|
|
htmlPreview: bestCandidate.innerHTML.substring(0, 500)
|
|
|
|
|
htmlPreview: candidate.innerHTML.substring(0, 500)
|
|
|
|
|
};
|
|
|
|
|
}, currentShortcode);
|
|
|
|
|
|
|
|
|
|
@@ -1221,7 +1227,7 @@ export async function extractTextAndThumbnail(
|
|
|
|
|
if (responseUrl.includes('graphql') || responseUrl.includes('api/v1') || responseUrl.includes('/web/')) {
|
|
|
|
|
try {
|
|
|
|
|
const json = await response.json();
|
|
|
|
|
const captionData = extractCaptionFromGraphQL(json, expectedShortcode);
|
|
|
|
|
const captionData = extractCaptionFromGraphQL(json, expectedShortcode ?? undefined);
|
|
|
|
|
if (captionData && captionData.length > 130) {
|
|
|
|
|
interceptedCaption = captionData;
|
|
|
|
|
console.log(`[Extractor] ✓ Intercepted GraphQL with full caption: ${captionData.length} chars (shortcode verified)`);
|
|
|
|
|
|