feat(share): refactor page and enhance thumbnail extraction
- Extract 8 reusable components from monolithic share page
- Add LLM health indicator with 30s polling
- Implement stealth thumbnail extraction with 4-method cascade
- Integrate real-time thumbnail preview component
- Reduce share page from 306 to ~140 lines
- Add comprehensive outcome documentation

Components:
- UrlInputSection: URL input and extraction trigger
- ProgressIndicator: Loading state display
- ExtractedTextViewer: Collapsible text preview
- RecipeCard: Recipe display with Tandoor integration
- ErrorState: Error handling UI
- LogViewer: System logs with color coding
- LlmHealthIndicator: LLM status with polling
- ThumbnailPreview: Real-time thumbnail display

Thumbnail Methods:
1. Meta tag extraction (og:image, twitter:image)
2. Video poster attribute
3. Instagram embedded JSON data
4. Screenshot fallback

Stories Completed:
- Story 1: Component extraction and refactoring
- Story 2: LLM health status indicator
- Story 3: Enhanced stealth thumbnail extraction
- Story 4: Thumbnail preview integration

Closes: RefactorSharePageAndEnhanceThumbnails
This commit is contained in:
@@ -10,7 +10,7 @@ export interface ExtractedContent {
|
||||
|
||||
export type ExtractionMethod = 'embedded-json' | 'dom-selector' | 'graphql-api' | 'legacy';
|
||||
|
||||
export type ProgressEventType = 'status' | 'method' | 'retry' | 'error' | 'complete';
|
||||
export type ProgressEventType = 'status' | 'method' | 'retry' | 'error' | 'thumbnail' | 'complete';
|
||||
|
||||
export interface ProgressEvent {
|
||||
type: ProgressEventType;
|
||||
@@ -221,7 +221,7 @@ async function extractFromEmbeddedJSON(page: Page): Promise<ExtractedContent | n
|
||||
const data: InstagramEmbeddedData = JSON.parse(sharedDataMatch[1]);
|
||||
const result = parseInstagramData(data);
|
||||
if (result) {
|
||||
const thumbnail = await extractThumbnail(page);
|
||||
const thumbnail = await extractThumbnailStealth(page, progressCallback);
|
||||
return { ...result, thumbnail };
|
||||
}
|
||||
} catch (e) {
|
||||
@@ -236,7 +236,7 @@ async function extractFromEmbeddedJSON(page: Page): Promise<ExtractedContent | n
|
||||
const data = JSON.parse(additionalDataMatch[1]);
|
||||
const result = parseInstagramData(data);
|
||||
if (result) {
|
||||
const thumbnail = await extractThumbnail(page);
|
||||
const thumbnail = await extractThumbnailStealth(page, progressCallback);
|
||||
return { ...result, thumbnail };
|
||||
}
|
||||
} catch (e) {
|
||||
@@ -343,7 +343,7 @@ async function extractFromDOM(page: Page): Promise<ExtractedContent | null> {
|
||||
}
|
||||
|
||||
// Extract thumbnail using existing logic
|
||||
const thumbnail = await extractThumbnail(page);
|
||||
const thumbnail = await extractThumbnailStealth(page, progressCallback);
|
||||
|
||||
return {
|
||||
bodyText: cleanText(captionText),
|
||||
@@ -456,7 +456,7 @@ async function extractWithStrategies(
|
||||
name: 'legacy',
|
||||
fn: async () => {
|
||||
const text = await extractCleanTextLegacy(page);
|
||||
const thumbnail = await extractThumbnail(page);
|
||||
const thumbnail = await extractThumbnailStealth(page, progressCallback);
|
||||
return { bodyText: text, thumbnail };
|
||||
}
|
||||
}
|
||||
@@ -572,7 +572,11 @@ export async function extractTextAndThumbnail(
|
||||
/**
|
||||
* Extract thumbnail from video element or take full page screenshot
|
||||
*/
|
||||
async function extractThumbnail(page: Page): Promise<string | null> {
|
||||
/**
|
||||
* Screenshot-based thumbnail extraction (fallback method)
|
||||
* Takes a screenshot of the video element or full page if video not found
|
||||
*/
|
||||
async function extractThumbnailScreenshot(page: Page): Promise<string | null> {
|
||||
const videoBounds = await page.evaluate(() => {
|
||||
const video = document.querySelector('video');
|
||||
if (!video) return null;
|
||||
@@ -594,9 +598,156 @@ async function extractThumbnail(page: Page): Promise<string | null> {
|
||||
clip: videoBounds
|
||||
});
|
||||
} else {
|
||||
console.warn('Video element not found or has no size, taking full page screenshot');
|
||||
console.warn('[Thumbnail] Video element not found or has no size, taking full page screenshot');
|
||||
screenshotBuffer = await page.screenshot({ type: 'jpeg', quality: 85 });
|
||||
}
|
||||
|
||||
return `data:image/jpeg;base64,${screenshotBuffer.toString('base64')}`;
|
||||
}
|
||||
|
||||
/**
 * Helper: Fetch image from URL and convert to base64 data URI.
 *
 * The download is best-effort: every failure mode is reported as `null` so the
 * caller can move on to its next thumbnail source instead of aborting.
 *
 * @param imageUrl - Absolute URL of the image to download.
 * @returns A `data:<content-type>;base64,<payload>` URI, or `null` when the
 *   request fails, times out, returns a non-2xx status, returns a body that is
 *   not declared as an image, or returns an empty body.
 */
async function fetchImageAsBase64(imageUrl: string): Promise<string | null> {
  // Upper bound for the whole download (headers + body) so a stalled
  // connection cannot hang the extraction pipeline indefinitely.
  const timeoutMs = 10000;
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeoutMs);

  try {
    const response = await fetch(imageUrl, { signal: controller.signal });
    if (!response.ok) return null;

    // A missing Content-Type header keeps the historical 'image/jpeg' default.
    const contentType = response.headers.get('content-type') || 'image/jpeg';

    // A 200 response is not necessarily an image (login walls, consent pages,
    // HTML error bodies). Encoding those would produce a broken thumbnail.
    if (!contentType.trim().toLowerCase().startsWith('image/')) {
      console.warn(`[Thumbnail] Ignoring non-image response (${contentType})`);
      return null;
    }

    const buffer = Buffer.from(await response.arrayBuffer());
    // An empty payload would yield a syntactically valid but useless data URI.
    if (buffer.length === 0) return null;

    return `data:${contentType};base64,${buffer.toString('base64')}`;
  } catch (e) {
    // Covers network errors, invalid URLs and the timeout abort.
    console.error('[Thumbnail] Failed to fetch image:', e);
    return null;
  } finally {
    clearTimeout(timer);
  }
}
|
||||
|
||||
/**
 * Extract thumbnail from Instagram post using stealth techniques.
 *
 * Tries multiple methods in order of stealth and returns the first one that
 * yields an image:
 * 1. Meta tags (og:image, then twitter:image)
 * 2. Video poster attribute
 * 3. Instagram window data structures
 * 4. Screenshot fallback
 *
 * Whenever a thumbnail is obtained and `progressCallback` was supplied, a
 * `'thumbnail'` progress event carrying the data URI in `data.thumbnail` is
 * emitted before returning.
 *
 * Failures in methods 1-3 are logged and the next method is tried. Errors from
 * the screenshot fallback are not caught here and propagate to the caller.
 *
 * @param page - Page to read the thumbnail from.
 * @param progressCallback - Optional sink for the `'thumbnail'` progress event.
 * @returns The thumbnail as a data URI, or `null` if the screenshot fallback
 *   returns `null`.
 */
async function extractThumbnailStealth(
  page: Page,
  progressCallback?: ProgressCallback
): Promise<string | null> {
  console.log('[Thumbnail] Starting stealth extraction');

  // Emits the 'thumbnail' progress event (if anyone is listening) and hands the
  // data URI back so call sites can `return report(...)`.
  const report = (thumbnail: string, message: string): string => {
    progressCallback?.({
      type: 'thumbnail',
      message,
      data: { thumbnail },
      timestamp: new Date().toISOString()
    });
    return thumbnail;
  };

  // Reads an attribute from the first matching element WITHOUT waiting for the
  // element to appear. `page.getAttribute(selector, ...)` waits for a match
  // until the default timeout and then throws, which stalled the cascade on
  // every post lacking the tag and skipped the remaining checks of that method.
  const readAttribute = (selector: string, attribute: string): Promise<string | null> =>
    page.evaluate(
      ({ selector: sel, attribute: attr }) => {
        const element = document.querySelector(sel);
        return element ? element.getAttribute(attr) : null;
      },
      { selector, attribute }
    );

  // Attribute values may be relative to the page; the download happens outside
  // the browser, where only absolute URLs can be fetched.
  const toAbsoluteUrl = (rawUrl: string): string => {
    try {
      return new URL(rawUrl, page.url()).href;
    } catch {
      return rawUrl;
    }
  };

  // Methods 1 and 2: attribute-based sources, most stealthy first. Each source
  // is isolated in its own try/catch so one failing lookup cannot prevent the
  // following ones from being attempted.
  const attributeSources = [
    {
      selector: 'meta[property="og:image"]',
      attribute: 'content',
      found: 'Found og:image meta tag',
      failed: 'Meta tag method failed:',
      message: 'Thumbnail extracted from meta tags'
    },
    {
      selector: 'meta[name="twitter:image"]',
      attribute: 'content',
      found: 'Found twitter:image meta tag',
      failed: 'Meta tag method failed:',
      message: 'Thumbnail extracted from meta tags'
    },
    {
      selector: 'video',
      attribute: 'poster',
      found: 'Found video poster attribute',
      failed: 'Video poster method failed:',
      message: 'Thumbnail extracted from video poster'
    }
  ];

  for (const source of attributeSources) {
    try {
      const rawUrl = await readAttribute(source.selector, source.attribute);
      if (!rawUrl) continue;

      console.log(`[Thumbnail] ${source.found}`);
      const dataUri = await fetchImageAsBase64(toAbsoluteUrl(rawUrl));
      if (dataUri) return report(dataUri, source.message);
    } catch (e) {
      console.log(`[Thumbnail] ${source.failed}`, e);
    }
  }

  // Method 3: Try Instagram window data structures
  try {
    const candidate: unknown = await page.evaluate(() => {
      // Check for Instagram's internal data structures
      const data = (window as any).__additionalDataLoaded;
      if (data) {
        for (const key in data) {
          const media = data[key]?.graphql?.shortcode_media;
          if (media?.display_url) return media.display_url;
          if (media?.thumbnail_src) return media.thumbnail_src;
        }
      }
      return null;
    });

    // The page-side value is untyped; only a non-empty string is a usable URL.
    if (typeof candidate === 'string' && candidate) {
      console.log('[Thumbnail] Found thumbnail in Instagram data structures');
      const dataUri = await fetchImageAsBase64(candidate);
      if (dataUri) return report(dataUri, 'Thumbnail extracted from Instagram data');
    }
  } catch (e) {
    console.log('[Thumbnail] Instagram data method failed:', e);
  }

  // Method 4: Screenshot fallback (existing method)
  console.log('[Thumbnail] Falling back to screenshot method');
  const screenshotThumbnail = await extractThumbnailScreenshot(page);
  if (screenshotThumbnail) {
    report(screenshotThumbnail, 'Thumbnail extracted via screenshot');
  }

  return screenshotThumbnail;
}
|
||||
|
||||
Reference in New Issue
Block a user