feat(share): refactor page and enhance thumbnail extraction

- Extract 8 reusable components from monolithic share page
- Add LLM health indicator with 30s polling
- Implement stealth thumbnail extraction with 4-method cascade
- Integrate real-time thumbnail preview component
- Reduce share page from 306 to ~140 lines
- Add comprehensive outcome documentation

Components:
- UrlInputSection: URL input and extraction trigger
- ProgressIndicator: Loading state display
- ExtractedTextViewer: Collapsible text preview
- RecipeCard: Recipe display with Tandoor integration
- ErrorState: Error handling UI
- LogViewer: System logs with color coding
- LlmHealthIndicator: LLM status with polling
- ThumbnailPreview: Real-time thumbnail display

Thumbnail Methods:
1. Meta tag extraction (og:image, twitter:image)
2. Video poster attribute
3. Instagram embedded JSON data
4. Screenshot fallback

Stories Completed:
- Story 1: Component extraction and refactoring
- Story 2: LLM health status indicator
- Story 3: Enhanced stealth thumbnail extraction
- Story 4: Thumbnail preview integration

Closes: RefactorSharePageAndEnhanceThumbnails
This commit is contained in:
Giancarmine Salucci
2025-12-21 04:18:38 +01:00
parent 44823c365f
commit 7e4d82de8d
13 changed files with 1890 additions and 310 deletions

View File

@@ -10,7 +10,7 @@ export interface ExtractedContent {
export type ExtractionMethod = 'embedded-json' | 'dom-selector' | 'graphql-api' | 'legacy';
export type ProgressEventType = 'status' | 'method' | 'retry' | 'error' | 'complete';
export type ProgressEventType = 'status' | 'method' | 'retry' | 'error' | 'thumbnail' | 'complete';
export interface ProgressEvent {
type: ProgressEventType;
@@ -221,7 +221,7 @@ async function extractFromEmbeddedJSON(page: Page): Promise<ExtractedContent | n
const data: InstagramEmbeddedData = JSON.parse(sharedDataMatch[1]);
const result = parseInstagramData(data);
if (result) {
const thumbnail = await extractThumbnail(page);
const thumbnail = await extractThumbnailStealth(page, progressCallback);
return { ...result, thumbnail };
}
} catch (e) {
@@ -236,7 +236,7 @@ async function extractFromEmbeddedJSON(page: Page): Promise<ExtractedContent | n
const data = JSON.parse(additionalDataMatch[1]);
const result = parseInstagramData(data);
if (result) {
const thumbnail = await extractThumbnail(page);
const thumbnail = await extractThumbnailStealth(page, progressCallback);
return { ...result, thumbnail };
}
} catch (e) {
@@ -343,7 +343,7 @@ async function extractFromDOM(page: Page): Promise<ExtractedContent | null> {
}
// Extract thumbnail via the stealth cascade (falls back to a screenshot)
const thumbnail = await extractThumbnail(page);
const thumbnail = await extractThumbnailStealth(page, progressCallback);
return {
bodyText: cleanText(captionText),
@@ -456,7 +456,7 @@ async function extractWithStrategies(
name: 'legacy',
fn: async () => {
const text = await extractCleanTextLegacy(page);
const thumbnail = await extractThumbnail(page);
const thumbnail = await extractThumbnailStealth(page, progressCallback);
return { bodyText: text, thumbnail };
}
}
@@ -572,7 +572,11 @@ export async function extractTextAndThumbnail(
/**
* Extract thumbnail from video element or take full page screenshot
*/
async function extractThumbnail(page: Page): Promise<string | null> {
/**
* Screenshot-based thumbnail extraction (fallback method)
* Takes a screenshot of the video element or full page if video not found
*/
async function extractThumbnailScreenshot(page: Page): Promise<string | null> {
const videoBounds = await page.evaluate(() => {
const video = document.querySelector('video');
if (!video) return null;
@@ -594,9 +598,156 @@ async function extractThumbnail(page: Page): Promise<string | null> {
clip: videoBounds
});
} else {
console.warn('Video element not found or has no size, taking full page screenshot');
console.warn('[Thumbnail] Video element not found or has no size, taking full page screenshot');
screenshotBuffer = await page.screenshot({ type: 'jpeg', quality: 85 });
}
return `data:image/jpeg;base64,${screenshotBuffer.toString('base64')}`;
}
/**
 * Helper: download an image and convert it to a base64 `data:` URI.
 *
 * Best-effort: every failure path returns `null` instead of throwing, so a
 * caller can simply move on to its next thumbnail source.
 *
 * @param imageUrl - URL of the image to download.
 * @returns A `data:<mime>;base64,<payload>` string, or `null` when the URL is
 *   empty, the request fails / times out / is not 2xx, the response is not an
 *   image, or the body is empty.
 */
async function fetchImageAsBase64(imageUrl: string): Promise<string | null> {
  // Upper bound for the whole download (headers + body). Without it a stalled
  // connection would block the extraction indefinitely.
  const timeoutMs = 10000;

  if (!imageUrl) return null;

  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeoutMs);

  try {
    const response = await fetch(imageUrl, { signal: controller.signal });
    if (!response.ok) return null;

    // `||` (not `??`) on purpose: an empty header value is treated like a
    // missing one and falls back to the historical 'image/jpeg' default.
    const rawContentType = response.headers.get('content-type') || 'image/jpeg';

    // Keep only the media type; parameters such as "; charset=..." do not
    // belong in the data URI we build below.
    const semicolonAt = rawContentType.indexOf(';');
    const mimeType = (semicolonAt === -1 ? rawContentType : rawContentType.slice(0, semicolonAt))
      .trim()
      .toLowerCase();

    // A 2xx response is not necessarily an image (e.g. an HTML login or error
    // page). Reject it so it is never presented as a thumbnail.
    if (!mimeType.startsWith('image/')) {
      console.warn(`[Thumbnail] Ignoring non-image response (${mimeType})`);
      return null;
    }

    const buffer = Buffer.from(await response.arrayBuffer());
    if (buffer.length === 0) return null;

    return `data:${mimeType};base64,${buffer.toString('base64')}`;
  } catch (e) {
    // Covers network errors, invalid URLs and the timeout abort.
    console.error('[Thumbnail] Failed to fetch image:', e);
    return null;
  } finally {
    clearTimeout(timer);
  }
}
/**
 * Read `attribute` from the first element matching `selector`, without waiting.
 *
 * Uses an in-page `querySelector` so that a missing element yields `null`
 * immediately instead of blocking until a selector timeout expires.
 *
 * @returns The attribute value, or `null` if the element or attribute is absent.
 */
async function readAttributeIfPresent(
  page: Page,
  selector: string,
  attribute: string
): Promise<string | null> {
  return page.evaluate(
    (query: { sel: string; attr: string }): string | null =>
      document.querySelector(query.sel)?.getAttribute(query.attr) ?? null,
    { sel: selector, attr: attribute }
  );
}

/**
 * Emit a `thumbnail` progress event, if a callback was supplied.
 *
 * Errors thrown by the callback are logged and swallowed: progress reporting
 * must not cause an already-extracted thumbnail to be discarded.
 */
function reportThumbnailProgress(
  progressCallback: ProgressCallback | undefined,
  message: string,
  thumbnail: string
): void {
  if (!progressCallback) return;
  try {
    progressCallback({
      type: 'thumbnail',
      message,
      data: { thumbnail },
      timestamp: new Date().toISOString()
    });
  } catch (e) {
    console.warn('[Thumbnail] Progress callback threw; continuing:', e);
  }
}

/**
 * Extract thumbnail from Instagram post using stealth techniques.
 * Tries multiple methods in order of stealth and returns the first success:
 * 1. Meta tags (og:image, then twitter:image)
 * 2. Video poster attribute
 * 3. Instagram window data structures (`window.__additionalDataLoaded`)
 * 4. Screenshot fallback (`extractThumbnailScreenshot`)
 *
 * Each URL-based source is tried independently: a missing element or a failure
 * in one source only moves on to the next one. A `thumbnail` progress event is
 * emitted for whichever source succeeds.
 *
 * NOTE(review): assumes `page` is a Playwright `Page` whose `evaluate` accepts
 * a single serializable argument — confirm against the file's imports.
 *
 * @param page - Page already navigated to the post.
 * @param progressCallback - Optional listener notified with the extracted thumbnail.
 * @returns A base64 `data:` URI, or whatever the screenshot fallback returns
 *   (possibly `null`) when no URL-based source produced an image.
 */
async function extractThumbnailStealth(
  page: Page,
  progressCallback?: ProgressCallback
): Promise<string | null> {
  console.log('[Thumbnail] Starting stealth extraction');

  // Ordered from most to least stealthy. `locate` resolves to an image URL or
  // null when that source has nothing to offer.
  const urlSources: Array<{
    foundLog: string;
    failedLog: string;
    successMessage: string;
    locate: () => Promise<string | null>;
  }> = [
    {
      foundLog: '[Thumbnail] Found og:image meta tag',
      failedLog: '[Thumbnail] Meta tag method failed:',
      successMessage: 'Thumbnail extracted from meta tags',
      locate: () => readAttributeIfPresent(page, 'meta[property="og:image"]', 'content')
    },
    {
      foundLog: '[Thumbnail] Found twitter:image meta tag',
      failedLog: '[Thumbnail] Meta tag method failed:',
      successMessage: 'Thumbnail extracted from meta tags',
      locate: () => readAttributeIfPresent(page, 'meta[name="twitter:image"]', 'content')
    },
    {
      foundLog: '[Thumbnail] Found video poster attribute',
      failedLog: '[Thumbnail] Video poster method failed:',
      successMessage: 'Thumbnail extracted from video poster',
      locate: () => readAttributeIfPresent(page, 'video', 'poster')
    },
    {
      foundLog: '[Thumbnail] Found thumbnail in Instagram data structures',
      failedLog: '[Thumbnail] Instagram data method failed:',
      successMessage: 'Thumbnail extracted from Instagram data',
      locate: () =>
        page.evaluate((): string | null => {
          // Check for Instagram's internal data structures
          const data = (window as any).__additionalDataLoaded;
          if (!data) return null;
          for (const item of Object.values<any>(data)) {
            const media = item?.graphql?.shortcode_media;
            // Prefer the full-size image, then the smaller thumbnail.
            const url = media?.display_url || media?.thumbnail_src;
            if (url) return url;
          }
          return null;
        })
    }
  ];

  for (const source of urlSources) {
    try {
      const imageUrl = await source.locate();
      if (!imageUrl) continue;

      console.log(source.foundLog);
      const dataUri = await fetchImageAsBase64(imageUrl);
      if (dataUri) {
        reportThumbnailProgress(progressCallback, source.successMessage, dataUri);
        return dataUri;
      }
    } catch (e) {
      // Isolated per source so one failure never skips the remaining ones.
      console.log(source.failedLog, e);
    }
  }

  // Method 4: Screenshot fallback (existing method)
  console.log('[Thumbnail] Falling back to screenshot method');
  const screenshotThumbnail = await extractThumbnailScreenshot(page);
  if (screenshotThumbnail) {
    reportThumbnailProgress(
      progressCallback,
      'Thumbnail extracted via screenshot',
      screenshotThumbnail
    );
  }
  return screenshotThumbnail;
}