feat(extraction): enhance thumbnail URL validation with strict HTTP 200 check

- Implement strict HTTP 200 validation (reject all other status codes)
- Add content-type validation (must be image/*)
- Add 10-second timeout protection with AbortController
- Thread progressCallback through all fetchImageAsBase64 calls
- Add detailed logging for each validation failure scenario
- Report validation failures via SSE progress callbacks

Unit tests:
- Add comprehensive test coverage for all validation scenarios
- Test HTTP status codes (200, 404, 403, 500, etc.)
- Test content-type validation (image/* vs text/html, etc.)
- Test timeout behavior with AbortController
- Test error handling (network errors, DNS, SSL, etc.)
- Test progress callback reporting

Integration tests:
- Add tests for complete extraction flow with URL failures
- Test fallback chain behavior (meta tags → poster → Instagram data → screenshot)
- Test real-world scenarios (redirects, query params, different post types)

Documentation:
- Enhanced JSDoc with validation criteria
- Added examples showing fallback behavior
- Documented all failure scenarios and their handling

All tests passing 
This commit is contained in:
Giancarmine Salucci
2025-12-21 05:33:48 +01:00
parent a04763c1da
commit 767b8a1b37
6 changed files with 1608 additions and 18 deletions

View File

@@ -613,19 +613,122 @@ async function extractThumbnailScreenshot(page: Page): Promise<string | null> {
/**
* Helper: Fetch image from URL and convert to base64 data URI
*
* **Validation Criteria:**
* - HTTP status must be exactly 200 (not 2xx, only 200)
* - Content-Type must start with 'image/' (e.g., image/jpeg, image/png, image/webp)
* - Request must complete within 10 seconds
*
* **Failure Scenarios:**
* - Non-200 status → Returns null, reports status code via progress callback
* - Invalid content-type → Returns null, reports content-type via progress callback
* - Timeout → Returns null, reports timeout via progress callback
* - Network error → Returns null, reports error message via progress callback
*
* **Usage in Fallback Chain:**
* This function is used by `extractThumbnailStealth()` which tries multiple URL sources:
* 1. Meta tags (og:image, twitter:image)
* 2. Video poster attribute
* 3. Instagram data structures (display_url, thumbnail_src)
* 4. Screenshot fallback (always succeeds)
*
* When this function returns null, extraction continues to the next method.
*
* @param imageUrl - The image URL to fetch (must be HTTPS)
* @param progressCallback - Optional callback for progress reporting
* @returns Base64 data URI (data:image/*;base64,...) or null if validation fails
*
* @example
* ```typescript
* const thumbnail = await fetchImageAsBase64(
* 'https://instagram.com/image.jpg',
* (event) => console.log(event.message)
* );
*
* if (thumbnail) {
* // thumbnail is a valid base64 data URI
* console.log(thumbnail.substring(0, 50)); // "data:image/jpeg;base64,/9j/4AAQSkZJRg..."
* } else {
* // URL validation failed, try next method
* }
* ```
*/
async function fetchImageAsBase64(imageUrl: string): Promise<string | null> {
async function fetchImageAsBase64(
imageUrl: string,
progressCallback?: ProgressCallback
): Promise<string | null> {
try {
const response = await fetch(imageUrl);
if (!response.ok) return null;
// Create abort controller for timeout
const controller = new AbortController();
const timeoutId = setTimeout(() => controller.abort(), 10000); // 10s timeout
console.log(`[Thumbnail] Validating URL: ${imageUrl}`);
const response = await fetch(imageUrl, {
signal: controller.signal
});
clearTimeout(timeoutId);
// Strict status validation: must be exactly 200
if (response.status !== 200) {
console.warn(`[Thumbnail] URL validation failed: HTTP ${response.status} for ${imageUrl}`);
progressCallback?.({
type: 'status',
message: `Thumbnail URL returned HTTP ${response.status}, trying next method...`,
timestamp: new Date().toISOString()
});
return null;
}
// Validate content-type
const contentType = response.headers.get('content-type') || '';
if (!contentType.startsWith('image/')) {
console.warn(
`[Thumbnail] URL validation failed: Invalid content-type '${contentType}' for ${imageUrl}`
);
progressCallback?.({
type: 'status',
message: `Thumbnail URL returned non-image content (${contentType}), trying next method...`,
timestamp: new Date().toISOString()
});
return null;
}
console.log(`[Thumbnail] URL validation successful: ${imageUrl} (${contentType})`);
const arrayBuffer = await response.arrayBuffer();
const buffer = Buffer.from(arrayBuffer);
const contentType = response.headers.get('content-type') || 'image/jpeg';
return `data:${contentType};base64,${buffer.toString('base64')}`;
const base64Data = `data:${contentType};base64,${buffer.toString('base64')}`;
progressCallback?.({
type: 'status',
message: 'Thumbnail fetched and validated from URL',
timestamp: new Date().toISOString()
});
return base64Data;
} catch (e) {
console.error('[Thumbnail] Failed to fetch image:', e);
if (e instanceof Error) {
if (e.name === 'AbortError') {
console.error(`[Thumbnail] URL fetch timeout: ${imageUrl}`);
progressCallback?.({
type: 'status',
message: 'Thumbnail URL fetch timeout, trying next method...',
timestamp: new Date().toISOString()
});
} else {
console.error(`[Thumbnail] Failed to fetch image from ${imageUrl}:`, e.message);
progressCallback?.({
type: 'status',
message: `Thumbnail URL fetch failed (${e.message}), trying next method...`,
timestamp: new Date().toISOString()
});
}
} else {
console.error('[Thumbnail] Failed to fetch image:', e);
}
return null;
}
}
@@ -658,7 +761,7 @@ async function extractThumbnailStealth(
const ogImage = await page.getAttribute('meta[property="og:image"]', 'content');
if (ogImage) {
console.log('[Thumbnail] Found og:image meta tag');
const imageBuffer = await fetchImageAsBase64(ogImage);
const imageBuffer = await fetchImageAsBase64(ogImage, progressCallback);
if (imageBuffer) {
if (progressCallback) {
progressCallback({
@@ -675,7 +778,7 @@ async function extractThumbnailStealth(
const twitterImage = await page.getAttribute('meta[name="twitter:image"]', 'content');
if (twitterImage) {
console.log('[Thumbnail] Found twitter:image meta tag');
const imageBuffer = await fetchImageAsBase64(twitterImage);
const imageBuffer = await fetchImageAsBase64(twitterImage, progressCallback);
if (imageBuffer) {
if (progressCallback) {
progressCallback({
@@ -697,7 +800,7 @@ async function extractThumbnailStealth(
const poster = await page.getAttribute('video', 'poster');
if (poster) {
console.log('[Thumbnail] Found video poster attribute');
const imageBuffer = await fetchImageAsBase64(poster);
const imageBuffer = await fetchImageAsBase64(poster, progressCallback);
if (imageBuffer) {
if (progressCallback) {
progressCallback({
@@ -736,7 +839,7 @@ async function extractThumbnailStealth(
if (thumbnailUrl) {
console.log('[Thumbnail] Found thumbnail in Instagram data structures');
const imageBuffer = await fetchImageAsBase64(thumbnailUrl);
const imageBuffer = await fetchImageAsBase64(thumbnailUrl, progressCallback);
if (imageBuffer) {
if (progressCallback) {
progressCallback({