feat(extraction): enhance thumbnail URL validation with strict HTTP 200 check
- Implement strict HTTP 200 validation (reject all other status codes)
- Add content-type validation (must be image/*)
- Add 10-second timeout protection with AbortController
- Thread progressCallback through all fetchImageAsBase64 calls
- Add detailed logging for each validation failure scenario
- Report validation failures via SSE progress callbacks
Unit tests:
- Add comprehensive test coverage for all validation scenarios
- Test HTTP status codes (200, 404, 403, 500, etc.)
- Test content-type validation (image/* vs text/html, etc.)
- Test timeout behavior with AbortController
- Test error handling (network errors, DNS, SSL, etc.)
- Test progress callback reporting
Integration tests:
- Add tests for complete extraction flow with URL failures
- Test fallback chain behavior (meta tags → poster → Instagram data → screenshot)
- Test real-world scenarios (redirects, query params, different post types)
Documentation:
- Enhance JSDoc with validation criteria
- Add examples showing fallback behavior
- Document all failure scenarios and their handling
All tests passing ✅
This commit is contained in:
@@ -613,19 +613,122 @@ async function extractThumbnailScreenshot(page: Page): Promise<string | null> {
|
||||
|
||||
/**
 * Helper: Fetch image from URL and convert to base64 data URI
 *
 * **Validation Criteria:**
 * - HTTP status must be exactly 200 (not 2xx, only 200)
 * - Content-Type must start with 'image/' (e.g., image/jpeg, image/png, image/webp)
 * - The whole request (headers AND body download) must complete within 10 seconds
 *
 * **Failure Scenarios:**
 * - Non-200 status → Returns null, reports status code via progress callback
 * - Invalid content-type → Returns null, reports content-type via progress callback
 * - Timeout → Returns null, reports timeout via progress callback
 * - Network error → Returns null, reports error message via progress callback
 *
 * This function never throws: every failure is logged and mapped to `null`.
 *
 * **Usage in Fallback Chain:**
 * This function is used by `extractThumbnailStealth()` which tries multiple URL sources:
 * 1. Meta tags (og:image, twitter:image)
 * 2. Video poster attribute
 * 3. Instagram data structures (display_url, thumbnail_src)
 * 4. Screenshot fallback
 *
 * When this function returns null, extraction continues to the next method.
 *
 * @param imageUrl - The image URL to fetch. The URL scheme is not checked here;
 *   the value is passed to `fetch` as-is.
 * @param progressCallback - Optional callback for progress reporting
 * @returns Base64 data URI (data:image/*;base64,...) or null if validation fails
 *
 * @example
 * ```typescript
 * const thumbnail = await fetchImageAsBase64(
 *   'https://instagram.com/image.jpg',
 *   (event) => console.log(event.message)
 * );
 *
 * if (thumbnail) {
 *   // thumbnail is a valid base64 data URI
 *   console.log(thumbnail.substring(0, 50)); // "data:image/jpeg;base64,/9j/4AAQSkZJRg..."
 * } else {
 *   // URL validation failed, try next method
 * }
 * ```
 */
async function fetchImageAsBase64(
  imageUrl: string,
  progressCallback?: ProgressCallback
): Promise<string | null> {
  // Create abort controller for timeout. The timer is armed before the request
  // starts and is only cleared in `finally`, so it bounds the body download as
  // well as the wait for response headers.
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), 10000); // 10s timeout

  try {
    console.log(`[Thumbnail] Validating URL: ${imageUrl}`);

    const response = await fetch(imageUrl, {
      signal: controller.signal
    });

    // Strict status validation: must be exactly 200
    if (response.status !== 200) {
      console.warn(`[Thumbnail] URL validation failed: HTTP ${response.status} for ${imageUrl}`);
      progressCallback?.({
        type: 'status',
        message: `Thumbnail URL returned HTTP ${response.status}, trying next method...`,
        timestamp: new Date().toISOString()
      });
      return null;
    }

    // Validate content-type (a missing header is treated as non-image)
    const contentType = response.headers.get('content-type') || '';
    if (!contentType.startsWith('image/')) {
      console.warn(
        `[Thumbnail] URL validation failed: Invalid content-type '${contentType}' for ${imageUrl}`
      );
      progressCallback?.({
        type: 'status',
        message: `Thumbnail URL returned non-image content (${contentType}), trying next method...`,
        timestamp: new Date().toISOString()
      });
      return null;
    }

    console.log(`[Thumbnail] URL validation successful: ${imageUrl} (${contentType})`);

    // Reading the body can still be aborted by the timeout above; the resulting
    // AbortError is handled by the catch block like any other timeout.
    const arrayBuffer = await response.arrayBuffer();
    const buffer = Buffer.from(arrayBuffer);
    const base64Data = `data:${contentType};base64,${buffer.toString('base64')}`;

    progressCallback?.({
      type: 'status',
      message: 'Thumbnail fetched and validated from URL',
      timestamp: new Date().toISOString()
    });

    return base64Data;
  } catch (e) {
    if (e instanceof Error) {
      if (e.name === 'AbortError') {
        // Raised when controller.abort() fires, i.e. the 10s budget ran out
        console.error(`[Thumbnail] URL fetch timeout: ${imageUrl}`);
        progressCallback?.({
          type: 'status',
          message: 'Thumbnail URL fetch timeout, trying next method...',
          timestamp: new Date().toISOString()
        });
      } else {
        console.error(`[Thumbnail] Failed to fetch image from ${imageUrl}:`, e.message);
        progressCallback?.({
          type: 'status',
          message: `Thumbnail URL fetch failed (${e.message}), trying next method...`,
          timestamp: new Date().toISOString()
        });
      }
    } else {
      // Non-Error throwables are unusual but still reported, so the progress
      // stream never goes silent on a failure.
      console.error('[Thumbnail] Failed to fetch image:', e);
      progressCallback?.({
        type: 'status',
        message: `Thumbnail URL fetch failed (${String(e)}), trying next method...`,
        timestamp: new Date().toISOString()
      });
    }
    return null;
  } finally {
    // Always release the timer, including when fetch itself rejects; otherwise
    // a pending 10s timer would outlive every failed request.
    clearTimeout(timeoutId);
  }
}
|
||||
@@ -658,7 +761,7 @@ async function extractThumbnailStealth(
|
||||
const ogImage = await page.getAttribute('meta[property="og:image"]', 'content');
|
||||
if (ogImage) {
|
||||
console.log('[Thumbnail] Found og:image meta tag');
|
||||
const imageBuffer = await fetchImageAsBase64(ogImage);
|
||||
const imageBuffer = await fetchImageAsBase64(ogImage, progressCallback);
|
||||
if (imageBuffer) {
|
||||
if (progressCallback) {
|
||||
progressCallback({
|
||||
@@ -675,7 +778,7 @@ async function extractThumbnailStealth(
|
||||
const twitterImage = await page.getAttribute('meta[name="twitter:image"]', 'content');
|
||||
if (twitterImage) {
|
||||
console.log('[Thumbnail] Found twitter:image meta tag');
|
||||
const imageBuffer = await fetchImageAsBase64(twitterImage);
|
||||
const imageBuffer = await fetchImageAsBase64(twitterImage, progressCallback);
|
||||
if (imageBuffer) {
|
||||
if (progressCallback) {
|
||||
progressCallback({
|
||||
@@ -697,7 +800,7 @@ async function extractThumbnailStealth(
|
||||
const poster = await page.getAttribute('video', 'poster');
|
||||
if (poster) {
|
||||
console.log('[Thumbnail] Found video poster attribute');
|
||||
const imageBuffer = await fetchImageAsBase64(poster);
|
||||
const imageBuffer = await fetchImageAsBase64(poster, progressCallback);
|
||||
if (imageBuffer) {
|
||||
if (progressCallback) {
|
||||
progressCallback({
|
||||
@@ -736,7 +839,7 @@ async function extractThumbnailStealth(
|
||||
|
||||
if (thumbnailUrl) {
|
||||
console.log('[Thumbnail] Found thumbnail in Instagram data structures');
|
||||
const imageBuffer = await fetchImageAsBase64(thumbnailUrl);
|
||||
const imageBuffer = await fetchImageAsBase64(thumbnailUrl, progressCallback);
|
||||
if (imageBuffer) {
|
||||
if (progressCallback) {
|
||||
progressCallback({
|
||||
|
||||
Reference in New Issue
Block a user