Merge branch 'fix/RECIPE-0006_fix_recipe_extraction'
This commit is contained in:
@@ -199,6 +199,10 @@ function cleanText(text: string): string {
|
||||
cleaned = cleaned.replace(pattern, '');
|
||||
});
|
||||
|
||||
// Remove hashtags from end of text
|
||||
// Pattern: #word #multiple_words (supports international characters)
|
||||
cleaned = cleaned.replace(/(#[\w\u00C0-\u024F\u1E00-\u1EFF\u0400-\u04FF]+\s*)+$/gi, '').trim();
|
||||
|
||||
return cleaned.trim();
|
||||
}
|
||||
|
||||
@@ -322,24 +326,33 @@ async function extractFromDOM(
|
||||
progressCallback?: ProgressCallback
|
||||
): Promise<ExtractedContent | null> {
|
||||
try {
|
||||
// Strategy: Direct caption selector
|
||||
const captionText = await page.evaluate(() => {
|
||||
// Try h1[dir="auto"] (most reliable for captions)
|
||||
const h1 = document.querySelector('h1[dir="auto"]');
|
||||
if (h1?.textContent) {
|
||||
return h1.textContent.trim();
|
||||
// Try multiple selectors in order of reliability
|
||||
const selectors = [
|
||||
'article h1', // Semantic title element
|
||||
'article span[dir="auto"]', // Caption with dir attribute
|
||||
'article div[role="button"] + span', // Caption after interactive element
|
||||
'article span:not([aria-label])', // Non-labeled spans (likely caption)
|
||||
];
|
||||
|
||||
for (const selector of selectors) {
|
||||
const element = document.querySelector(selector);
|
||||
if (element?.textContent && element.textContent.length > 100) {
|
||||
// Only accept elements with substantial text (not UI labels)
|
||||
console.log(`[Extractor] DOM selector matched: ${selector}`);
|
||||
return element.textContent.trim();
|
||||
}
|
||||
}
|
||||
|
||||
// Try article caption div
|
||||
const captionDiv = document.querySelector('article div._a9zs, article span');
|
||||
if (captionDiv?.textContent) {
|
||||
return captionDiv.textContent.trim();
|
||||
}
|
||||
|
||||
// Try meta tag
|
||||
// Fallback to og:description ONLY if all other methods fail
|
||||
// NOTE: This contains metadata prefix but better than nothing
|
||||
const metaDesc = document.querySelector('meta[property="og:description"]');
|
||||
if (metaDesc) {
|
||||
return metaDesc.getAttribute('content') || '';
|
||||
const content = metaDesc.getAttribute('content') || '';
|
||||
// Try to strip metadata prefix pattern: "X likes, Y comments - username on date: "
|
||||
const cleanedContent = content.replace(/^\d+K?\s+likes,\s+\d+\s+comments\s+-\s+[\w.]+\s+on\s+[^:]+:\s+/, '');
|
||||
console.log('[Extractor] DOM selector fallback: og:description (with metadata cleanup)');
|
||||
return cleanedContent;
|
||||
}
|
||||
|
||||
return null;
|
||||
|
||||
47
src/tests/instagram-caption-extraction.e2e.spec.ts
Normal file
47
src/tests/instagram-caption-extraction.e2e.spec.ts
Normal file
@@ -0,0 +1,47 @@
|
||||
import { describe, it, expect } from 'vitest';
|
||||
import { extractTextAndThumbnail } from '$lib/server/extraction';
|
||||
|
||||
/**
 * End-to-end check of caption extraction for a single Instagram reel.
 *
 * The suite calls `extractTextAndThumbnail` with an instagram.com URL and
 * verifies that the returned `bodyText` is the bare recipe caption: no
 * "N likes, M comments - user on date:" metadata prefix, no trailing hashtags,
 * and no truncation marker.
 *
 * NOTE(review): the URL points at a live post, so this presumably requires
 * network access and depends on that post keeping its current caption —
 * confirm before enabling it in CI.
 */
describe('Instagram Caption Extraction E2E', () => {
  // Leading "<count> likes," metadata. Accepts plain counts ("12"), grouped or
  // decimal counts ("1,234", "12.5") and K/M suffixes, so it covers everything
  // the previous pair of `\d+K?` / `\d+` assertions covered and more.
  const likesPrefixPattern = /^[\d.,]+[KkMm]?\s+likes,/;

  // Hashtag at the very end of the text. The character class mirrors the ranges
  // used by the extractor's hashtag-stripping regex (Latin extended and
  // Cyrillic), so a leftover tag such as "#caffè" is detected as well.
  const trailingHashtagPattern = /#[\w\u00C0-\u024F\u1E00-\u1EFF\u0400-\u04FF]+\s*$/;

  it('should extract complete recipe without metadata prefix', async () => {
    const testUrl =
      'https://www.instagram.com/reel/DP6oN7JCEo8/?utm_source=ig_web_button_share_sheet';

    const result = await extractTextAndThumbnail(testUrl);

    // Verify extraction succeeded
    expect(result).toBeDefined();
    expect(result.bodyText).toBeDefined();
    expect(result.bodyText.length).toBeGreaterThan(100);

    console.log('[Test] Extracted text length:', result.bodyText.length);
    console.log('[Test] First 200 chars:', result.bodyText.substring(0, 200));

    // Should NOT contain metadata prefix patterns
    expect(result.bodyText).not.toMatch(likesPrefixPattern);
    expect(result.bodyText).not.toMatch(/\d+\s+comments/);
    expect(result.bodyText).not.toMatch(/\w+\s+on\s+\w+\s+\d+/);

    // Should start with recipe title
    expect(result.bodyText).toMatch(/^La cacio e pepe/i);

    // Should NOT contain hashtags at the end
    expect(result.bodyText).not.toMatch(trailingHashtagPattern);
    expect(result.bodyText).not.toContain('#cacioepepe');
    expect(result.bodyText).not.toContain('#ricettefacili');

    // Should contain ingredients section
    expect(result.bodyText).toContain('pecorino');
    expect(result.bodyText).toContain('pepe');

    // Should contain procedure section
    expect(result.bodyText).toContain('pasta');
    expect(result.bodyText).toContain('acqua');

    // Should NOT be truncated
    expect(result.bodyText).not.toContain('...');
  }, 30000);

  // Placeholder for a future test; `todo` reports it as pending without
  // needing an empty test body.
  it.todo('should handle invalid Instagram URL gracefully');
});
|
||||
Reference in New Issue
Block a user