Merge branch 'fix/RECIPE-0006_fix_recipe_extraction'

2026-02-17 10:24:58 +01:00
parent b0b5c3579b b304f5266a
commit 33d2a10f8e
2 changed files with 73 additions and 13 deletions
--- a/src/lib/server/extraction.ts
+++ b/src/lib/server/extraction.ts
@@ -199,6 +199,10 @@ function cleanText(text: string): string {
 		cleaned = cleaned.replace(pattern, '');
 	});

+	// Remove hashtags from end of text
+	// Pattern: one or more "#tag" tokens at the very end (ASCII word characters plus accented Latin and Cyrillic letters)
+	cleaned = cleaned.replace(/(#[\w\u00C0-\u024F\u1E00-\u1EFF\u0400-\u04FF]+\s*)+$/gi, '').trim();
+
 	return cleaned.trim();
 }

@@ -322,24 +326,33 @@ async function extractFromDOM(
 	progressCallback?: ProgressCallback
 ): Promise<ExtractedContent | null> {
 	try {
-		// Strategy: Direct caption selector
 		const captionText = await page.evaluate(() => {
-			// Try h1[dir="auto"] (most reliable for captions)
-			const h1 = document.querySelector('h1[dir="auto"]');
-			if (h1?.textContent) {
-				return h1.textContent.trim();
+			// Try multiple selectors in order of reliability
+			const selectors = [
+				'article h1',                          // Semantic title element
+				'article span[dir="auto"]',            // Caption with dir attribute
+				'article div[role="button"] + span',   // Caption after interactive element
+				'article span:not([aria-label])',      // Non-labeled spans (likely caption)
+			];
+
+			for (const selector of selectors) {
+				const element = document.querySelector(selector);
+				if (element?.textContent && element.textContent.length > 100) {
+					// Only accept elements with substantial text (not UI labels)
+					console.log(`[Extractor] DOM selector matched: ${selector}`);
+					return element.textContent.trim();
+				}
 			}

-			// Try article caption div
-			const captionDiv = document.querySelector('article div._a9zs, article span');
-			if (captionDiv?.textContent) {
-				return captionDiv.textContent.trim();
-			}
-
-			// Try meta tag
+			// Fallback to og:description ONLY if all other methods fail
+			// NOTE: The raw content may carry a metadata prefix (stripped below when it matches), but it is better than nothing
 			const metaDesc = document.querySelector('meta[property="og:description"]');
 			if (metaDesc) {
-				return metaDesc.getAttribute('content') || '';
+				const content = metaDesc.getAttribute('content') || '';
+				// Try to strip metadata prefix pattern: "X likes, Y comments - username on date: "
+				const cleanedContent = content.replace(/^\d+K?\s+likes,\s+\d+\s+comments\s+-\s+[\w.]+\s+on\s+[^:]+:\s+/, '');
+				console.log('[Extractor] DOM selector fallback: og:description (with metadata cleanup)');
+				return cleanedContent;
 			}

 			return null;
--- a/src/tests/instagram-caption-extraction.e2e.spec.ts
+++ b/src/tests/instagram-caption-extraction.e2e.spec.ts
@@ -0,0 +1,47 @@
+import { describe, it, expect } from 'vitest';
+import { extractTextAndThumbnail } from '$lib/server/extraction';
+
+describe('Instagram Caption Extraction E2E', () => { // End-to-end checks on extractTextAndThumbnail output for one Instagram reel URL
+	it('should extract complete recipe without metadata prefix', async () => {
+		const testUrl = 'https://www.instagram.com/reel/DP6oN7JCEo8/?utm_source=ig_web_button_share_sheet';
+		
+		const result = await extractTextAndThumbnail(testUrl); // NOTE(review): presumably fetches the live page — confirm
+		
+		// Extraction must yield a result whose bodyText is defined and longer than 100 characters
+		expect(result).toBeDefined();
+		expect(result.bodyText).toBeDefined();
+		expect(result.bodyText.length).toBeGreaterThan(100);
+		
+		console.log('[Test] Extracted text length:', result.bodyText.length);
+		console.log('[Test] First 200 chars:', result.bodyText.substring(0, 200));
+		
+		// No "N likes," prefix at the start, and no "N comments" or "<word> on <word> <digits>" fragment anywhere
+		expect(result.bodyText).not.toMatch(/^\d+K?\s+likes,/);
+		expect(result.bodyText).not.toMatch(/^\d+\s+likes,/);
+		expect(result.bodyText).not.toMatch(/\d+\s+comments/);
+		expect(result.bodyText).not.toMatch(/\w+\s+on\s+\w+\s+\d+/);
+		
+		// Text must begin (case-insensitively) with the recipe title "La cacio e pepe"
+		expect(result.bodyText).toMatch(/^La cacio e pepe/i);
+		
+		// No hashtag at the very end, and neither of these two specific hashtags anywhere in the text
+		expect(result.bodyText).not.toMatch(/#\w+\s*$/);
+		expect(result.bodyText).not.toContain('#cacioepepe');
+		expect(result.bodyText).not.toContain('#ricettefacili');
+		
+		// Ingredient keywords expected somewhere in the text (case-sensitive substring checks)
+		expect(result.bodyText).toContain('pecorino');
+		expect(result.bodyText).toContain('pepe');
+		
+		// Procedure keywords expected somewhere in the text (case-sensitive substring checks)
+		expect(result.bodyText).toContain('pasta');
+		expect(result.bodyText).toContain('acqua');
+		
+		// A literal "..." anywhere in the text is treated as a sign of truncation
+		expect(result.bodyText).not.toContain('...');
+	}, 30000); // per-test timeout in milliseconds
+
+	it.skip('should handle invalid Instagram URL gracefully', async () => {
+		// Placeholder: skipped, contains no assertions yet
+	});
+});