From b304f5266a2649aafa309047caca173a527bb0ce Mon Sep 17 00:00:00 2001
From: Giancarmine Salucci
Date: Tue, 17 Feb 2026 10:14:52 +0100
Subject: [PATCH] =?UTF-8?q?fix(RECIPE-0006):=20complete=20iteration=200=20?=
 =?UTF-8?q?=E2=80=94=20fix=20Instagram=20recipe=20extraction?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Notes:
    What this patch does
    - cleanText: after the existing pattern removals, strips a trailing run
      of hashtags (ASCII word characters plus the ranges U+00C0-024F,
      U+1E00-1EFF and U+0400-04FF) and trims the result.
    - extractFromDOM: the page.evaluate callback now walks four "article ..."
      selectors in order and returns the trimmed textContent of the first
      selector whose first match holds more than 100 characters. The
      og:description meta tag stays as the last resort; a leading
      "N likes, M comments - USER on DATE: " prefix is removed from it when
      present. The callback still returns null when nothing is found.
    - New vitest spec: calls extractTextAndThumbnail on one Instagram reel
      URL (30000 ms timeout) and asserts on the metadata prefix, trailing
      hashtags and a few expected words. A second test is a skipped
      placeholder.

    Review notes
    - document.querySelector yields only the first match per selector; when
      that element has 100 characters or fewer, other elements matching the
      same selector are never examined.
    - The 100-character check runs on the untrimmed textContent, while the
      returned value is trimmed.
    - The og:description prefix pattern accepts counts such as "123" or
      "12K" only; "1,234" or "1.2M" leave the prefix in place.
    - The hashtag replace is anchored with "$" and has no "m" flag, so the
      "g" flag cannot produce a second match; its chained .trim() is
      repeated by the "return cleaned.trim()" that follows.
    - In the spec, /^\d+\s+likes,/ is already covered by /^\d+K?\s+likes,/.
    - NOTE(review): the console.log calls sit inside the page.evaluate
      callback; presumably they run in the browser page and not in the
      server process - confirm they show up where expected.
    - NOTE(review): the spec appears to need live access to instagram.com
      and to depend on that post's current caption - confirm how it is run
      in CI.
    - NOTE(review): in this copy the From: header carries no address and the
      "): Promise {" context line has no type argument; angle-bracket text
      may have been stripped - confirm against the original commit before
      applying.

 src/lib/server/extraction.ts                  | 39 ++++++++++-----
 .../instagram-caption-extraction.e2e.spec.ts  | 47 +++++++++++++++++++
 2 files changed, 73 insertions(+), 13 deletions(-)
 create mode 100644 src/tests/instagram-caption-extraction.e2e.spec.ts

diff --git a/src/lib/server/extraction.ts b/src/lib/server/extraction.ts
index 9bb34c4..d2b370f 100644
--- a/src/lib/server/extraction.ts
+++ b/src/lib/server/extraction.ts
@@ -199,6 +199,10 @@ function cleanText(text: string): string {
 		cleaned = cleaned.replace(pattern, '');
 	});
 
+	// Remove hashtags from end of text
+	// Pattern: #word #multiple_words (supports international characters)
+	cleaned = cleaned.replace(/(#[\w\u00C0-\u024F\u1E00-\u1EFF\u0400-\u04FF]+\s*)+$/gi, '').trim();
+
 	return cleaned.trim();
 }
 
@@ -322,24 +326,33 @@ async function extractFromDOM(
 	progressCallback?: ProgressCallback
 ): Promise {
 	try {
-		// Strategy: Direct caption selector
 		const captionText = await page.evaluate(() => {
-			// Try h1[dir="auto"] (most reliable for captions)
-			const h1 = document.querySelector('h1[dir="auto"]');
-			if (h1?.textContent) {
-				return h1.textContent.trim();
+			// Try multiple selectors in order of reliability
+			const selectors = [
+				'article h1', // Semantic title element
+				'article span[dir="auto"]', // Caption with dir attribute
+				'article div[role="button"] + span', // Caption after interactive element
+				'article span:not([aria-label])', // Non-labeled spans (likely caption)
+			];
+
+			for (const selector of selectors) {
+				const element = document.querySelector(selector);
+				if (element?.textContent && element.textContent.length > 100) {
+					// Only accept elements with substantial text (not UI labels)
+					console.log(`[Extractor] DOM selector matched: ${selector}`);
+					return element.textContent.trim();
+				}
 			}
 
-			// Try article caption div
-			const captionDiv = document.querySelector('article div._a9zs, article span');
-			if (captionDiv?.textContent) {
-				return captionDiv.textContent.trim();
-			}
-
-			// Try meta tag
+			// Fallback to og:description ONLY if all other methods fail
+			// NOTE: This contains metadata prefix but better than nothing
 			const metaDesc = document.querySelector('meta[property="og:description"]');
 			if (metaDesc) {
-				return metaDesc.getAttribute('content') || '';
+				const content = metaDesc.getAttribute('content') || '';
+				// Try to strip metadata prefix pattern: "X likes, Y comments - username on date: "
+				const cleanedContent = content.replace(/^\d+K?\s+likes,\s+\d+\s+comments\s+-\s+[\w.]+\s+on\s+[^:]+:\s+/, '');
+				console.log('[Extractor] DOM selector fallback: og:description (with metadata cleanup)');
+				return cleanedContent;
 			}
 
 			return null;
diff --git a/src/tests/instagram-caption-extraction.e2e.spec.ts b/src/tests/instagram-caption-extraction.e2e.spec.ts
new file mode 100644
index 0000000..89f6cf0
--- /dev/null
+++ b/src/tests/instagram-caption-extraction.e2e.spec.ts
@@ -0,0 +1,47 @@
+import { describe, it, expect } from 'vitest';
+import { extractTextAndThumbnail } from '$lib/server/extraction';
+
+describe('Instagram Caption Extraction E2E', () => {
+	it('should extract complete recipe without metadata prefix', async () => {
+		const testUrl = 'https://www.instagram.com/reel/DP6oN7JCEo8/?utm_source=ig_web_button_share_sheet';
+
+		const result = await extractTextAndThumbnail(testUrl);
+
+		// Verify extraction succeeded
+		expect(result).toBeDefined();
+		expect(result.bodyText).toBeDefined();
+		expect(result.bodyText.length).toBeGreaterThan(100);
+
+		console.log('[Test] Extracted text length:', result.bodyText.length);
+		console.log('[Test] First 200 chars:', result.bodyText.substring(0, 200));
+
+		// Should NOT contain metadata prefix patterns
+		expect(result.bodyText).not.toMatch(/^\d+K?\s+likes,/);
+		expect(result.bodyText).not.toMatch(/^\d+\s+likes,/);
+		expect(result.bodyText).not.toMatch(/\d+\s+comments/);
+		expect(result.bodyText).not.toMatch(/\w+\s+on\s+\w+\s+\d+/);
+
+		// Should start with recipe title
+		expect(result.bodyText).toMatch(/^La cacio e pepe/i);
+
+		// Should NOT contain hashtags at the end
+		expect(result.bodyText).not.toMatch(/#\w+\s*$/);
+		expect(result.bodyText).not.toContain('#cacioepepe');
+		expect(result.bodyText).not.toContain('#ricettefacili');
+
+		// Should contain ingredients section
+		expect(result.bodyText).toContain('pecorino');
+		expect(result.bodyText).toContain('pepe');
+
+		// Should contain procedure section
+		expect(result.bodyText).toContain('pasta');
+		expect(result.bodyText).toContain('acqua');
+
+		// Should NOT be truncated
+		expect(result.bodyText).not.toContain('...');
+	}, 30000);
+
+	it.skip('should handle invalid Instagram URL gracefully', async () => {
+		// Placeholder for future test
+	});
+});