fix(RECIPE-0006): complete iteration 1 - unit tests for Instagram caption extraction

- Exported cleanText() and extractFromDOM() for unit testing
- Fixed metadata prefix regex to handle optional quotes
- Created comprehensive unit tests with mocked Playwright Page (15 tests, 12ms)
- All 275 tests passing
2026-02-17 11:02:59 +01:00
parent b304f5266a
commit 56d3aec3e2
4 changed files with 433 additions and 13 deletions
--- a/src/lib/server/extraction.ts
+++ b/src/lib/server/extraction.ts
@@ -183,22 +183,25 @@ function extractShortcode(url: string): string | null {
 /**
 * Clean extracted text
 */
-function cleanText(text: string): string {
-	// Remove excessive whitespace
-	let cleaned = text.replace(/\s+/g, ' ').trim();
+export function cleanText(text: string): string {
+	let cleaned = text;

-	// Remove common UI text patterns
+	// Remove common UI text patterns BEFORE normalizing whitespace
+	// This way patterns like "Liked by..." and "View all..." can be matched across lines
 	const uiPatterns = [
-		/^\s*More posts from.+$/gim,
-		/^\s*View all \d+ comments$/gim,
-		/^\s*Add a comment\.\.\.$/gim,
-		/^\s*Liked by.+$/gim
+		/More posts from.+/gi,
+		/View all \d+ comments/gi,
+		/Add a comment\.\.\./gi,
+		/Liked by.+?(?=\n|$)/gi
 	];

 	uiPatterns.forEach((pattern) => {
 		cleaned = cleaned.replace(pattern, '');
 	});

+	// Remove excessive whitespace and normalize (after UI pattern removal)
+	cleaned = cleaned.replace(/\s+/g, ' ').trim();
+
 	// Remove hashtags from end of text
 	// Pattern: #word #multiple_words (supports international characters)
 	cleaned = cleaned.replace(/(#[\w\u00C0-\u024F\u1E00-\u1EFF\u0400-\u04FF]+\s*)+$/gi, '').trim();
@@ -321,7 +324,7 @@ function extractFromAlternativeStructure(items: any): Omit<ExtractedContent, 'th
 /**
 * Strategy 2: Extract from DOM using specific selectors
 */
-async function extractFromDOM(
+export async function extractFromDOM(
 	page: Page,
 	progressCallback?: ProgressCallback
 ): Promise<ExtractedContent | null> {
@@ -350,7 +353,7 @@ async function extractFromDOM(
 			if (metaDesc) {
 				const content = metaDesc.getAttribute('content') || '';
 				// Try to strip metadata prefix pattern: "X likes, Y comments - username on date: "
-				const cleanedContent = content.replace(/^\d+K?\s+likes,\s+\d+\s+comments\s+-\s+[\w.]+\s+on\s+[^:]+:\s+/, '');
+				const cleanedContent = content.replace(/^\d+K?\s+likes,\s+\d+\s+comments\s+-\s+[\w.]+\s+on\s+[^:]+:\s*["']?/, '');
 				console.log('[Extractor] DOM selector fallback: og:description (with metadata cleanup)');
 				return cleanedContent;
 			}