fix(RECIPE-0006): complete iteration 1 - unit tests for Instagram caption extraction

- Exported cleanText() and extractFromDOM() for unit testing
- Fixed metadata prefix regex to handle optional quotes
- Created comprehensive unit tests with mocked Playwright Page (15 tests, 12ms)
- All 275 tests passing
This commit is contained in:
Giancarmine Salucci
2026-02-17 11:02:59 +01:00
parent b304f5266a
commit 56d3aec3e2
4 changed files with 433 additions and 13 deletions

View File

@@ -183,22 +183,25 @@ function extractShortcode(url: string): string | null {
/**
* Clean extracted text
*/
function cleanText(text: string): string {
// Remove excessive whitespace
let cleaned = text.replace(/\s+/g, ' ').trim();
export function cleanText(text: string): string {
let cleaned = text;
// Remove common UI text patterns
// Remove common UI text patterns BEFORE normalizing whitespace
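// While the original line breaks are still present, `.+` in patterns like "Liked by..." and
// "More posts from..." stops at the end of that UI line instead of consuming the rest of the caption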
const uiPatterns = [
/^\s*More posts from.+$/gim,
/^\s*View all \d+ comments$/gim,
/^\s*Add a comment\.\.\.$/gim,
/^\s*Liked by.+$/gim
/More posts from.+/gi,
/View all \d+ comments/gi,
/Add a comment\.\.\./gi,
/Liked by.+?(?=\n|$)/gi
];
uiPatterns.forEach((pattern) => {
cleaned = cleaned.replace(pattern, '');
});
// Remove excessive whitespace and normalize (after UI pattern removal)
cleaned = cleaned.replace(/\s+/g, ' ').trim();
// Remove hashtags from end of text
// Pattern: #word #multiple_words (supports international characters)
cleaned = cleaned.replace(/(#[\w\u00C0-\u024F\u1E00-\u1EFF\u0400-\u04FF]+\s*)+$/gi, '').trim();
@@ -321,7 +324,7 @@ function extractFromAlternativeStructure(items: any): Omit<ExtractedContent, 'th
/**
* Strategy 2: Extract from DOM using specific selectors
*/
async function extractFromDOM(
export async function extractFromDOM(
page: Page,
progressCallback?: ProgressCallback
): Promise<ExtractedContent | null> {
@@ -350,7 +353,7 @@ async function extractFromDOM(
if (metaDesc) {
const content = metaDesc.getAttribute('content') || '';
// Try to strip metadata prefix pattern: "X likes, Y comments - username on date: "
const cleanedContent = content.replace(/^\d+K?\s+likes,\s+\d+\s+comments\s+-\s+[\w.]+\s+on\s+[^:]+:\s+/, '');
const cleanedContent = content.replace(/^\d+K?\s+likes,\s+\d+\s+comments\s+-\s+[\w.]+\s+on\s+[^:]+:\s*["']?/, '');
console.log('[Extractor] DOM selector fallback: og:description (with metadata cleanup)');
return cleanedContent;
}