fix(RECIPE-0006): complete iteration 1 - unit tests for Instagram caption extraction
- Exported cleanText() and extractFromDOM() for unit testing
- Fixed metadata prefix regex to handle optional quotes
- Created comprehensive unit tests with mocked Playwright Page (15 tests, 12ms)
- All 275 tests passing
This commit is contained in:
@@ -183,22 +183,25 @@ function extractShortcode(url: string): string | null {
|
||||
/**
|
||||
* Clean extracted text
|
||||
*/
|
||||
function cleanText(text: string): string {
|
||||
// Remove excessive whitespace
|
||||
let cleaned = text.replace(/\s+/g, ' ').trim();
|
||||
export function cleanText(text: string): string {
|
||||
let cleaned = text;
|
||||
|
||||
// Remove common UI text patterns
|
||||
// Remove common UI text patterns BEFORE normalizing whitespace
|
||||
// This way patterns like "Liked by..." and "View all..." can be matched across lines
|
||||
const uiPatterns = [
|
||||
/^\s*More posts from.+$/gim,
|
||||
/^\s*View all \d+ comments$/gim,
|
||||
/^\s*Add a comment\.\.\.$/gim,
|
||||
/^\s*Liked by.+$/gim
|
||||
/More posts from.+/gi,
|
||||
/View all \d+ comments/gi,
|
||||
/Add a comment\.\.\./gi,
|
||||
/Liked by.+?(?=\n|$)/gi
|
||||
];
|
||||
|
||||
uiPatterns.forEach((pattern) => {
|
||||
cleaned = cleaned.replace(pattern, '');
|
||||
});
|
||||
|
||||
// Remove excessive whitespace and normalize (after UI pattern removal)
|
||||
cleaned = cleaned.replace(/\s+/g, ' ').trim();
|
||||
|
||||
// Remove hashtags from end of text
|
||||
// Pattern: #word #multiple_words (supports international characters)
|
||||
cleaned = cleaned.replace(/(#[\w\u00C0-\u024F\u1E00-\u1EFF\u0400-\u04FF]+\s*)+$/gi, '').trim();
|
||||
@@ -321,7 +324,7 @@ function extractFromAlternativeStructure(items: any): Omit<ExtractedContent, 'th
|
||||
/**
|
||||
* Strategy 2: Extract from DOM using specific selectors
|
||||
*/
|
||||
async function extractFromDOM(
|
||||
export async function extractFromDOM(
|
||||
page: Page,
|
||||
progressCallback?: ProgressCallback
|
||||
): Promise<ExtractedContent | null> {
|
||||
@@ -350,7 +353,7 @@ async function extractFromDOM(
|
||||
if (metaDesc) {
|
||||
const content = metaDesc.getAttribute('content') || '';
|
||||
// Try to strip metadata prefix pattern: "X likes, Y comments - username on date: "
|
||||
const cleanedContent = content.replace(/^\d+K?\s+likes,\s+\d+\s+comments\s+-\s+[\w.]+\s+on\s+[^:]+:\s+/, '');
|
||||
const cleanedContent = content.replace(/^\d+K?\s+likes,\s+\d+\s+comments\s+-\s+[\w.]+\s+on\s+[^:]+:\s*["']?/, '');
|
||||
console.log('[Extractor] DOM selector fallback: og:description (with metadata cleanup)');
|
||||
return cleanedContent;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user