fix(extraction): resolve progressCallback undefined errors

- Add progressCallback parameter to extractFromEmbeddedJSON and extractFromDOM
- Pass onProgress callback from extractWithStrategies to all strategies
- Fix legacy strategy to use correct callback variable name
- Verify that extractViaGraphQL correctly returns a null thumbnail

This fixes the ReferenceError that was preventing all extraction methods from working.
All extraction strategies now properly emit thumbnail progress events via SSE.

Closes: FixProgressCallbackUndefinedErrors
This commit is contained in:
Giancarmine Salucci
2025-12-21 04:28:07 +01:00
parent 7e4d82de8d
commit 2de5567682
3 changed files with 497 additions and 5 deletions

View File

@@ -204,7 +204,10 @@ function cleanText(text: string): string {
/**
* Strategy 1: Extract from embedded JSON data in script tags
*/
async function extractFromEmbeddedJSON(page: Page): Promise<ExtractedContent | null> {
async function extractFromEmbeddedJSON(
page: Page,
progressCallback?: ProgressCallback
): Promise<ExtractedContent | null> {
try {
// Extract all script tag contents
const scriptContents = await page.evaluate(() => {
@@ -313,7 +316,10 @@ function extractFromAlternativeStructure(items: any): Omit<ExtractedContent, 'th
/**
* Strategy 2: Extract from DOM using specific selectors
*/
async function extractFromDOM(page: Page): Promise<ExtractedContent | null> {
async function extractFromDOM(
page: Page,
progressCallback?: ProgressCallback
): Promise<ExtractedContent | null> {
try {
// Strategy: Direct caption selector
const captionText = await page.evaluate(() => {
@@ -442,11 +448,11 @@ async function extractWithStrategies(
}> = [
{
name: 'embedded-json',
fn: () => extractFromEmbeddedJSON(page)
fn: () => extractFromEmbeddedJSON(page, onProgress)
},
{
name: 'dom-selector',
fn: () => extractFromDOM(page)
fn: () => extractFromDOM(page, onProgress)
},
{
name: 'graphql-api',
@@ -456,7 +462,7 @@ async function extractWithStrategies(
name: 'legacy',
fn: async () => {
const text = await extractCleanTextLegacy(page);
const thumbnail = await extractThumbnailStealth(page, progressCallback);
const thumbnail = await extractThumbnailStealth(page, onProgress);
return { bodyText: text, thumbnail };
}
}