feat: replace Playwright extractor with yt-dlp subprocess
- Add instagram-extractor.ts: yt-dlp subprocess backend for Instagram caption extraction. No in-process browser state, maintained against Instagram frontend churn, supports cookies.txt for auth-walled reels.
- Add feature flag EXTRACTOR_BACKEND (ytdlp|playwright) in QueueProcessor so the old Playwright path remains available as fallback.
- Add 9 unit tests and 2 live-network integration tests for the new extractor.
- Dockerfile: install yt-dlp via pip3 alongside existing Chromium deps.
- docker-compose: expose EXTRACTOR_BACKEND env var (default: ytdlp).

Also in this commit:
- LLM: configurable per-request timeout via LLM_REQUEST_TIMEOUT_MS (default 120s); set maxRetries=0 to surface errors immediately; llama-swap /running health probe.
- QueueProcessor: thread progress callback through parser phase.
- LlmHealthIndicator: surface llama-swap loaded-model name.
- Logging: improve error serialization in queue-processor tests.
- .env.example: document llama-swap endpoint and model options.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
49
src/tests/instagram-extractor.integration.spec.ts
Normal file
49
src/tests/instagram-extractor.integration.spec.ts
Normal file
@@ -0,0 +1,49 @@
|
||||
/**
|
||||
* E2E integration test for the yt-dlp Instagram extractor.
|
||||
*
|
||||
* Makes real network calls (yt-dlp + Instagram CDN). Requires:
|
||||
* - yt-dlp installed on PATH
|
||||
* - Network access to instagram.com
|
||||
* - EXTRACTOR_E2E=1 env var (safety guard to avoid running in normal test runs)
|
||||
*
|
||||
* Run with:
|
||||
* EXTRACTOR_E2E=1 npm test -- src/tests/instagram-extractor.integration.spec.ts
|
||||
*/
|
||||
|
||||
import { describe, it, expect } from 'vitest';
|
||||
import { extractTextAndThumbnail } from '$lib/server/instagram-extractor';
|
||||
|
||||
const E2E = !!process.env.EXTRACTOR_E2E;
|
||||
|
||||
describe.skipIf(!E2E)('instagram-extractor E2E (requires yt-dlp + network)', () => {
|
||||
// Public reels that have previously been in the app queue
|
||||
const TEST_REELS = [
|
||||
{
|
||||
url: 'https://www.instagram.com/reel/DX4XEDZt3qT/',
|
||||
expectKeyword: 'pizza'
|
||||
},
|
||||
{
|
||||
url: 'https://www.instagram.com/reel/DUtHm2EiD26/',
|
||||
expectKeyword: 'noodles'
|
||||
}
|
||||
];
|
||||
|
||||
for (const { url, expectKeyword } of TEST_REELS) {
|
||||
it(`extracts caption from ${url}`, async () => {
|
||||
const events: { type: string; message: string }[] = [];
|
||||
const result = await extractTextAndThumbnail(url, (e) =>
|
||||
events.push(e as { type: string; message: string })
|
||||
);
|
||||
|
||||
expect(result.bodyText.length).toBeGreaterThan(20);
|
||||
expect(result.bodyText.toLowerCase()).toContain(expectKeyword);
|
||||
|
||||
if (result.thumbnail !== null) {
|
||||
expect(result.thumbnail).toMatch(/^data:image\//);
|
||||
}
|
||||
|
||||
expect(events.some((e) => e.type === 'complete')).toBe(true);
|
||||
expect(events.some((e) => e.type === 'status' && e.message.includes('yt-dlp'))).toBe(true);
|
||||
}, 90_000);
|
||||
}
|
||||
});
|
||||
171
src/tests/instagram-extractor.spec.ts
Normal file
171
src/tests/instagram-extractor.spec.ts
Normal file
@@ -0,0 +1,171 @@
|
||||
import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
|
||||
|
||||
// Mock node:child_process before importing the SUT. The SUT uses
|
||||
// promisify(execFile); without the Node-internal special handling, promisify
|
||||
// would only forward the first callback arg. We sidestep that by returning a
|
||||
// pre-promisified function tagged with util.promisify.custom that resolves
|
||||
// to {stdout, stderr}.
|
||||
import * as util from 'node:util';
|
||||
const execFileMock = vi.fn();
|
||||
vi.mock('node:child_process', () => {
|
||||
const execFile: any = () => {
|
||||
throw new Error('callback form not used in tests');
|
||||
};
|
||||
execFile[util.promisify.custom] = (cmd: string, args: string[], opts: any) =>
|
||||
execFileMock(cmd, args, opts);
|
||||
return { execFile };
|
||||
});
|
||||
|
||||
const existsSyncMock = vi.fn();
|
||||
vi.mock('node:fs', () => ({
|
||||
existsSync: (p: string) => existsSyncMock(p)
|
||||
}));
|
||||
|
||||
import { extractTextAndThumbnail } from '../lib/server/instagram-extractor';
|
||||
|
||||
describe('instagram-extractor (yt-dlp backend)', () => {
|
||||
const originalFetch = globalThis.fetch;
|
||||
|
||||
beforeEach(() => {
|
||||
execFileMock.mockReset();
|
||||
existsSyncMock.mockReset();
|
||||
existsSyncMock.mockReturnValue(false);
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
globalThis.fetch = originalFetch;
|
||||
});
|
||||
|
||||
it('parses yt-dlp JSON and returns bodyText + thumbnail data URI', async () => {
|
||||
execFileMock.mockResolvedValue({
|
||||
stdout: JSON.stringify({
|
||||
description: 'Pasta carbonara: 200g spaghetti, 100g pancetta, 2 eggs.',
|
||||
thumbnail: 'https://example.com/thumb.jpg'
|
||||
}),
|
||||
stderr: ''
|
||||
});
|
||||
|
||||
globalThis.fetch = vi.fn().mockResolvedValue({
|
||||
status: 200,
|
||||
headers: { get: () => 'image/jpeg' },
|
||||
arrayBuffer: () => Promise.resolve(new Uint8Array([1, 2, 3]).buffer)
|
||||
}) as unknown as typeof fetch;
|
||||
|
||||
const result = await extractTextAndThumbnail('https://www.instagram.com/reel/abc123/');
|
||||
|
||||
expect(result.bodyText).toContain('carbonara');
|
||||
expect(result.thumbnail).toMatch(/^data:image\/jpeg;base64,/);
|
||||
});
|
||||
|
||||
it('falls back to first thumbnails entry when top-level thumbnail is absent', async () => {
|
||||
execFileMock.mockResolvedValue({
|
||||
stdout: JSON.stringify({
|
||||
description: 'Recipe text',
|
||||
thumbnails: [{ url: 'https://example.com/alt-thumb.jpg' }]
|
||||
}),
|
||||
stderr: ''
|
||||
});
|
||||
|
||||
globalThis.fetch = vi.fn().mockResolvedValue({
|
||||
status: 200,
|
||||
headers: { get: () => 'image/png' },
|
||||
arrayBuffer: () => Promise.resolve(new Uint8Array([4, 5, 6]).buffer)
|
||||
}) as unknown as typeof fetch;
|
||||
|
||||
const result = await extractTextAndThumbnail('https://www.instagram.com/reel/abc/');
|
||||
expect(result.thumbnail).toMatch(/^data:image\/png;base64,/);
|
||||
});
|
||||
|
||||
it('returns null thumbnail when fetch fails', async () => {
|
||||
execFileMock.mockResolvedValue({
|
||||
stdout: JSON.stringify({
|
||||
description: 'Recipe text',
|
||||
thumbnail: 'https://example.com/missing.jpg'
|
||||
}),
|
||||
stderr: ''
|
||||
});
|
||||
globalThis.fetch = vi.fn().mockResolvedValue({
|
||||
status: 404,
|
||||
headers: { get: () => 'text/html' },
|
||||
arrayBuffer: () => Promise.resolve(new ArrayBuffer(0))
|
||||
}) as unknown as typeof fetch;
|
||||
|
||||
const result = await extractTextAndThumbnail('https://www.instagram.com/reel/abc/');
|
||||
expect(result.bodyText).toBe('Recipe text');
|
||||
expect(result.thumbnail).toBeNull();
|
||||
});
|
||||
|
||||
it('passes --cookies flag when secrets/cookies.txt exists', async () => {
|
||||
existsSyncMock.mockImplementation((p: string) => p.endsWith('cookies.txt'));
|
||||
execFileMock.mockResolvedValue({
|
||||
stdout: JSON.stringify({ description: 'x', thumbnail: null }),
|
||||
stderr: ''
|
||||
});
|
||||
|
||||
await extractTextAndThumbnail('https://www.instagram.com/reel/abc/');
|
||||
|
||||
const [, args] = execFileMock.mock.calls[0];
|
||||
expect(args).toContain('--cookies');
|
||||
const idx = (args as string[]).indexOf('--cookies');
|
||||
expect((args as string[])[idx + 1]).toMatch(/cookies\.txt$/);
|
||||
});
|
||||
|
||||
it('omits --cookies flag when no cookie file is present', async () => {
|
||||
existsSyncMock.mockReturnValue(false);
|
||||
execFileMock.mockResolvedValue({
|
||||
stdout: JSON.stringify({ description: 'x', thumbnail: null }),
|
||||
stderr: ''
|
||||
});
|
||||
|
||||
await extractTextAndThumbnail('https://www.instagram.com/reel/abc/');
|
||||
|
||||
const [, args] = execFileMock.mock.calls[0];
|
||||
expect(args).not.toContain('--cookies');
|
||||
});
|
||||
|
||||
it('throws non-recoverable error on "Login required" stderr', async () => {
|
||||
const err: any = new Error('yt-dlp failed');
|
||||
err.stderr = 'ERROR: [Instagram] xyz: Login required to access this post.';
|
||||
execFileMock.mockRejectedValue(err);
|
||||
|
||||
await expect(
|
||||
extractTextAndThumbnail('https://www.instagram.com/reel/private/')
|
||||
).rejects.toThrow(/authentication/);
|
||||
});
|
||||
|
||||
it('throws clear error when yt-dlp binary is missing (ENOENT)', async () => {
|
||||
const err: any = new Error('not found');
|
||||
err.code = 'ENOENT';
|
||||
execFileMock.mockRejectedValue(err);
|
||||
|
||||
await expect(
|
||||
extractTextAndThumbnail('https://www.instagram.com/reel/abc/')
|
||||
).rejects.toThrow(/yt-dlp is not installed/);
|
||||
});
|
||||
|
||||
it('throws when description is empty', async () => {
|
||||
execFileMock.mockResolvedValue({
|
||||
stdout: JSON.stringify({ description: '', thumbnail: null }),
|
||||
stderr: ''
|
||||
});
|
||||
|
||||
await expect(
|
||||
extractTextAndThumbnail('https://www.instagram.com/reel/empty/')
|
||||
).rejects.toThrow(/no description/);
|
||||
});
|
||||
|
||||
it('emits progress events through the callback', async () => {
|
||||
execFileMock.mockResolvedValue({
|
||||
stdout: JSON.stringify({ description: 'x', thumbnail: null }),
|
||||
stderr: ''
|
||||
});
|
||||
|
||||
const events: any[] = [];
|
||||
await extractTextAndThumbnail('https://www.instagram.com/reel/abc/', (e) =>
|
||||
events.push(e)
|
||||
);
|
||||
|
||||
expect(events.some((e) => e.type === 'status' && e.message.includes('yt-dlp'))).toBe(true);
|
||||
expect(events.some((e) => e.type === 'complete')).toBe(true);
|
||||
});
|
||||
});
|
||||
@@ -18,7 +18,7 @@ vi.mock('$lib/server/tandoor', () => ({
|
||||
}));
|
||||
|
||||
import { queueManager } from '$lib/server/queue/QueueManager';
|
||||
import * as extraction from '$lib/server/extraction';
|
||||
import * as instagramExtractor from '$lib/server/instagram-extractor';
|
||||
import { queueProcessor } from '$lib/server/queue/QueueProcessor';
|
||||
|
||||
describe('QueueProcessor logging', () => {
|
||||
@@ -50,8 +50,8 @@ describe('QueueProcessor logging', () => {
|
||||
(complexError as any).code = 'ERR_TEST';
|
||||
(complexError as any).details = { phase: 'extraction', retries: 3 };
|
||||
|
||||
// Mock extraction to fail BEFORE starting processor
|
||||
const extractSpy = vi.spyOn(extraction, 'extractTextAndThumbnail');
|
||||
// Mock extraction to fail BEFORE starting processor (default backend = ytdlp)
|
||||
const extractSpy = vi.spyOn(instagramExtractor, 'extractTextAndThumbnail');
|
||||
extractSpy.mockRejectedValueOnce(complexError);
|
||||
|
||||
const item = queueManager.enqueue('https://instagram.com/p/TEST');
|
||||
|
||||
@@ -35,13 +35,21 @@ vi.mock('$lib/server/queue/config', () => ({
|
||||
}
|
||||
}));
|
||||
|
||||
// Mock external dependencies BEFORE importing QueueProcessor
|
||||
// Mock external dependencies BEFORE importing QueueProcessor.
|
||||
// QueueProcessor.extractionPhase picks between two extractor modules based on
|
||||
// EXTRACTOR_BACKEND; mock both so behavior is identical regardless of default.
|
||||
vi.mock('$lib/server/extraction', () => ({
|
||||
extractTextAndThumbnail: vi.fn().mockResolvedValue({
|
||||
bodyText: 'Default recipe text',
|
||||
thumbnail: null
|
||||
})
|
||||
}));
|
||||
vi.mock('$lib/server/instagram-extractor', () => ({
|
||||
extractTextAndThumbnail: vi.fn().mockResolvedValue({
|
||||
bodyText: 'Default recipe text',
|
||||
thumbnail: null
|
||||
})
|
||||
}));
|
||||
|
||||
vi.mock('$lib/server/parser', () => ({
|
||||
extractRecipe: vi.fn().mockResolvedValue({
|
||||
@@ -62,11 +70,16 @@ vi.mock('$lib/server/tandoor', () => ({
|
||||
})
|
||||
}));
|
||||
|
||||
import { extractTextAndThumbnail } from '$lib/server/extraction';
|
||||
import { extractTextAndThumbnail as extractFromExtraction } from '$lib/server/extraction';
|
||||
import { extractTextAndThumbnail as extractFromYtDlp } from '$lib/server/instagram-extractor';
|
||||
import { extractRecipe } from '$lib/server/parser';
|
||||
import { uploadRecipeWithIngredientsDTO, uploadRecipeImage } from '$lib/server/tandoor';
|
||||
import * as configModule from '$lib/server/queue/config';
|
||||
|
||||
// Alias used by existing assertions; default backend is ytdlp so the new
|
||||
// instagram-extractor mock is what the processor actually invokes.
|
||||
const extractTextAndThumbnail = extractFromYtDlp;
|
||||
|
||||
// Import processor AFTER mocks - it will auto-start (imported for side effects)
|
||||
import '$lib/server/queue/QueueProcessor';
|
||||
|
||||
@@ -78,8 +91,13 @@ describe('QueueProcessor Integration Tests', () => {
|
||||
// Reset mocks and their implementations
|
||||
vi.resetAllMocks();
|
||||
|
||||
// Set default mock implementations
|
||||
vi.mocked(extractTextAndThumbnail).mockResolvedValue({
|
||||
// Set default mock implementations on BOTH backend modules so the test
|
||||
// behavior is invariant to EXTRACTOR_BACKEND.
|
||||
vi.mocked(extractFromExtraction).mockResolvedValue({
|
||||
bodyText: 'Default recipe text',
|
||||
thumbnail: null
|
||||
});
|
||||
vi.mocked(extractFromYtDlp).mockResolvedValue({
|
||||
bodyText: 'Default recipe text',
|
||||
thumbnail: null
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user