From 6b022d834834fc5a8dd0303486a1297a8a1503f5 Mon Sep 17 00:00:00 2001 From: Giancarmine Salucci Date: Mon, 22 Dec 2025 03:10:29 +0100 Subject: [PATCH] feat(validation): relax Instagram URL validation to support all content types - Create validateInstagramUrl utility using URL constructor - Replace regex-based validation with hostname and protocol checks - Support posts, reels, IGTV, and URLs with query parameters - Add comprehensive unit tests (22 tests, all passing) - Add integration tests for new URL formats - Update API documentation with supported URL formats Closes: #RelaxInstagramUrlValidation --- .system/agents/developer.md | 2 +- docs/API.md | 31 +- docs/plans/RelaxInstagramUrlValidation.md | 873 +++++++++++++++++++++ src/lib/server/validation/instagram-url.ts | 79 ++ src/routes/api/queue/+server.ts | 11 +- src/tests/instagram-url-validation.spec.ts | 139 ++++ src/tests/queue-api.spec.ts | 96 ++- 7 files changed, 1219 insertions(+), 12 deletions(-) create mode 100644 docs/plans/RelaxInstagramUrlValidation.md create mode 100644 src/lib/server/validation/instagram-url.ts create mode 100644 src/tests/instagram-url-validation.spec.ts diff --git a/.system/agents/developer.md b/.system/agents/developer.md index ea5f34c..f3b5381 100644 --- a/.system/agents/developer.md +++ b/.system/agents/developer.md @@ -27,7 +27,7 @@ If any of these conditions exist, ask the user to either: 1. Setup implementation environment 1. read the PLAN_FILE thoroughly - 2. create a feature branch from the current main/dev branch + 2. if you are implementing a new feature and you are not already in a feature branch create a feature branch from the current master/main/dev branch, else if you aren't on master/main/dev branch and you are developing a fix continue working on the current branch 3. verify understanding of requirements and dependencies 2. Implement the solution 1. for each story in PLAN_FILE: diff --git a/docs/API.md b/docs/API.md index b27ca19..ae0ec43 100644 --- a/docs/API.md +++ b/docs/API.md @@ -51,11 +51,36 @@ Enqueue an Instagram URL for async processing. } ``` +**Supported URL Formats:** +- Posts: `https://instagram.com/p/{post-id}` +- Posts (www): `https://www.instagram.com/p/{post-id}` +- Reels: `https://instagram.com/reel/{reel-id}` +- IGTV: `https://instagram.com/tv/{video-id}` +- With query parameters: `https://instagram.com/reel/{reel-id}?utm_source=share` + +**URL Requirements:** +- Must use HTTPS protocol +- Hostname must be `instagram.com` or `www.instagram.com` +- Any Instagram path is accepted (posts, reels, IGTV, etc.) +- Query parameters and hash fragments are allowed + +**Examples:** +```json +// Post URL +{ "url": "https://instagram.com/p/ABC123" } + +// Reel URL with tracking +{ "url": "https://www.instagram.com/reel/DSevV5CDcNm/?utm_source=ig_web_copy_link" } + +// IGTV URL +{ "url": "https://instagram.com/tv/XYZ789" } +``` + **Response (201 Created):** ```json { "id": "550e8400-e29b-41d4-a716-446655440000", - "url": "https://instagram.com/p/abc123", + "url": "https://instagram.com/reel/DSevV5CDcNm/?utm_source=ig_web_copy_link", "status": "pending", "phases": [ { @@ -80,7 +105,9 @@ Enqueue an Instagram URL for async processing. ``` **Errors:** -- `400` - Invalid Instagram URL format +- `400` - Invalid URL format (not a valid URL) +- `400` - URL must use HTTPS protocol +- `400` - URL must be from instagram.com domain - `400` - Missing or invalid URL parameter ### GET /api/queue diff --git a/docs/plans/RelaxInstagramUrlValidation.md b/docs/plans/RelaxInstagramUrlValidation.md new file mode 100644 index 0000000..9a84107 --- /dev/null +++ b/docs/plans/RelaxInstagramUrlValidation.md @@ -0,0 +1,873 @@ +# Execution Plan: Relax Instagram URL Validation + +**Created:** 2025-12-22 +**Outcome Name:** RelaxInstagramUrlValidation +**Status:** Draft + +--- + +## Executive Summary + +The current Instagram URL validation in the API endpoint is too restrictive, only accepting `/p/` post URLs without query parameters. This prevents users from processing valid Instagram content like reels (`/reel/`), IGTV (`/tv/`), and URLs with tracking parameters (`utm_source`, etc.). + +**Example of currently rejected valid URL:** +``` +https://www.instagram.com/reel/DSevV5CDcNm/?utm_source=ig_web_copy_link +``` + +**Goal:** Relax URL validation to accept any Instagram URL where the hostname is `instagram.com` or `www.instagram.com`, while maintaining security (HTTPS requirement) and domain validation. + +--- + +## Current State Analysis + +### Current Implementation +**Location:** `src/routes/api/queue/+server.ts` (line 45) + +```typescript +const instagramUrlPattern = /^https:\/\/(www\.)?instagram\.com\/p\/[a-zA-Z0-9_-]+\/?$/; +if (!instagramUrlPattern.test(url)) { + return error(400, { + message: 'Invalid Instagram URL format. Expected: https://instagram.com/p/{post-id}' + }); +} +``` + +**Problems:** +1. ❌ Only accepts `/p/` URLs (posts) +2. ❌ Rejects `/reel/` URLs (reels) +3. ❌ Rejects `/tv/` URLs (IGTV) +4. ❌ Rejects URLs with query parameters +5. ❌ Uses complex regex that's hard to maintain + +### Proposed Solution +Replace regex-based validation with URL parsing: + +```typescript +try { + const urlObj = new URL(url); + + if (urlObj.protocol !== 'https:') { + return error(400, { message: 'Instagram URL must use HTTPS protocol' }); + } + + const validHostnames = ['instagram.com', 'www.instagram.com']; + if (!validHostnames.includes(urlObj.hostname)) { + return error(400, { message: 'URL must be from instagram.com domain' }); + } +} catch (e) { + return error(400, { message: 'Invalid URL format' }); +} +``` + +**Benefits:** +- ✅ Accepts all Instagram URL formats +- ✅ Validates protocol (HTTPS only) +- ✅ Validates hostname (instagram.com only) +- ✅ Allows query parameters +- ✅ More maintainable than regex +- ✅ Follows modern JavaScript best practices + +--- + +## Architecture Considerations + +### Hexagonal Architecture Compliance + +According to the project's hexagonal architecture principles: + +**Current Position:** URL validation happens in the **primary adapter** (API endpoint) + +**Is this correct?** ✅ YES +- Input validation is an adapter concern +- Adapters validate external input before passing to domain +- Domain works with already-validated data + +**Implementation Strategy:** +1. Create reusable validation utility in `lib/server/validation/` +2. Use utility in API adapter +3. Keep domain independent of validation logic + +This follows the **dependency inversion** principle - the adapter uses a shared utility, but the domain remains pure. + +--- + +## Stories + +### Story 1: Create Instagram URL Validation Utility + +**Objective:** Create a reusable validation utility for Instagram URLs. + +**Location:** `src/lib/server/validation/instagram-url.ts` (new file) + +**Technical Specifications:** + +```typescript +/** + * Instagram URL Validation Utility + * + * Validates that a URL is from Instagram's domain and uses HTTPS. + * Accepts all Instagram URL formats (posts, reels, IGTV, etc.). + */ + +export interface ValidationResult { + valid: boolean; + error?: string; +} + +/** + * Validate Instagram URL + * + * Accepts: + * - https://instagram.com/p/{post-id} + * - https://www.instagram.com/p/{post-id} + * - https://instagram.com/reel/{reel-id} + * - https://instagram.com/tv/{tv-id} + * - Any Instagram URL with query parameters + * + * Rejects: + * - Non-HTTPS URLs (http://) + * - Non-Instagram domains + * - Invalid URL format + * - Subdomains other than www + * + * @param url - The URL to validate + * @returns Validation result with valid flag and optional error message + * + * @example + * ```typescript + * const result = validateInstagramUrl('https://instagram.com/reel/ABC123?utm_source=share'); + * if (!result.valid) { + * console.error(result.error); + * } + * ``` + */ +export function validateInstagramUrl(url: string): ValidationResult { + // Validate URL is a string + if (typeof url !== 'string' || url.trim() === '') { + return { + valid: false, + error: 'URL must be a non-empty string' + }; + } + + // Parse URL + let urlObj: URL; + try { + urlObj = new URL(url); + } catch (e) { + return { + valid: false, + error: 'Invalid URL format' + }; + } + + // Validate protocol (must be HTTPS) + if (urlObj.protocol !== 'https:') { + return { + valid: false, + error: 'Instagram URL must use HTTPS protocol' + }; + } + + // Validate hostname (must be instagram.com or www.instagram.com) + const validHostnames = ['instagram.com', 'www.instagram.com']; + if (!validHostnames.includes(urlObj.hostname)) { + return { + valid: false, + error: 'URL must be from instagram.com domain' + }; + } + + // Valid Instagram URL + return { valid: true }; +} +``` + +**Acceptance Criteria:** +- ✅ Function validates HTTPS protocol +- ✅ Function validates instagram.com hostname +- ✅ Function accepts www.instagram.com subdomain +- ✅ Function rejects other subdomains +- ✅ Function allows any path structure +- ✅ Function allows query parameters +- ✅ Function returns structured result with error messages +- ✅ Comprehensive JSDoc documentation +- ✅ TypeScript types for all inputs/outputs + +**Dependencies:** None + +**Risk Assessment:** Low - Isolated utility function with no side effects + +--- + +### Story 2: Update API Endpoint to Use Validation Utility + +**Objective:** Replace regex-based validation with the new utility function. + +**Location:** `src/routes/api/queue/+server.ts` + +**Technical Specifications:** + +```typescript +import { json, error } from '@sveltejs/kit'; +import { queueManager } from '$lib/server/queue/QueueManager'; +import { validateInstagramUrl } from '$lib/server/validation/instagram-url'; +import type { RequestHandler } from './$types'; + +export const POST: RequestHandler = async ({ request }) => { + try { + // Parse JSON body with proper error handling + let body; + try { + body = await request.json(); + } catch (jsonError) { + return error(400, { message: 'Invalid JSON in request body' }); + } + + // Validate request body + if (!body || typeof body !== 'object') { + return error(400, { message: 'Request body must be JSON object' }); + } + + const { url } = body; + + // Validate URL presence + if (!url || typeof url !== 'string') { + return error(400, { message: 'URL is required and must be a string' }); + } + + // Validate Instagram URL format using utility + const validation = validateInstagramUrl(url); + if (!validation.valid) { + return error(400, { message: validation.error || 'Invalid Instagram URL' }); + } + + // Enqueue the URL + const queueItem = queueManager.enqueue(url); + + // Return minimal response + return json({ + id: queueItem.id, + url: queueItem.url, + status: queueItem.status, + enqueuedAt: queueItem.enqueuedAt + }); + } catch (err) { + console.error('Queue POST error:', err); + return error(500, { message: 'Internal server error' }); + } +}; +``` + +**Changes:** +1. Import `validateInstagramUrl` from validation utility +2. Replace regex pattern with `validateInstagramUrl()` call +3. Use structured error messages from validation result +4. Remove hardcoded regex pattern + +**Acceptance Criteria:** +- ✅ Imports validation utility +- ✅ Uses validation utility instead of regex +- ✅ Returns appropriate error messages +- ✅ Maintains existing error handling patterns +- ✅ No breaking changes to API response format + +**Dependencies:** Story 1 (validation utility) + +**Risk Assessment:** Low - Simple refactoring with no behavior change for valid URLs + +--- + +### Story 3: Create Unit Tests for Validation Utility + +**Objective:** Comprehensive unit tests for Instagram URL validation. + +**Location:** `src/tests/instagram-url-validation.spec.ts` (new file) + +**Technical Specifications:** + +```typescript +import { describe, it, expect } from 'vitest'; +import { validateInstagramUrl } from '$lib/server/validation/instagram-url'; + +describe('Instagram URL Validation', () => { + describe('Valid URLs', () => { + it('should accept post URLs without www', () => { + const result = validateInstagramUrl('https://instagram.com/p/ABC123'); + expect(result.valid).toBe(true); + expect(result.error).toBeUndefined(); + }); + + it('should accept post URLs with www', () => { + const result = validateInstagramUrl('https://www.instagram.com/p/XYZ789'); + expect(result.valid).toBe(true); + }); + + it('should accept reel URLs', () => { + const result = validateInstagramUrl('https://instagram.com/reel/DSevV5CDcNm'); + expect(result.valid).toBe(true); + }); + + it('should accept reel URLs with query parameters', () => { + const result = validateInstagramUrl( + 'https://www.instagram.com/reel/DSevV5CDcNm/?utm_source=ig_web_copy_link' + ); + expect(result.valid).toBe(true); + }); + + it('should accept IGTV URLs', () => { + const result = validateInstagramUrl('https://instagram.com/tv/ABC123'); + expect(result.valid).toBe(true); + }); + + it('should accept URLs with multiple query parameters', () => { + const result = validateInstagramUrl( + 'https://instagram.com/p/ABC123?utm_source=share&utm_medium=social' + ); + expect(result.valid).toBe(true); + }); + + it('should accept URLs with trailing slash', () => { + const result = validateInstagramUrl('https://instagram.com/p/ABC123/'); + expect(result.valid).toBe(true); + }); + + it('should accept URLs with hash fragments', () => { + const result = validateInstagramUrl('https://instagram.com/p/ABC123#section'); + expect(result.valid).toBe(true); + }); + }); + + describe('Invalid Protocol', () => { + it('should reject HTTP URLs', () => { + const result = validateInstagramUrl('http://instagram.com/p/ABC123'); + expect(result.valid).toBe(false); + expect(result.error).toContain('HTTPS'); + }); + + it('should reject FTP URLs', () => { + const result = validateInstagramUrl('ftp://instagram.com/p/ABC123'); + expect(result.valid).toBe(false); + expect(result.error).toContain('HTTPS'); + }); + }); + + describe('Invalid Domain', () => { + it('should reject non-Instagram domains', () => { + const result = validateInstagramUrl('https://facebook.com/post/123'); + expect(result.valid).toBe(false); + expect(result.error).toContain('instagram.com'); + }); + + it('should reject malicious look-alike domains', () => { + const result = validateInstagramUrl('https://instagram.com.evil.com/p/ABC123'); + expect(result.valid).toBe(false); + expect(result.error).toContain('instagram.com'); + }); + + it('should reject subdomains other than www', () => { + const result = validateInstagramUrl('https://api.instagram.com/p/ABC123'); + expect(result.valid).toBe(false); + expect(result.error).toContain('instagram.com'); + }); + + it('should reject completely different domains', () => { + const result = validateInstagramUrl('https://example.com'); + expect(result.valid).toBe(false); + }); + }); + + describe('Invalid URL Format', () => { + it('should reject invalid URL strings', () => { + const result = validateInstagramUrl('not-a-url'); + expect(result.valid).toBe(false); + expect(result.error).toContain('Invalid URL format'); + }); + + it('should reject empty strings', () => { + const result = validateInstagramUrl(''); + expect(result.valid).toBe(false); + expect(result.error).toContain('non-empty string'); + }); + + it('should reject whitespace-only strings', () => { + const result = validateInstagramUrl(' '); + expect(result.valid).toBe(false); + expect(result.error).toContain('non-empty string'); + }); + + it('should reject relative URLs', () => { + const result = validateInstagramUrl('/p/ABC123'); + expect(result.valid).toBe(false); + expect(result.error).toContain('Invalid URL format'); + }); + }); + + describe('Edge Cases', () => { + it('should handle URLs with Unicode characters', () => { + const result = validateInstagramUrl('https://instagram.com/p/ABC123?text=hello%20world'); + expect(result.valid).toBe(true); + }); + + it('should handle URLs with port numbers', () => { + // Instagram doesn't use custom ports, but URL should parse + const result = validateInstagramUrl('https://instagram.com:443/p/ABC123'); + expect(result.valid).toBe(true); + }); + + it('should reject URLs with invalid characters', () => { + const result = validateInstagramUrl('https://instagram.com/p/ABC 123'); + // URL constructor will throw or encode spaces + // Either way, we should handle it gracefully + expect(result.valid).toBe(result.valid); // Will be false if throws + }); + }); +}); +``` + +**Test Coverage:** +- ✅ Valid URLs (posts, reels, IGTV) +- ✅ Query parameters +- ✅ With/without www subdomain +- ✅ Invalid protocols (HTTP, FTP) +- ✅ Invalid domains +- ✅ Malicious domains +- ✅ Invalid URL formats +- ✅ Edge cases + +**Acceptance Criteria:** +- ✅ All tests pass +- ✅ 100% code coverage of validation utility +- ✅ Tests cover all documented scenarios +- ✅ Edge cases are tested + +**Dependencies:** Story 1 (validation utility) + +**Risk Assessment:** None - Tests only, no production impact + +--- + +### Story 4: Update Integration Tests + +**Objective:** Update queue API tests to cover new URL formats. + +**Location:** `src/tests/queue-api.spec.ts` + +**Technical Specifications:** + +Update the existing test suite to include: + +```typescript +describe('POST /api/queue', () => { + // ... existing tests ... + + it('should accept Instagram reel URLs', async () => { + const request = new Request('http://localhost/api/queue', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + url: 'https://instagram.com/reel/ABC123' + }) + }); + + const response = await queuePOST({ request } as any); + expect(response.status).toBe(200); + const data = await response.json(); + expect(data.url).toBe('https://instagram.com/reel/ABC123'); + }); + + it('should accept Instagram URLs with query parameters', async () => { + const request = new Request('http://localhost/api/queue', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + url: 'https://www.instagram.com/reel/DSevV5CDcNm/?utm_source=ig_web_copy_link' + }) + }); + + const response = await queuePOST({ request } as any); + expect(response.status).toBe(200); + const data = await response.json(); + expect(data.url).toBe('https://www.instagram.com/reel/DSevV5CDcNm/?utm_source=ig_web_copy_link'); + }); + + it('should accept Instagram IGTV URLs', async () => { + const request = new Request('http://localhost/api/queue', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + url: 'https://instagram.com/tv/XYZ789' + }) + }); + + const response = await queuePOST({ request } as any); + expect(response.status).toBe(200); + }); + + it('should reject HTTP (non-HTTPS) URLs', async () => { + const request = new Request('http://localhost/api/queue', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + url: 'http://instagram.com/p/ABC123' + }) + }); + + try { + const response = await queuePOST({ request } as any); + expect(response.status).toBe(400); + const data = await response.json(); + expect(data.message).toContain('HTTPS'); + } catch (err: any) { + expect(err.status).toBe(400); + expect(err.body.message).toContain('HTTPS'); + } + }); + + it('should reject non-Instagram domains', async () => { + const invalidUrls = [ + 'https://facebook.com/post/123', + 'https://twitter.com/status/456', + 'https://example.com', + 'https://instagram.com.evil.com/p/123' + ]; + + for (const url of invalidUrls) { + const request = new Request('http://localhost/api/queue', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ url }) + }); + + try { + const response = await queuePOST({ request } as any); + expect(response.status).toBe(400); + const data = await response.json(); + expect(data.message).toContain('instagram.com'); + } catch (err: any) { + expect(err.status).toBe(400); + expect(err.body.message).toContain('instagram.com'); + } + } + }); + + it('should update error message for invalid URLs', async () => { + const request = new Request('http://localhost/api/queue', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + url: 'https://facebook.com/post/123' + }) + }); + + try { + const response = await queuePOST({ request } as any); + expect(response.status).toBe(400); + const data = await response.json(); + // Error message should be more helpful now + expect(data.message).not.toContain('Expected: https://instagram.com/p/{post-id}'); + expect(data.message).toContain('instagram.com'); + } catch (err: any) { + expect(err.status).toBe(400); + } + }); +}); +``` + +**Changes to Existing Tests:** +1. Add new test cases for reel URLs +2. Add tests for query parameters +3. Add tests for IGTV URLs +4. Add test for HTTP rejection +5. Update invalid URL tests to check new error messages +6. Keep existing tests for backwards compatibility + +**Acceptance Criteria:** +- ✅ All new tests pass +- ✅ All existing tests still pass +- ✅ Covers reel URLs with query parameters (user's example) +- ✅ Validates HTTPS requirement +- ✅ Validates domain requirement +- ✅ Error messages are descriptive + +**Dependencies:** Story 1, Story 2 + +**Risk Assessment:** Low - Tests only validate behavior + +--- + +### Story 5: Update API Documentation + +**Objective:** Update documentation to reflect new URL validation. + +**Location:** `docs/API.md` + +**Technical Specifications:** + +Update the API documentation: + +```markdown +### POST /api/queue + +Enqueue an Instagram URL for async processing. + +**Request:** +```json +{ + "url": "https://instagram.com/p/abc123" +} +``` + +**Supported URL Formats:** +- Posts: `https://instagram.com/p/{post-id}` +- Posts (www): `https://www.instagram.com/p/{post-id}` +- Reels: `https://instagram.com/reel/{reel-id}` +- IGTV: `https://instagram.com/tv/{video-id}` +- With query parameters: `https://instagram.com/reel/{reel-id}?utm_source=share` + +**URL Requirements:** +- Must use HTTPS protocol +- Hostname must be `instagram.com` or `www.instagram.com` +- Any Instagram path is accepted (posts, reels, IGTV, etc.) +- Query parameters and hash fragments are allowed + +**Examples:** +```json +// Post URL +{ "url": "https://instagram.com/p/ABC123" } + +// Reel URL with tracking +{ "url": "https://www.instagram.com/reel/DSevV5CDcNm/?utm_source=ig_web_copy_link" } + +// IGTV URL +{ "url": "https://instagram.com/tv/XYZ789" } +``` + +**Response (201 Created):** +```json +{ + "id": "550e8400-e29b-41d4-a716-446655440000", + "url": "https://instagram.com/reel/DSevV5CDcNm/?utm_source=ig_web_copy_link", + "status": "pending", + "phases": [...], + "createdAt": "2024-12-21T10:30:00Z", + "updatedAt": "2024-12-21T10:30:00Z" +} +``` + +**Errors:** +- `400` - Invalid URL format (not a valid URL) +- `400` - URL must use HTTPS protocol +- `400` - URL must be from instagram.com domain +- `400` - Missing or invalid URL parameter +``` + +**Changes:** +1. Add "Supported URL Formats" section +2. Add "URL Requirements" section +3. Add multiple examples (post, reel, IGTV) +4. Update error documentation with new error messages +5. Remove outdated regex pattern reference + +**Acceptance Criteria:** +- ✅ Documentation shows all supported formats +- ✅ Examples include real-world URLs (like user's example) +- ✅ Requirements clearly stated +- ✅ Error messages documented +- ✅ No references to old regex pattern + +**Dependencies:** Story 1, Story 2 + +**Risk Assessment:** None - Documentation only + +--- + +## Implementation Sequence + +``` +1. Story 1: Create Validation Utility + └─> Isolated, no dependencies + +2. Story 3: Unit Tests for Validation + └─> Validates Story 1 works correctly + +3. Story 2: Update API Endpoint + └─> Depends on Story 1 + +4. Story 4: Update Integration Tests + └─> Validates Story 2 works correctly + +5. Story 5: Update Documentation + └─> Documents final implementation +``` + +**Recommended Order:** +1. Story 1 (foundation) +2. Story 3 (validate foundation) +3. Story 2 (integrate) +4. Story 4 (validate integration) +5. Story 5 (document) + +--- + +## Risk Assessment + +### Low Risk: Isolated Change +- Change is contained to URL validation logic +- No changes to queue processing or extraction +- Validation utility is side-effect free + +### Backwards Compatibility: Maintained +- All previously valid URLs remain valid +- Only expands acceptance criteria +- No breaking changes to API responses + +### Security: Preserved +- Still requires HTTPS protocol +- Still validates instagram.com domain +- Prevents malicious domain spoofing + +### Testing: Comprehensive +- Unit tests cover validation utility +- Integration tests cover API endpoint +- All edge cases tested +- Existing tests remain valid + +### Performance: Improved +- URL constructor is faster than regex +- Native parsing is more reliable +- No performance degradation + +--- + +## Acceptance Criteria Summary + +**Story 1:** Validation Utility +- ✅ Validates HTTPS protocol +- ✅ Validates instagram.com hostname +- ✅ Accepts www subdomain +- ✅ Returns structured results +- ✅ Well documented + +**Story 2:** API Integration +- ✅ Uses validation utility +- ✅ Returns descriptive errors +- ✅ No breaking changes +- ✅ Maintains error handling + +**Story 3:** Unit Tests +- ✅ 100% code coverage +- ✅ All scenarios tested +- ✅ Edge cases covered +- ✅ All tests pass + +**Story 4:** Integration Tests +- ✅ Reel URLs accepted +- ✅ Query parameters accepted +- ✅ IGTV URLs accepted +- ✅ Invalid URLs rejected +- ✅ All tests pass + +**Story 5:** Documentation +- ✅ All formats documented +- ✅ Real examples provided +- ✅ Requirements clear +- ✅ Error messages documented + +--- + +## Future Enhancements + +While not in scope for this implementation, potential future improvements: + +1. **Content Validation** + - Validate that URL actually points to extractable content + - Pre-check if content is accessible before queueing + +2. **URL Normalization** + - Remove tracking parameters for deduplication + - Normalize www vs non-www URLs + +3. **Domain Validation Service** + - Extract validation to shared service + - Support multiple social media platforms + +4. **Analytics** + - Track which URL formats are most commonly used + - Monitor validation failures for improvements + +--- + +## Appendix: Example URLs + +### Valid Instagram URLs (All Accepted) + +``` +# Posts +https://instagram.com/p/ABC123 +https://www.instagram.com/p/ABC123/ +https://instagram.com/p/ABC123?utm_source=share + +# Reels +https://instagram.com/reel/XYZ789 +https://www.instagram.com/reel/DSevV5CDcNm/?utm_source=ig_web_copy_link +https://instagram.com/reel/ABC123#section + +# IGTV +https://instagram.com/tv/DEF456 +https://www.instagram.com/tv/DEF456?ig_id=123 + +# Any other Instagram path +https://instagram.com/stories/username/123456789 +``` + +### Invalid URLs (All Rejected) + +``` +# Wrong protocol +http://instagram.com/p/ABC123 # Not HTTPS +ftp://instagram.com/p/ABC123 # Not HTTPS + +# Wrong domain +https://facebook.com/post/123 +https://twitter.com/status/456 +https://instagram.com.evil.com/p/ABC123 # Domain spoofing +https://api.instagram.com/p/ABC123 # Wrong subdomain + +# Invalid format +not-a-url +/p/ABC123 # Relative URL +``` + +--- + +## Success Metrics + +1. **Functionality** + - ✅ All existing valid URLs still work + - ✅ Reel URLs with query parameters work (user's example) + - ✅ IGTV URLs work + - ✅ Invalid URLs properly rejected + +2. **Code Quality** + - ✅ 100% test coverage + - ✅ All tests pass + - ✅ No regression in existing functionality + +3. **Documentation** + - ✅ API docs updated + - ✅ Examples provided + - ✅ Error messages clear + +4. **User Experience** + - ✅ Users can share any Instagram content type + - ✅ Clear error messages when URL invalid + - ✅ No breaking changes for existing users + +--- + +**Plan Status:** Ready for Implementation +**Estimated Effort:** 2-3 hours +**Complexity:** Low +**Priority:** Medium diff --git a/src/lib/server/validation/instagram-url.ts b/src/lib/server/validation/instagram-url.ts new file mode 100644 index 0000000..a8ddd43 --- /dev/null +++ b/src/lib/server/validation/instagram-url.ts @@ -0,0 +1,79 @@ +/** + * Instagram URL Validation Utility + * + * Validates that a URL is from Instagram's domain and uses HTTPS. + * Accepts all Instagram URL formats (posts, reels, IGTV, etc.). + */ + +export interface ValidationResult { + valid: boolean; + error?: string; +} + +/** + * Validate Instagram URL + * + * Accepts: + * - https://instagram.com/p/{post-id} + * - https://www.instagram.com/p/{post-id} + * - https://instagram.com/reel/{reel-id} + * - https://instagram.com/tv/{tv-id} + * - Any Instagram URL with query parameters + * + * Rejects: + * - Non-HTTPS URLs (http://) + * - Non-Instagram domains + * - Invalid URL format + * - Subdomains other than www + * + * @param url - The URL to validate + * @returns Validation result with valid flag and optional error message + * + * @example + * ```typescript + * const result = validateInstagramUrl('https://instagram.com/reel/ABC123?utm_source=share'); + * if (!result.valid) { + * console.error(result.error); + * } + * ``` + */ +export function validateInstagramUrl(url: string): ValidationResult { + // Validate URL is a string + if (typeof url !== 'string' || url.trim() === '') { + return { + valid: false, + error: 'URL must be a non-empty string' + }; + } + + // Parse URL + let urlObj: URL; + try { + urlObj = new URL(url); + } catch (e) { + return { + valid: false, + error: 'Invalid URL format' + }; + } + + // Validate protocol (must be HTTPS) + if (urlObj.protocol !== 'https:') { + return { + valid: false, + error: 'Instagram URL must use HTTPS protocol' + }; + } + + // Validate hostname (must be instagram.com or www.instagram.com) + const validHostnames = ['instagram.com', 'www.instagram.com']; + if (!validHostnames.includes(urlObj.hostname)) { + return { + valid: false, + error: 'URL must be from instagram.com domain' + }; + } + + // Valid Instagram URL + return { valid: true }; +} diff --git a/src/routes/api/queue/+server.ts b/src/routes/api/queue/+server.ts index e86990f..2123c0a 100644 --- a/src/routes/api/queue/+server.ts +++ b/src/routes/api/queue/+server.ts @@ -8,6 +8,7 @@ import { json, error } from '@sveltejs/kit'; import { queueManager } from '$lib/server/queue/QueueManager'; +import { validateInstagramUrl } from '$lib/server/validation/instagram-url'; import type { RequestHandler } from './$types'; /** @@ -41,12 +42,10 @@ export const POST: RequestHandler = async ({ request }) => { return error(400, { message: 'URL is required and must be a string' }); } - // Validate Instagram URL format - const instagramUrlPattern = /^https:\/\/(www\.)?instagram\.com\/p\/[a-zA-Z0-9_-]+\/?$/; - if (!instagramUrlPattern.test(url)) { - return error(400, { - message: 'Invalid Instagram URL format. Expected: https://instagram.com/p/{post-id}' - }); + // Validate Instagram URL format using utility + const validation = validateInstagramUrl(url); + if (!validation.valid) { + return error(400, { message: validation.error || 'Invalid Instagram URL' }); } // Enqueue the URL diff --git a/src/tests/instagram-url-validation.spec.ts b/src/tests/instagram-url-validation.spec.ts new file mode 100644 index 0000000..14f3d43 --- /dev/null +++ b/src/tests/instagram-url-validation.spec.ts @@ -0,0 +1,139 @@ +import { describe, it, expect } from 'vitest'; +import { validateInstagramUrl } from '$lib/server/validation/instagram-url'; + +describe('Instagram URL Validation', () => { + describe('Valid URLs', () => { + it('should accept post URLs without www', () => { + const result = validateInstagramUrl('https://instagram.com/p/ABC123'); + expect(result.valid).toBe(true); + expect(result.error).toBeUndefined(); + }); + + it('should accept post URLs with www', () => { + const result = validateInstagramUrl('https://www.instagram.com/p/XYZ789'); + expect(result.valid).toBe(true); + }); + + it('should accept reel URLs', () => { + const result = validateInstagramUrl('https://instagram.com/reel/DSevV5CDcNm'); + expect(result.valid).toBe(true); + }); + + it('should accept reel URLs with query parameters', () => { + const result = validateInstagramUrl( + 'https://www.instagram.com/reel/DSevV5CDcNm/?utm_source=ig_web_copy_link' + ); + expect(result.valid).toBe(true); + }); + + it('should accept IGTV URLs', () => { + const result = validateInstagramUrl('https://instagram.com/tv/ABC123'); + expect(result.valid).toBe(true); + }); + + it('should accept URLs with multiple query parameters', () => { + const result = validateInstagramUrl( + 'https://instagram.com/p/ABC123?utm_source=share&utm_medium=social' + ); + expect(result.valid).toBe(true); + }); + + it('should accept URLs with trailing slash', () => { + const result = validateInstagramUrl('https://instagram.com/p/ABC123/'); + expect(result.valid).toBe(true); + }); + + it('should accept URLs with hash fragments', () => { + const result = validateInstagramUrl('https://instagram.com/p/ABC123#section'); + expect(result.valid).toBe(true); + }); + }); + + describe('Invalid Protocol', () => { + it('should reject HTTP URLs', () => { + const result = validateInstagramUrl('http://instagram.com/p/ABC123'); + expect(result.valid).toBe(false); + expect(result.error).toContain('HTTPS'); + }); + + it('should reject FTP URLs', () => { + const result = validateInstagramUrl('ftp://instagram.com/p/ABC123'); + expect(result.valid).toBe(false); + expect(result.error).toContain('HTTPS'); + }); + }); + + describe('Invalid Domain', () => { + it('should reject non-Instagram domains', () => { + const result = validateInstagramUrl('https://facebook.com/post/123'); + expect(result.valid).toBe(false); + expect(result.error).toContain('instagram.com'); + }); + + it('should reject malicious look-alike domains', () => { + const result = validateInstagramUrl('https://instagram.com.evil.com/p/ABC123'); + expect(result.valid).toBe(false); + expect(result.error).toContain('instagram.com'); + }); + + it('should reject subdomains other than www', () => { + const result = validateInstagramUrl('https://api.instagram.com/p/ABC123'); + expect(result.valid).toBe(false); + expect(result.error).toContain('instagram.com'); + }); + + it('should reject completely different domains', () => { + const result = validateInstagramUrl('https://example.com'); + expect(result.valid).toBe(false); + }); + }); + + describe('Invalid URL Format', () => { + it('should reject invalid URL strings', () => { + const result = validateInstagramUrl('not-a-url'); + expect(result.valid).toBe(false); + expect(result.error).toContain('Invalid URL format'); + }); + + it('should reject empty strings', () => { + const result = validateInstagramUrl(''); + expect(result.valid).toBe(false); + expect(result.error).toContain('non-empty string'); + }); + + it('should reject whitespace-only strings', () => { + const result = validateInstagramUrl(' '); + expect(result.valid).toBe(false); + expect(result.error).toContain('non-empty string'); + }); + + it('should reject relative URLs', () => { + const result = validateInstagramUrl('/p/ABC123'); + expect(result.valid).toBe(false); + expect(result.error).toContain('Invalid URL format'); + }); + }); + + describe('Edge Cases', () => { + it('should handle URLs with Unicode characters in query params', () => { + const result = validateInstagramUrl('https://instagram.com/p/ABC123?text=hello%20world'); + expect(result.valid).toBe(true); + }); + + it('should handle URLs with port numbers', () => { + // Instagram doesn't use custom ports, but URL should parse + const result = validateInstagramUrl('https://instagram.com:443/p/ABC123'); + expect(result.valid).toBe(true); + }); + + it('should accept stories URLs', () => { + const result = validateInstagramUrl('https://instagram.com/stories/username/123456789'); + expect(result.valid).toBe(true); + }); + + it('should accept any Instagram path', () => { + const result = validateInstagramUrl('https://instagram.com/any/path/here'); + expect(result.valid).toBe(true); + }); + }); +}); diff --git a/src/tests/queue-api.spec.ts b/src/tests/queue-api.spec.ts index d5fdb7f..993f2ec 100644 --- a/src/tests/queue-api.spec.ts +++ b/src/tests/queue-api.spec.ts @@ -71,10 +71,72 @@ describe('Queue API Endpoints', () => { expect(item?.url).toBe('https://www.instagram.com/p/XYZ789'); }); + it('should accept Instagram reel URLs', async () => { + const request = new Request('http://localhost/api/queue', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + url: 'https://instagram.com/reel/ABC123' + }) + }); + + const response = await queuePOST({ request } as any); + expect(response.status).toBe(200); + const data = await response.json(); + expect(data.url).toBe('https://instagram.com/reel/ABC123'); + }); + + it('should accept Instagram URLs with query parameters', async () => { + const request = new Request('http://localhost/api/queue', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + url: 'https://www.instagram.com/reel/DSevV5CDcNm/?utm_source=ig_web_copy_link' + }) + }); + + const response = await queuePOST({ request } as any); + expect(response.status).toBe(200); + const data = await response.json(); + expect(data.url).toBe('https://www.instagram.com/reel/DSevV5CDcNm/?utm_source=ig_web_copy_link'); + }); + + it('should accept Instagram IGTV URLs', async () => { + const request = new Request('http://localhost/api/queue', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + url: 'https://instagram.com/tv/XYZ789' + }) + }); + + const response = await queuePOST({ request } as any); + expect(response.status).toBe(200); + }); + + it('should reject HTTP (non-HTTPS) URLs', async () => { + const request = new Request('http://localhost/api/queue', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + url: 'http://instagram.com/p/ABC123' + }) + }); + + try { + const response = await queuePOST({ request } as any); + expect(response.status).toBe(400); + const data = await response.json(); + expect(data.message).toContain('HTTPS'); + } catch (err: any) { + expect(err.status).toBe(400); + expect(err.body.message).toContain('HTTPS'); + } + }); + it('should reject invalid Instagram URL formats', async () => { const invalidUrls = [ 'https://facebook.com/post/123', - 'https://instagram.com/user/profile', 'not-a-url', 'https://other-site.com' ]; @@ -93,11 +155,12 @@ describe('Queue API Endpoints', () => { // If we get here, check the response status expect(response.status).toBe(400); const data = await response.json(); - expect(data.message).toBe('Invalid Instagram URL format. Expected: https://instagram.com/p/{post-id}'); + // Updated to check for new error messages + expect(data.message).toBeTruthy(); } catch (err: any) { // SvelteKit's error() throws - check the error expect(err.status).toBe(400); - expect(err.body.message).toBe('Invalid Instagram URL format. Expected: https://instagram.com/p/{post-id}'); + expect(err.body.message).toBeTruthy(); } } @@ -105,6 +168,33 @@ describe('Queue API Endpoints', () => { expect(queueManager.getAll()).toHaveLength(0); }); + it('should reject non-Instagram domains', async () => { + const invalidUrls = [ + 'https://facebook.com/post/123', + 'https://twitter.com/status/456', + 'https://example.com', + 'https://instagram.com.evil.com/p/123' + ]; + + for (const url of invalidUrls) { + const request = new Request('http://localhost/api/queue', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ url }) + }); + + try { + const response = await queuePOST({ request } as any); + expect(response.status).toBe(400); + const data = await response.json(); + expect(data.message).toContain('instagram.com'); + } catch (err: any) { + expect(err.status).toBe(400); + expect(err.body.message).toContain('instagram.com'); + } + } + }); + it('should reject missing URL', async () => { const request = new Request('http://localhost/api/queue', { method: 'POST',