/**
 * Query preprocessor for FTS5 search queries.
 *
 * Normalizes raw user input into an FTS5-compatible MATCH expression
 * with prefix wildcard expansion on the last token. Handles punctuation-heavy
 * and code-like queries by extracting searchable letter/digit/underscore terms.
 */

/** FTS5 boolean operators. They are case-sensitive: only the upper-case form is an operator. */
const FTS5_OPERATORS: ReadonlySet<string> = new Set(['AND', 'OR', 'NOT']);

/**
 * A maximal run of characters that may appear in a search term: Unicode
 * letters, digits, combining marks and underscore. Every character matched
 * here is either an ASCII alphanumeric/underscore or a non-ASCII character,
 * so each run is a valid unquoted FTS5 bareword. Everything else (quotes,
 * brackets, `=`, `*`, `.`, whitespace, control characters, ...) acts as a
 * separator and is dropped.
 */
const SEARCHABLE_RUN = /[\p{L}\p{N}\p{M}_]+/gu;

/** Shortest final term that receives the automatic prefix wildcard. */
const MIN_PREFIX_LENGTH = 3;

/**
 * Preprocess a raw search query string for FTS5 MATCH.
 *
 * Steps:
 * 1. Extract every run of letters/digits/marks/underscores from the input.
 *    All other characters, including whitespace, quotes and a user-typed `*`,
 *    only separate terms. Code-like input therefore splits into its parts:
 *    `foo.bar(baz_qux)` yields `foo`, `bar`, `baz_qux`.
 * 2. A run that is exactly `AND`, `OR` or `NOT` is treated as an operator.
 *    An operator is emitted only between two emitted terms: leading and
 *    trailing operators are dropped, and when several operators occur in a
 *    row the last one wins (`a AND NOT b` becomes `a NOT b`), because FTS5
 *    rejects consecutive operators and has no unary `NOT`.
 * 3. Terms are de-duplicated across the whole query (first occurrence wins).
 * 4. The last term gets a prefix wildcard `*` when it is at least three
 *    UTF-16 code units long. Input `*` characters never survive step 1, so
 *    the wildcard cannot be doubled.
 *
 * @param raw - Unsanitized user input.
 * @returns An FTS5 MATCH expression, or the empty string when the input
 *   contains no searchable term.
 */
export function preprocessQuery(raw: string): string {
  // String.prototype.match with a global regex always scans from index 0,
  // so sharing the module-level pattern between calls is safe.
  const pieces = raw.match(SEARCHABLE_RUN) ?? [];

  const output: string[] = [];
  const seenTerms = new Set<string>();
  // Most recent operator seen since the last emitted term. It is written to
  // the output only when another new term follows, which guarantees the
  // result never starts or ends with an operator and never contains two
  // operators in a row.
  let pendingOperator: string | undefined;

  for (const piece of pieces) {
    if (FTS5_OPERATORS.has(piece)) {
      // An operator with nothing on its left has no operand; ignore it.
      if (output.length > 0) {
        pendingOperator = piece;
      }
      continue;
    }

    // Duplicate term: skip it but keep any pending operator so that it can
    // still connect the previous term to the next new one.
    if (seenTerms.has(piece)) continue;
    seenTerms.add(piece);

    if (pendingOperator !== undefined) {
      output.push(pendingOperator);
      pendingOperator = undefined;
    }
    output.push(piece);
  }

  // Operators are only ever pushed immediately before a term, so the final
  // element (if any) is always a searchable term.
  const lastTerm = output.pop();
  if (lastTerm === undefined) return '';

  output.push(lastTerm.length >= MIN_PREFIX_LENGTH ? `${lastTerm}*` : lastTerm);
  return output.join(' ');
}