/**
 * Query preprocessor for FTS5 search queries.
 *
 * Normalizes raw user input into an FTS5-compatible MATCH expression
 * with prefix wildcard expansion on the last token. Handles punctuation-heavy
 * and code-like queries by extracting searchable alphanumeric/underscore terms.
 */

/** FTS5 boolean keywords. They are matched case-sensitively (upper case only). */
const FTS5_OPERATORS: ReadonlySet<string> = new Set(['AND', 'OR', 'NOT']);

/**
 * A run of characters that can appear in an unquoted FTS5 bareword:
 * letters, digits and combining marks from any script, plus underscore.
 * Everything else (quotes, brackets, `=`, `*`, `.`, `/`, ...) acts as a separator.
 */
const BAREWORD_RUN = /[\p{L}\p{N}\p{M}_]+/gu;

/** Minimum length of the final term for it to receive a `*` prefix wildcard. */
const MIN_PREFIX_LENGTH = 3;

/**
 * Preprocess a raw search query string for FTS5 MATCH.
 *
 * Steps:
 * 1. Trim the input and split it on whitespace.
 * 2. Keep tokens that are exactly `AND`, `OR` or `NOT` as operators.
 * 3. From every other token, extract the runs of letters, digits, combining
 *    marks and underscores. All other characters are treated as separators,
 *    so `foo.bar(baz)` yields `foo bar baz`, `snake_case` stays intact and
 *    `don't` yields `don t`. A term repeated inside one token is kept once;
 *    repeats across tokens are preserved so boolean operands are never lost.
 * 4. Emit an operator only when it sits between two terms. Leading and
 *    trailing operators are dropped, and in a run of consecutive operators the
 *    last one wins (`foo AND NOT bar` becomes `foo NOT bar`).
 * 5. Append a prefix wildcard `*` to the final term when it is at least
 *    3 characters long. Any `*` typed by the user is removed in step 3, so the
 *    wildcard only ever appears on the final term.
 *
 * @param raw - The query exactly as typed by the user.
 * @returns A space-separated expression made of bareword terms and
 *   `AND`/`OR`/`NOT`, or an empty string when no searchable term remains.
 */
export function preprocessQuery(raw: string): string {
  // Steps 1-3: flatten the input into an ordered list of operators and terms.
  const tokens: string[] = [];
  for (const rawToken of raw.trim().split(/\s+/)) {
    if (FTS5_OPERATORS.has(rawToken)) {
      tokens.push(rawToken);
      continue;
    }

    // De-duplicate within this token only (e.g. `foo.foo`); a global check
    // could remove the operand between two operators.
    const seenInToken = new Set<string>();
    for (const term of rawToken.match(BAREWORD_RUN) ?? []) {
      if (!seenInToken.has(term)) {
        seenInToken.add(term);
        tokens.push(term);
      }
    }
  }

  // Step 4: an operator is held back until the term that follows it arrives,
  // which guarantees the output never starts or ends with an operator and
  // never contains two operators in a row.
  const output: string[] = [];
  let pendingOperator: string | undefined;
  for (const token of tokens) {
    if (FTS5_OPERATORS.has(token)) {
      // Without a left-hand term the operator is meaningless; otherwise the
      // most recent operator replaces any earlier pending one.
      if (output.length > 0) {
        pendingOperator = token;
      }
      continue;
    }

    if (pendingOperator !== undefined) {
      output.push(pendingOperator);
      pendingOperator = undefined;
    }
    output.push(token);
  }

  // Step 5: the last element, if any, is always a term.
  const lastTerm = output.pop();
  if (lastTerm === undefined) {
    return '';
  }
  output.push(lastTerm.length >= MIN_PREFIX_LENGTH ? `${lastTerm}*` : lastTerm);

  return output.join(' ');
}