Files
trueref-legacy/src/lib/server/search/query-preprocessor.ts
2026-03-27 03:01:37 +01:00

118 lines
3.8 KiB
TypeScript

/**
* Query preprocessor for FTS5 search queries.
*
* Normalizes raw user input into an FTS5-compatible MATCH expression
* with prefix wildcard expansion on the last token. Handles punctuation-heavy
* and code-like queries by extracting searchable alphanumeric/underscore terms.
*/
/** FTS5 boolean operator keywords. FTS5 only recognises them in upper case. */
const FTS5_OPERATORS: ReadonlySet<string> = new Set(['AND', 'OR', 'NOT']);

/**
 * One searchable term: a run of letters, combining marks, digits or underscores.
 *
 * Everything outside this class (quotes, `=`, brackets, dots, slashes, ...) acts
 * as a separator. That keeps every emitted term a valid FTS5 bareword, which may
 * only contain ASCII alphanumerics, `_` and non-ASCII characters.
 */
const SEARCHABLE_TERM_PATTERN = /[\p{L}\p{M}\p{N}_]+/gu;

/** Returns true when `token` is exactly one of the FTS5 operator keywords. */
function isFtsOperator(token: string | undefined): boolean {
	return token !== undefined && FTS5_OPERATORS.has(token);
}

/**
 * Preprocess a raw search query string for FTS5 MATCH.
 *
 * Steps:
 * 1. Trim the input and split it on whitespace.
 * 2. Keep the upper-case FTS5 operators (AND, OR, NOT); from every other token
 *    extract the runs of letters/digits/underscores (Unicode-aware), so that
 *    `foo_bar.baz()` yields `foo_bar` and `baz` and `foo=bar` yields `foo` and `bar`.
 *    All other punctuation, including `*` and quotes, is discarded.
 * 3. Drop a term that already appeared earlier in the query (de-duplication is
 *    global, not per token).
 * 4. Make the operator sequence valid for FTS5, whose operators are all binary:
 *    leading and trailing operators are dropped and a run of adjacent operators
 *    collapses to the last one (`foo AND NOT bar` -> `foo NOT bar`).
 * 5. Append a prefix wildcard `*` to the last term when it is >= 3 characters.
 *
 * @param raw - The query exactly as typed by the user.
 * @returns An FTS5 MATCH expression, or an empty string when no searchable
 * term remains.
 */
export function preprocessQuery(raw: string): string {
	const query = raw.trim();
	if (!query) return '';

	const tokens: string[] = [];

	for (const rawToken of query.split(/\s+/)) {
		// A whole-token operator is kept verbatim; anything else is mined for terms.
		const pieces = isFtsOperator(rawToken)
			? [rawToken]
			: (rawToken.match(SEARCHABLE_TERM_PATTERN) ?? []);

		for (const piece of pieces) {
			if (isFtsOperator(piece)) {
				// An operator needs a left operand; skip it at the start of the query.
				if (tokens.length === 0) continue;

				if (isFtsOperator(tokens[tokens.length - 1])) {
					// Two operators in a row are a syntax error in FTS5: keep the later one.
					tokens[tokens.length - 1] = piece;
				} else {
					tokens.push(piece);
				}
			} else if (!tokens.includes(piece)) {
				tokens.push(piece);
			}
		}
	}

	// An operator without a right operand is invalid as well.
	while (isFtsOperator(tokens[tokens.length - 1])) {
		tokens.pop();
	}

	const lastIdx = tokens.length - 1;
	const lastTerm = tokens[lastIdx];
	if (lastTerm === undefined) return '';

	// Prefix-expand the term the user is most likely still typing. Very short
	// prefixes are left alone. `*` can never already be present because it is
	// not part of SEARCHABLE_TERM_PATTERN.
	if (lastTerm.length >= 3) {
		tokens[lastIdx] = lastTerm + '*';
	}

	return tokens.join(' ');
}