fix(FEEDBACK-0001): complete iteration 0 - harden context search
This commit is contained in:
@@ -2,7 +2,8 @@
|
||||
* Query preprocessor for FTS5 search queries.
|
||||
*
|
||||
* Normalizes raw user input into an FTS5-compatible MATCH expression
|
||||
* with prefix wildcard expansion on the last token.
|
||||
* with prefix wildcard expansion on the last token. Handles punctuation-heavy
|
||||
* and code-like queries by extracting searchable alphanumeric/underscore terms.
|
||||
*/
|
||||
|
||||
/**
|
||||
@@ -10,25 +11,104 @@
|
||||
*
|
||||
* Steps:
|
||||
* 1. Trim and normalize internal whitespace.
|
||||
* 2. Strip FTS5 grouping characters `(` and `)` that would cause parse errors.
|
||||
* 3. Append a prefix wildcard `*` to the last token when it is >= 3 characters
|
||||
* and does not already end with `*`. This gives a "typing as you go" feel.
|
||||
* 2. Preserve FTS5 operators (AND, OR, NOT) and extract alphanumeric/underscore terms.
|
||||
* 3. Strip punctuation that breaks FTS5 parsing (parentheses, brackets, special chars).
|
||||
* 4. Preserve searchable code-like patterns (snake_case, dot notation parts, etc.).
|
||||
* 5. Return empty string if no searchable terms remain; otherwise, append a prefix
|
||||
* wildcard `*` to the last token when it is >= 3 characters and does not already
|
||||
* end with `*`.
|
||||
*/
|
||||
export function preprocessQuery(raw: string): string {
  // Turns free-form user input into an expression that is safe to hand to an
  // FTS5 MATCH clause. Returns '' when nothing searchable remains.
  //
  // Pipeline:
  //   1. Split the input on whitespace.
  //   2. Keep standalone AND / OR / NOT as operators; reduce every other word
  //      to its letter/digit/underscore runs (whitelist, so quotes, `=`,
  //      apostrophes, etc. can never reach the FTS5 parser).
  //   3. Drop leading/trailing operators and collapse operator runs.
  //   4. Append a prefix wildcard to the final term when it is >= 3 chars.

  // 1. Trim and split on any whitespace run.
  const words = raw.trim().split(/\s+/).filter(Boolean);
  if (words.length === 0) return '';

  // 2. Classify each word and collect operators + unique searchable terms.
  const collected: string[] = [];
  const alreadyEmitted = new Set<string>();

  for (const word of words) {
    if (isFtsOperator(word)) {
      collected.push(word);
      alreadyEmitted.add(word);
      continue;
    }

    for (const term of extractSearchTerms(word)) {
      // Repeating a term adds nothing to the match, so emit each one once.
      if (!alreadyEmitted.has(term)) {
        alreadyEmitted.add(term);
        collected.push(term);
      }
    }
  }

  // 3. Rebuild the expression so that operators only ever sit between terms.
  const assembled: string[] = [];
  for (const entry of collected) {
    if (!isFtsOperator(entry)) {
      assembled.push(entry);
      continue;
    }

    // FTS5 operators are binary: one with no left-hand term is meaningless.
    if (assembled.length === 0) continue;

    if (isFtsOperator(assembled[assembled.length - 1])) {
      // Two operators in a row ("foo AND NOT bar") are a syntax error in
      // FTS5; keep only the last one of the run ("foo NOT bar").
      assembled[assembled.length - 1] = entry;
    } else {
      assembled.push(entry);
    }
  }

  // An operator with no right-hand term is equally invalid.
  while (assembled.length > 0 && isFtsOperator(assembled[assembled.length - 1])) {
    assembled.pop();
  }

  if (assembled.length === 0) return '';

  // 4. After the trim above the final entry is always a term. Give it a
  //    prefix wildcard for a "search as you type" feel, unless the user
  //    already supplied one or the term is too short to be selective.
  const lastIndex = assembled.length - 1;
  const lastTerm = assembled[lastIndex] ?? '';
  if (lastTerm.length >= 3 && !lastTerm.endsWith('*')) {
    assembled[lastIndex] = `${lastTerm}*`;
  }

  return assembled.join(' ');
}

/** True when `token` is exactly one of the (case-sensitive) FTS5 boolean keywords. */
function isFtsOperator(token: string | undefined): boolean {
  return token === 'AND' || token === 'OR' || token === 'NOT';
}

/**
 * Extracts the searchable terms from one whitespace-delimited word.
 *
 * A term is a maximal run of Unicode letters, combining marks, digits or
 * underscores; every other character acts as a separator. A `*` that directly
 * follows the word's final term (e.g. `foo*`) is kept as an explicit prefix
 * marker.
 */
function extractSearchTerms(word: string): string[] {
  const terms: string[] = word.match(/[\p{L}\p{M}\p{N}_]+/gu) ?? [];
  if (terms.length === 0) return terms;

  const lastIndex = terms.length - 1;
  const lastTerm = terms[lastIndex] ?? '';
  const hasExplicitPrefix = /[\p{L}\p{M}\p{N}_]\*$/u.test(word);

  // Never decorate a keyword-shaped term: "AND*" would not parse in FTS5.
  if (hasExplicitPrefix && !isFtsOperator(lastTerm)) {
    terms[lastIndex] = `${lastTerm}*`;
  }

  return terms;
}
|
||||
|
||||
Reference in New Issue
Block a user