fix(FEEDBACK-0001): complete iteration 0 - harden context search

2026-03-27 01:25:46 +01:00
parent e7a2a83cdb
commit 16436bfab2
15 changed files with 1469 additions and 44 deletions
--- a/src/lib/server/search/query-preprocessor.ts
+++ b/src/lib/server/search/query-preprocessor.ts
@@ -2,7 +2,8 @@
 * Query preprocessor for FTS5 search queries.
 *
 * Normalizes raw user input into an FTS5-compatible MATCH expression
- * with prefix wildcard expansion on the last token.
+ * with prefix wildcard expansion on the last token. Handles punctuation-heavy
+ * and code-like queries by extracting searchable alphanumeric/underscore terms.
 */

 /**
@@ -10,25 +11,104 @@
 *
 * Steps:
 * 1. Trim and normalize internal whitespace.
- * 2. Strip FTS5 grouping characters `(` and `)` that would cause parse errors.
- * 3. Append a prefix wildcard `*` to the last token when it is >= 3 characters
- *    and does not already end with `*`. This gives a "typing as you go" feel.
+ * 2. Preserve FTS5 operators (AND, OR, NOT) and extract alphanumeric/underscore terms.
+ * 3. Strip punctuation that breaks FTS5 parsing (parentheses, brackets, special chars).
+ * 4. Preserve searchable code-like patterns (snake_case, dot notation parts, etc.).
+ * 5. Return empty string if no searchable terms remain; otherwise, append a prefix
+ *    wildcard `*` to the last non-operator term when it is >= 3 characters. A `*`
+ *    typed by the user is stripped with the other punctuation and is not preserved.
 */
 export function preprocessQuery(raw: string): string {
 	// 1. Trim and collapse whitespace.
 	let q = raw.trim().replace(/\s+/g, ' ');

-	// 2. Remove parentheses (not valid in simple FTS5 queries without explicit operators).
-	q = q.replace(/[()]/g, ' ').replace(/\s+/g, ' ').trim();
-
 	if (!q) return q;

-	// 3. Add prefix wildcard to the last token.
-	const tokens = q.split(' ');
-	const lastToken = tokens.at(-1) ?? '';
-	if (lastToken.length >= 3 && !lastToken.endsWith('*')) {
-		tokens[tokens.length - 1] = lastToken + '*';
+	// 2. Split into tokens while preserving FTS operators and extracting searchable terms.
+	const tokens = q.split(/\s+/);
+	const processedTokens: string[] = [];
+
+	for (const token of tokens) {
+		// Preserve FTS operators as-is.
+		if (['AND', 'OR', 'NOT'].includes(token)) {
+			processedTokens.push(token);
+			continue;
+		}
+
+		// Extract searchable terms from the token:
+		// - Keep alphanumeric sequences and underscores
+		// - Skip pure punctuation
+		// - Handle code-like patterns (foo_bar, foo.bar.baz, etc.)
+		const searchableTerms: string[] = [];
+
+		// Replace common separators with spaces, then split.
+		const sanitized = token
+			.replace(/[()[\]{}]/g, ' ') // Remove grouping characters
+			.replace(/[;:,!?]/g, ' ') // Remove punctuation that breaks FTS
+			.replace(/[<>|]/g, ' ') // Remove comparison/pipe chars
+			.replace(/[\-+*/%]/g, ' ') // Remove operators (but keep underscores)
+			.replace(/[@#$&^\\~\`]/g, ' '); // Remove special chars
+
+		// Split on dots and whitespace (slashes were already replaced with spaces above).
+		const parts = sanitized.split(/[./\s]+/).filter(Boolean);
+
+		for (const part of parts) {
+			// Keep parts that contain at least one ASCII letter, digit, or underscore.
+			if (/[a-zA-Z0-9_]/.test(part)) {
+				// Trim other characters from both ends only; interior ones (e.g. = or quotes) are kept.
+				const cleaned = part.replace(/^[^a-zA-Z0-9_]+|[^a-zA-Z0-9_]+$/g, '');
+				if (cleaned) {
+					searchableTerms.push(cleaned);
+				}
+			}
+		}
+
+		// Add each term once; the check spans all tokens processed so far, not just this one.
+		for (const term of searchableTerms) {
+			if (!processedTokens.includes(term)) {
+				processedTokens.push(term);
+			}
+		}
 	}

-	return tokens.join(' ');
+	// 3. Separate operators from searchable terms.
+	const searchableTerms = processedTokens.filter((t) => !['AND', 'OR', 'NOT'].includes(t));
+	
+	if (searchableTerms.length === 0) return '';
+
+	// 4. Reconstruct final tokens keeping operators between searchable terms.
+	const finalTokens: string[] = [];
+	for (const token of processedTokens) {
+		// The no-terms case already returned above, so only position matters here.
+		if (['AND', 'OR', 'NOT'].includes(token)) {
+			// Drop leading operators only; consecutive operators (e.g. AND OR) are kept as-is.
+			if (finalTokens.length > 0) {
+				finalTokens.push(token);
+			}
+		} else {
+			finalTokens.push(token);
+		}
+	}
+
+	// Remove trailing operators
+	while (finalTokens.length > 0 && ['AND', 'OR', 'NOT'].includes(finalTokens[finalTokens.length - 1])) {
+		finalTokens.pop();
+	}
+
+	if (finalTokens.length === 0) return '';
+
+	// 5. Add prefix wildcard to the last non-operator token.
+	let lastIdx = finalTokens.length - 1;
+	while (lastIdx >= 0 && ['AND', 'OR', 'NOT'].includes(finalTokens[lastIdx])) {
+		lastIdx--;
+	}
+
+	if (lastIdx >= 0) {
+		const lastToken = finalTokens[lastIdx];
+		if (lastToken.length >= 3 && !lastToken.endsWith('*')) {
+			finalTokens[lastIdx] = lastToken + '*';
+		}
+	}
+
+	return finalTokens.join(' ');
 }