trueref-legacy/src/lib/server/search/query-preprocessor.ts

/**
 * Query preprocessor for FTS5 search queries.
 *
 * Normalizes raw user input into an FTS5-compatible MATCH expression
 * with prefix wildcard expansion on the last token. Handles punctuation-heavy
 * and code-like queries by extracting searchable alphanumeric/underscore terms.
 */

/**
 * FTS5 boolean operators. FTS5 only recognises the upper-case spellings,
 * so lower-case `and` / `or` / `not` are treated as ordinary search terms.
 */
const FTS5_OPERATORS: ReadonlySet<string> = new Set(['AND', 'OR', 'NOT']);

/**
 * Matches one maximal run of characters that is safe to emit as an FTS5
 * bareword: Unicode letters, combining marks, digits and the underscore.
 * Everything else (ASCII punctuation, quotes, `=`, `*`, symbols, ...) acts as
 * a separator, so nothing that could break the FTS5 query parser is emitted.
 */
const FTS5_TERM_PATTERN = /[\p{L}\p{M}\p{N}_]+/gu;

/**
 * Preprocess a raw search query string for FTS5 MATCH.
 *
 * Steps:
 * 1. Trim the input and split it on whitespace.
 * 2. Keep whitespace-delimited `AND`, `OR` and `NOT` as FTS5 operators.
 * 3. Break every other token into searchable terms (runs of letters, digits and
 *    underscores, including non-ASCII letters); all punctuation is discarded, so
 *    `foo_bar.baz(qux)` yields `foo_bar`, `baz`, `qux` and `a=b` yields `a`, `b`.
 * 4. Drop terms that already occurred earlier in the query (first occurrence wins).
 * 5. Drop operators that have no operand on both sides: leading and trailing
 *    operators are removed, and a run of consecutive operators collapses to the
 *    last one (`foo AND NOT bar` becomes `foo NOT bar`), because FTS5 operators
 *    are strictly binary.
 * 6. Append the prefix wildcard `*` to the final term when it is >= 3 characters.
 *
 * A `*` typed by the user is treated as punctuation; only the final term can end
 * up with a wildcard. A term extracted from inside a token that happens to spell
 * an operator (e.g. the `OR` in `foo.OR.bar`) is treated as that operator.
 *
 * @param raw - Query text exactly as entered by the user.
 * @returns An FTS5 MATCH expression, or an empty string when the input contains
 *          no searchable terms.
 */
export function preprocessQuery(raw: string): string {
	const query = raw.trim();
	if (!query) return '';

	// Pass 1: turn the raw text into a flat list of operators and unique terms.
	const tokens: string[] = [];
	const seen = new Set<string>();

	for (const word of query.split(/\s+/)) {
		if (FTS5_OPERATORS.has(word)) {
			// Operators are never de-duplicated, but they are recorded so that an
			// identical term extracted later is skipped.
			tokens.push(word);
			seen.add(word);
			continue;
		}

		for (const term of word.match(FTS5_TERM_PATTERN) ?? []) {
			if (!seen.has(term)) {
				seen.add(term);
				tokens.push(term);
			}
		}
	}

	// Pass 2: emit terms, inserting an operator only when it sits between two
	// terms. Holding the operator back until the next term arrives drops leading
	// operators (nothing emitted yet), trailing operators (never flushed) and
	// collapses runs of operators to the last one seen.
	const finalTokens: string[] = [];
	let pendingOperator: string | undefined;

	for (const token of tokens) {
		if (FTS5_OPERATORS.has(token)) {
			pendingOperator = token;
			continue;
		}

		if (pendingOperator !== undefined && finalTokens.length > 0) {
			finalTokens.push(pendingOperator);
		}
		pendingOperator = undefined;
		finalTokens.push(token);
	}

	// finalTokens is either empty or ends with a search term.
	const lastIdx = finalTokens.length - 1;
	const lastTerm = finalTokens[lastIdx];
	if (lastTerm === undefined) return '';

	// Prefix-match the term the user is presumably still typing; very short
	// prefixes are left exact.
	if (lastTerm.length >= 3) {
		finalTokens[lastIdx] = `${lastTerm}*`;
	}

	return finalTokens.join(' ');
}