fix(memory/holographic): sanitize FTS5 queries for natural-language recall

The FactRetriever's _fts_candidates passed the raw query string directly
to FTS5's MATCH operator. FTS5 defaults to AND-between-tokens, which
means any multi-word prose query like 'what happened with the deployment
rollback' required every single token to co-occur in a fact — dropping
recall to zero on the kind of queries agents actually issue via prefetch().

Fix: add _sanitize_fts_query() that:
- tokenizes the query and drops English stopwords
- strips FTS5 operator characters per token
- OR-joins the remaining content tokens as phrase literals

For pathological inputs (all stopwords, empty), falls back to the raw
query so the caller sees zero results instead of a SQL error.

This is a pure-retrieval-quality fix — the HRR + Jaccard reranking
stages still keep precision high. Ships with 10 tests covering the
sanitizer and retrieval integration.
This commit is contained in:
cyb3rwr3n 2026-04-16 19:40:19 -07:00 committed by Teknium
parent 2a3dbcaf46
commit cb6d6d46ab
2 changed files with 191 additions and 1 deletions

View file

@ -496,7 +496,11 @@ class FactRetriever:
# We need to join facts_fts with facts to get all columns
params: list = []
where_clauses = ["facts_fts MATCH ?"]
params.append(query)
# FTS5 defaults to AND-between-tokens, which kills recall on
# natural-language queries ("what happened with the deployment
# rollback"). Sanitize: drop stopwords, OR-join content tokens, so
# any significant term can match.
params.append(self._sanitize_fts_query(query))
if category:
where_clauses.append("f.category = ?")
@ -557,6 +561,63 @@ class FactRetriever:
tokens.add(cleaned)
return tokens
# Stopwords dropped before FTS5 OR-expansion. Short English function
# words that carry no retrieval signal and force false-negative AND
# matches when left in the query.
_FTS_STOPWORDS = frozenset({
"a", "about", "above", "after", "again", "all", "am", "an", "and",
"any", "are", "as", "at", "be", "because", "been", "before", "being",
"between", "both", "but", "by", "can", "could", "did", "do", "does",
"doing", "don", "down", "during", "each", "few", "for", "from",
"further", "had", "has", "have", "having", "he", "her", "here",
"hers", "herself", "him", "himself", "his", "how", "i", "if", "in",
"into", "is", "it", "its", "itself", "just", "me", "more", "most",
"my", "myself", "no", "nor", "not", "now", "of", "off", "on", "once",
"only", "or", "other", "our", "ours", "ourselves", "out", "over",
"own", "same", "she", "should", "so", "some", "such", "than", "that",
"the", "their", "theirs", "them", "themselves", "then", "there",
"these", "they", "this", "those", "through", "to", "too", "under",
"until", "up", "very", "was", "we", "were", "what", "when", "where",
"which", "while", "who", "whom", "why", "will", "with", "would",
"you", "your", "yours", "yourself", "yourselves",
})
@classmethod
def _sanitize_fts_query(cls, query: str) -> str:
"""Convert a natural-language query to an FTS5-safe OR expression.
FTS5 treats a multi-word MATCH argument as AND-joined by default,
which tanks recall on prose queries. This helper:
- tokenizes the query
- drops stopwords and short (<2 char) tokens
- strips FTS5 special characters from each token
- OR-joins the survivors
If nothing remains (pathological query), falls back to the raw
query so the caller sees zero results instead of a SQL error.
"""
if not query:
return ""
# Strip FTS5 operator characters from EACH token to avoid
# accidentally creating a malformed query.
_FTS_SPECIAL = '"()*^:-+'
tokens: list[str] = []
for raw in query.lower().split():
cleaned = raw.strip(".,;:!?\"'()[]{}#@<>") .translate(
str.maketrans("", "", _FTS_SPECIAL)
)
if len(cleaned) < 2:
continue
if cleaned in cls._FTS_STOPWORDS:
continue
# FTS5 phrase-literal each token to ensure no special chars
# sneak through as operators.
tokens.append(f'"{cleaned}"')
if not tokens:
# Fallback: raw query (likely returns 0, but never crashes)
return query
return " OR ".join(tokens)
@staticmethod
def _jaccard_similarity(set_a: set, set_b: set) -> float:
"""Jaccard similarity coefficient: |A ∩ B| / |A B|."""

View file

@ -0,0 +1,129 @@
"""Tests for FactRetriever FTS5 query sanitization.
These tests cover the fix where raw natural-language queries passed to
FTS5 MATCH were AND-joined by default, dropping recall to zero on any
multi-word prose query. The sanitizer drops stopwords and OR-joins the
remaining content tokens as phrase literals.
"""
from __future__ import annotations
import pytest
pytest.importorskip("numpy") # retrieval module imports numpy indirectly
from plugins.memory.holographic.retrieval import FactRetriever
from plugins.memory.holographic.store import MemoryStore
# ---------------------------------------------------------------------------
# _sanitize_fts_query — unit tests (no DB required)
# ---------------------------------------------------------------------------
@pytest.mark.parametrize(
"query,expected_tokens",
[
# stopwords dropped
("what happened with the deployment rollback", {"happened", "deployment", "rollback"}),
# single content word passes through
("compaction", {"compaction"}),
# all stopwords → falls back to raw
("the and of", None), # None = sentinel for fallback-to-raw
# empty string → empty output
("", ""),
# FTS5 operator characters stripped
("context: length-probe", {"context", "lengthprobe"}),
# trailing punctuation stripped by tokenizer
("hello, world!", {"hello", "world"}),
],
)
def test_sanitize_fts_query_extracts_content_tokens(query, expected_tokens):
result = FactRetriever._sanitize_fts_query(query)
if expected_tokens == "":
assert result == ""
return
if expected_tokens is None:
# Pathological case: all stopwords — should fall back to raw query
assert result == query
return
# OR-joined phrase literals: `"tok1" OR "tok2" OR ...`
# Extract the tokens between quotes, order-independent.
import re
matches = re.findall(r'"([^"]+)"', result)
assert set(matches) == expected_tokens, f"got {result!r}"
def test_sanitize_fts_query_never_crashes_on_fts5_specials():
"""Queries with FTS5 operator characters must not produce malformed SQL."""
problematic = [
'test " query',
"test * query",
"test (a OR b) query",
"test^2 query",
"test:colon query",
"test-hyphen query",
"a" * 1000, # long query
]
for q in problematic:
result = FactRetriever._sanitize_fts_query(q)
# We just need it to return a string without raising
assert isinstance(result, str)
# ---------------------------------------------------------------------------
# Integration test — actually run _fts_candidates against an in-memory DB
# ---------------------------------------------------------------------------
@pytest.fixture
def retriever_with_facts(tmp_path):
"""MemoryStore seeded with a few facts for retrieval tests."""
db_path = tmp_path / "test_facts.db"
store = MemoryStore(str(db_path))
store.add_fact(
content="The Thursday deployment rollback failed because of stale migration state.",
category="project",
)
store.add_fact(
content="Compaction settings tuned to 0.85 threshold.",
category="tool",
)
store.add_fact(
content="Venice.ai advertises availableContextTokens inside model_spec.",
category="tool",
)
retriever = FactRetriever(store=store)
yield retriever
store.close()
def test_prefetch_recovers_prose_query(retriever_with_facts):
"""A natural-language query should now match the relevant fact.
Before the sanitizer fix, 'what happened with the deployment rollback'
returned zero hits because FTS5 required every token to co-occur.
"""
results = retriever_with_facts.search(
"what happened with the deployment rollback"
)
assert len(results) >= 1
# The top hit should be the deployment rollback fact
assert "deployment rollback" in results[0]["content"].lower()
def test_prefetch_single_keyword_still_works(retriever_with_facts):
"""Single-term queries (pre-fix working case) remain working."""
results = retriever_with_facts.search("compaction")
assert len(results) >= 1
assert "Compaction" in results[0]["content"] or "compaction" in results[0]["content"].lower()
def test_prefetch_stopword_only_query_empty(retriever_with_facts):
"""Pure stopword queries return zero results but don't crash."""
# Pass to _sanitize_fts_query directly first so we know what happens
assert FactRetriever._sanitize_fts_query("the and of") == "the and of"
# search() handles the likely-zero-hit case gracefully
results = retriever_with_facts.search("the and of")
# Either zero results or it errored-gracefully to [] — both are fine
assert isinstance(results, list)