fix(memory/holographic): sanitize FTS5 queries for natural-language recall
The FactRetriever's _fts_candidates passed the raw query string directly to FTS5's MATCH operator. FTS5 defaults to AND-between-tokens, which means any multi-word prose query like 'what happened with the deployment rollback' required every single token to co-occur in a fact — dropping recall to zero on the kind of queries agents actually issue via prefetch(). Fix: add _sanitize_fts_query() that: - tokenizes the query and drops English stopwords - strips FTS5 operator characters per token - OR-joins the remaining content tokens as phrase literals For pathological inputs (all stopwords, empty), falls back to the raw query so the caller sees zero results instead of a SQL error. This is a pure-retrieval-quality fix — the HRR + Jaccard reranking stages still keep precision high. Ships with 10 tests covering the sanitizer and retrieval integration.
This commit is contained in:
parent
2a3dbcaf46
commit
cb6d6d46ab
2 changed files with 191 additions and 1 deletions
|
|
@ -496,7 +496,11 @@ class FactRetriever:
|
|||
# We need to join facts_fts with facts to get all columns
|
||||
params: list = []
|
||||
where_clauses = ["facts_fts MATCH ?"]
|
||||
params.append(query)
|
||||
# FTS5 defaults to AND-between-tokens, which kills recall on
|
||||
# natural-language queries ("what happened with the deployment
|
||||
# rollback"). Sanitize: drop stopwords, OR-join content tokens, so
|
||||
# any significant term can match.
|
||||
params.append(self._sanitize_fts_query(query))
|
||||
|
||||
if category:
|
||||
where_clauses.append("f.category = ?")
|
||||
|
|
@ -557,6 +561,63 @@ class FactRetriever:
|
|||
tokens.add(cleaned)
|
||||
return tokens
|
||||
|
||||
# Stopwords dropped before FTS5 OR-expansion. Short English function
|
||||
# words that carry no retrieval signal and force false-negative AND
|
||||
# matches when left in the query.
|
||||
_FTS_STOPWORDS = frozenset({
|
||||
"a", "about", "above", "after", "again", "all", "am", "an", "and",
|
||||
"any", "are", "as", "at", "be", "because", "been", "before", "being",
|
||||
"between", "both", "but", "by", "can", "could", "did", "do", "does",
|
||||
"doing", "don", "down", "during", "each", "few", "for", "from",
|
||||
"further", "had", "has", "have", "having", "he", "her", "here",
|
||||
"hers", "herself", "him", "himself", "his", "how", "i", "if", "in",
|
||||
"into", "is", "it", "its", "itself", "just", "me", "more", "most",
|
||||
"my", "myself", "no", "nor", "not", "now", "of", "off", "on", "once",
|
||||
"only", "or", "other", "our", "ours", "ourselves", "out", "over",
|
||||
"own", "same", "she", "should", "so", "some", "such", "than", "that",
|
||||
"the", "their", "theirs", "them", "themselves", "then", "there",
|
||||
"these", "they", "this", "those", "through", "to", "too", "under",
|
||||
"until", "up", "very", "was", "we", "were", "what", "when", "where",
|
||||
"which", "while", "who", "whom", "why", "will", "with", "would",
|
||||
"you", "your", "yours", "yourself", "yourselves",
|
||||
})
|
||||
|
||||
@classmethod
|
||||
def _sanitize_fts_query(cls, query: str) -> str:
|
||||
"""Convert a natural-language query to an FTS5-safe OR expression.
|
||||
|
||||
FTS5 treats a multi-word MATCH argument as AND-joined by default,
|
||||
which tanks recall on prose queries. This helper:
|
||||
- tokenizes the query
|
||||
- drops stopwords and short (<2 char) tokens
|
||||
- strips FTS5 special characters from each token
|
||||
- OR-joins the survivors
|
||||
|
||||
If nothing remains (pathological query), falls back to the raw
|
||||
query so the caller sees zero results instead of a SQL error.
|
||||
"""
|
||||
if not query:
|
||||
return ""
|
||||
# Strip FTS5 operator characters from EACH token to avoid
|
||||
# accidentally creating a malformed query.
|
||||
_FTS_SPECIAL = '"()*^:-+'
|
||||
tokens: list[str] = []
|
||||
for raw in query.lower().split():
|
||||
cleaned = raw.strip(".,;:!?\"'()[]{}#@<>") .translate(
|
||||
str.maketrans("", "", _FTS_SPECIAL)
|
||||
)
|
||||
if len(cleaned) < 2:
|
||||
continue
|
||||
if cleaned in cls._FTS_STOPWORDS:
|
||||
continue
|
||||
# FTS5 phrase-literal each token to ensure no special chars
|
||||
# sneak through as operators.
|
||||
tokens.append(f'"{cleaned}"')
|
||||
if not tokens:
|
||||
# Fallback: raw query (likely returns 0, but never crashes)
|
||||
return query
|
||||
return " OR ".join(tokens)
|
||||
|
||||
@staticmethod
|
||||
def _jaccard_similarity(set_a: set, set_b: set) -> float:
|
||||
"""Jaccard similarity coefficient: |A ∩ B| / |A ∪ B|."""
|
||||
|
|
|
|||
129
tests/plugins/memory/test_holographic_retrieval.py
Normal file
129
tests/plugins/memory/test_holographic_retrieval.py
Normal file
|
|
@ -0,0 +1,129 @@
|
|||
"""Tests for FactRetriever FTS5 query sanitization.
|
||||
|
||||
These tests cover the fix where raw natural-language queries passed to
|
||||
FTS5 MATCH were AND-joined by default, dropping recall to zero on any
|
||||
multi-word prose query. The sanitizer drops stopwords and OR-joins the
|
||||
remaining content tokens as phrase literals.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
pytest.importorskip("numpy") # retrieval module imports numpy indirectly
|
||||
|
||||
from plugins.memory.holographic.retrieval import FactRetriever
|
||||
from plugins.memory.holographic.store import MemoryStore
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _sanitize_fts_query — unit tests (no DB required)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"query,expected_tokens",
|
||||
[
|
||||
# stopwords dropped
|
||||
("what happened with the deployment rollback", {"happened", "deployment", "rollback"}),
|
||||
# single content word passes through
|
||||
("compaction", {"compaction"}),
|
||||
# all stopwords → falls back to raw
|
||||
("the and of", None), # None = sentinel for fallback-to-raw
|
||||
# empty string → empty output
|
||||
("", ""),
|
||||
# FTS5 operator characters stripped
|
||||
("context: length-probe", {"context", "lengthprobe"}),
|
||||
# trailing punctuation stripped by tokenizer
|
||||
("hello, world!", {"hello", "world"}),
|
||||
],
|
||||
)
|
||||
def test_sanitize_fts_query_extracts_content_tokens(query, expected_tokens):
|
||||
result = FactRetriever._sanitize_fts_query(query)
|
||||
|
||||
if expected_tokens == "":
|
||||
assert result == ""
|
||||
return
|
||||
|
||||
if expected_tokens is None:
|
||||
# Pathological case: all stopwords — should fall back to raw query
|
||||
assert result == query
|
||||
return
|
||||
|
||||
# OR-joined phrase literals: `"tok1" OR "tok2" OR ...`
|
||||
# Extract the tokens between quotes, order-independent.
|
||||
import re
|
||||
matches = re.findall(r'"([^"]+)"', result)
|
||||
assert set(matches) == expected_tokens, f"got {result!r}"
|
||||
|
||||
|
||||
def test_sanitize_fts_query_never_crashes_on_fts5_specials():
|
||||
"""Queries with FTS5 operator characters must not produce malformed SQL."""
|
||||
problematic = [
|
||||
'test " query',
|
||||
"test * query",
|
||||
"test (a OR b) query",
|
||||
"test^2 query",
|
||||
"test:colon query",
|
||||
"test-hyphen query",
|
||||
"a" * 1000, # long query
|
||||
]
|
||||
for q in problematic:
|
||||
result = FactRetriever._sanitize_fts_query(q)
|
||||
# We just need it to return a string without raising
|
||||
assert isinstance(result, str)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Integration test — actually run _fts_candidates against an in-memory DB
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@pytest.fixture
|
||||
def retriever_with_facts(tmp_path):
|
||||
"""MemoryStore seeded with a few facts for retrieval tests."""
|
||||
db_path = tmp_path / "test_facts.db"
|
||||
store = MemoryStore(str(db_path))
|
||||
store.add_fact(
|
||||
content="The Thursday deployment rollback failed because of stale migration state.",
|
||||
category="project",
|
||||
)
|
||||
store.add_fact(
|
||||
content="Compaction settings tuned to 0.85 threshold.",
|
||||
category="tool",
|
||||
)
|
||||
store.add_fact(
|
||||
content="Venice.ai advertises availableContextTokens inside model_spec.",
|
||||
category="tool",
|
||||
)
|
||||
retriever = FactRetriever(store=store)
|
||||
yield retriever
|
||||
store.close()
|
||||
|
||||
|
||||
def test_prefetch_recovers_prose_query(retriever_with_facts):
|
||||
"""A natural-language query should now match the relevant fact.
|
||||
|
||||
Before the sanitizer fix, 'what happened with the deployment rollback'
|
||||
returned zero hits because FTS5 required every token to co-occur.
|
||||
"""
|
||||
results = retriever_with_facts.search(
|
||||
"what happened with the deployment rollback"
|
||||
)
|
||||
assert len(results) >= 1
|
||||
# The top hit should be the deployment rollback fact
|
||||
assert "deployment rollback" in results[0]["content"].lower()
|
||||
|
||||
|
||||
def test_prefetch_single_keyword_still_works(retriever_with_facts):
|
||||
"""Single-term queries (pre-fix working case) remain working."""
|
||||
results = retriever_with_facts.search("compaction")
|
||||
assert len(results) >= 1
|
||||
assert "Compaction" in results[0]["content"] or "compaction" in results[0]["content"].lower()
|
||||
|
||||
|
||||
def test_prefetch_stopword_only_query_empty(retriever_with_facts):
|
||||
"""Pure stopword queries return zero results but don't crash."""
|
||||
# Pass to _sanitize_fts_query directly first so we know what happens
|
||||
assert FactRetriever._sanitize_fts_query("the and of") == "the and of"
|
||||
# search() handles the likely-zero-hit case gracefully
|
||||
results = retriever_with_facts.search("the and of")
|
||||
# Either zero results or it errored-gracefully to [] — both are fine
|
||||
assert isinstance(results, list)
|
||||
Loading…
Add table
Add a link
Reference in a new issue