fix(memory/holographic): sanitize FTS5 queries for natural-language recall

The FactRetriever's _fts_candidates passed the raw query string directly to FTS5's MATCH operator. FTS5 defaults to AND-between-tokens, which means any multi-word prose query like 'what happened with the deployment rollback' required every single token to co-occur in a fact — dropping recall to zero on the kind of queries agents actually issue via prefetch(). Fix: add _sanitize_fts_query() that: - tokenizes the query and drops English stopwords - strips FTS5 operator characters per token - OR-joins the remaining content tokens as phrase literals For pathological inputs (all stopwords, empty), falls back to the raw query so the caller sees zero results instead of a SQL error. This is a pure-retrieval-quality fix — the HRR + Jaccard reranking stages still keep precision high. Ships with 10 tests covering the sanitizer and retrieval integration.
2026-04-16 19:40:19 -07:00 · 2026-04-16 19:40:19 -07:00 · cb6d6d46ab
commit cb6d6d46ab
parent 2a3dbcaf46
2 changed files with 191 additions and 1 deletions
--- a/plugins/memory/holographic/retrieval.py
+++ b/plugins/memory/holographic/retrieval.py
@ -496,7 +496,11 @@ class FactRetriever:
        # We need to join facts_fts with facts to get all columns
        params: list = []
        where_clauses = ["facts_fts MATCH ?"]
-        params.append(query)
+        # FTS5 defaults to AND-between-tokens, which kills recall on
+        # natural-language queries ("what happened with the deployment
+        # rollback"). Sanitize: drop stopwords, OR-join content tokens, so
+        # any significant term can match.
+        params.append(self._sanitize_fts_query(query))

        if category:
            where_clauses.append("f.category = ?")
@ -557,6 +561,63 @@ class FactRetriever:
                tokens.add(cleaned)
        return tokens

+    # Stopwords dropped before FTS5 OR-expansion. Short English function
+    # words that carry no retrieval signal and force false-negative AND
+    # matches when left in the query.
+    _FTS_STOPWORDS = frozenset({
+        "a", "about", "above", "after", "again", "all", "am", "an", "and",
+        "any", "are", "as", "at", "be", "because", "been", "before", "being",
+        "between", "both", "but", "by", "can", "could", "did", "do", "does",
+        "doing", "don", "down", "during", "each", "few", "for", "from",
+        "further", "had", "has", "have", "having", "he", "her", "here",
+        "hers", "herself", "him", "himself", "his", "how", "i", "if", "in",
+        "into", "is", "it", "its", "itself", "just", "me", "more", "most",
+        "my", "myself", "no", "nor", "not", "now", "of", "off", "on", "once",
+        "only", "or", "other", "our", "ours", "ourselves", "out", "over",
+        "own", "same", "she", "should", "so", "some", "such", "than", "that",
+        "the", "their", "theirs", "them", "themselves", "then", "there",
+        "these", "they", "this", "those", "through", "to", "too", "under",
+        "until", "up", "very", "was", "we", "were", "what", "when", "where",
+        "which", "while", "who", "whom", "why", "will", "with", "would",
+        "you", "your", "yours", "yourself", "yourselves",
+    })
+
+    @classmethod
+    def _sanitize_fts_query(cls, query: str) -> str:
+        """Convert a natural-language query to an FTS5-safe OR expression.
+
+        FTS5 treats a multi-word MATCH argument as AND-joined by default,
+        which tanks recall on prose queries. This helper:
+          - tokenizes the query
+          - drops stopwords and short (<2 char) tokens
+          - strips FTS5 special characters from each token
+          - OR-joins the survivors
+
+        If nothing remains (pathological query), falls back to the raw
+        query so the caller sees zero results instead of a SQL error.
+        """
+        if not query:
+            return ""
+        # Strip FTS5 operator characters from EACH token to avoid
+        # accidentally creating a malformed query.
+        _FTS_SPECIAL = '"()*^:-+'
+        tokens: list[str] = []
+        for raw in query.lower().split():
+            cleaned = raw.strip(".,;:!?\"'()[]{}#@<>") .translate(
+                str.maketrans("", "", _FTS_SPECIAL)
+            )
+            if len(cleaned) < 2:
+                continue
+            if cleaned in cls._FTS_STOPWORDS:
+                continue
+            # FTS5 phrase-literal each token to ensure no special chars
+            # sneak through as operators.
+            tokens.append(f'"{cleaned}"')
+        if not tokens:
+            # Fallback: raw query (likely returns 0, but never crashes)
+            return query
+        return " OR ".join(tokens)
+
    @staticmethod
    def _jaccard_similarity(set_a: set, set_b: set) -> float:
        """Jaccard similarity coefficient: |A ∩ B| / |A ∪ B|."""
--- a/tests/plugins/memory/test_holographic_retrieval.py
+++ b/tests/plugins/memory/test_holographic_retrieval.py
@ -0,0 +1,129 @@
+"""Tests for FactRetriever FTS5 query sanitization.
+
+These tests cover the fix where raw natural-language queries passed to
+FTS5 MATCH were AND-joined by default, dropping recall to zero on any
+multi-word prose query. The sanitizer drops stopwords and OR-joins the
+remaining content tokens as phrase literals.
+"""
+from __future__ import annotations
+
+import pytest
+
+pytest.importorskip("numpy")  # retrieval module imports numpy indirectly
+
+from plugins.memory.holographic.retrieval import FactRetriever
+from plugins.memory.holographic.store import MemoryStore
+
+
+# ---------------------------------------------------------------------------
+# _sanitize_fts_query — unit tests (no DB required)
+# ---------------------------------------------------------------------------
+
+@pytest.mark.parametrize(
+    "query,expected_tokens",
+    [
+        # stopwords dropped
+        ("what happened with the deployment rollback", {"happened", "deployment", "rollback"}),
+        # single content word passes through
+        ("compaction", {"compaction"}),
+        # all stopwords → falls back to raw
+        ("the and of", None),  # None = sentinel for fallback-to-raw
+        # empty string → empty output
+        ("", ""),
+        # FTS5 operator characters stripped
+        ("context: length-probe", {"context", "lengthprobe"}),
+        # trailing punctuation stripped by tokenizer
+        ("hello, world!", {"hello", "world"}),
+    ],
+)
+def test_sanitize_fts_query_extracts_content_tokens(query, expected_tokens):
+    result = FactRetriever._sanitize_fts_query(query)
+
+    if expected_tokens == "":
+        assert result == ""
+        return
+
+    if expected_tokens is None:
+        # Pathological case: all stopwords — should fall back to raw query
+        assert result == query
+        return
+
+    # OR-joined phrase literals: `"tok1" OR "tok2" OR ...`
+    # Extract the tokens between quotes, order-independent.
+    import re
+    matches = re.findall(r'"([^"]+)"', result)
+    assert set(matches) == expected_tokens, f"got {result!r}"
+
+
+def test_sanitize_fts_query_never_crashes_on_fts5_specials():
+    """Queries with FTS5 operator characters must not produce malformed SQL."""
+    problematic = [
+        'test " query',
+        "test * query",
+        "test (a OR b) query",
+        "test^2 query",
+        "test:colon query",
+        "test-hyphen query",
+        "a" * 1000,  # long query
+    ]
+    for q in problematic:
+        result = FactRetriever._sanitize_fts_query(q)
+        # We just need it to return a string without raising
+        assert isinstance(result, str)
+
+
+# ---------------------------------------------------------------------------
+# Integration test — actually run _fts_candidates against an in-memory DB
+# ---------------------------------------------------------------------------
+
+@pytest.fixture
+def retriever_with_facts(tmp_path):
+    """MemoryStore seeded with a few facts for retrieval tests."""
+    db_path = tmp_path / "test_facts.db"
+    store = MemoryStore(str(db_path))
+    store.add_fact(
+        content="The Thursday deployment rollback failed because of stale migration state.",
+        category="project",
+    )
+    store.add_fact(
+        content="Compaction settings tuned to 0.85 threshold.",
+        category="tool",
+    )
+    store.add_fact(
+        content="Venice.ai advertises availableContextTokens inside model_spec.",
+        category="tool",
+    )
+    retriever = FactRetriever(store=store)
+    yield retriever
+    store.close()
+
+
+def test_prefetch_recovers_prose_query(retriever_with_facts):
+    """A natural-language query should now match the relevant fact.
+
+    Before the sanitizer fix, 'what happened with the deployment rollback'
+    returned zero hits because FTS5 required every token to co-occur.
+    """
+    results = retriever_with_facts.search(
+        "what happened with the deployment rollback"
+    )
+    assert len(results) >= 1
+    # The top hit should be the deployment rollback fact
+    assert "deployment rollback" in results[0]["content"].lower()
+
+
+def test_prefetch_single_keyword_still_works(retriever_with_facts):
+    """Single-term queries (pre-fix working case) remain working."""
+    results = retriever_with_facts.search("compaction")
+    assert len(results) >= 1
+    assert "Compaction" in results[0]["content"] or "compaction" in results[0]["content"].lower()
+
+
+def test_prefetch_stopword_only_query_empty(retriever_with_facts):
+    """Pure stopword queries return zero results but don't crash."""
+    # Pass to _sanitize_fts_query directly first so we know what happens
+    assert FactRetriever._sanitize_fts_query("the and of") == "the and of"
+    # search() handles the likely-zero-hit case gracefully
+    results = retriever_with_facts.search("the and of")
+    # Either zero results or it errored-gracefully to [] — both are fine
+    assert isinstance(results, list)