From cb6d6d46ab6b20b173c8215a1f066b53847e9ee5 Mon Sep 17 00:00:00 2001
From: cyb3rwr3n
Date: Thu, 16 Apr 2026 19:40:19 -0700
Subject: [PATCH] fix(memory/holographic): sanitize FTS5 queries for
 natural-language recall
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The FactRetriever's _fts_candidates passed the raw query string directly
to FTS5's MATCH operator. FTS5 defaults to AND-between-tokens, which
means any multi-word prose query like 'what happened with the deployment
rollback' required every single token to co-occur in a fact — dropping
recall to zero on the kind of queries agents actually issue via
prefetch().

Fix: add _sanitize_fts_query() that:
- tokenizes the query and drops English stopwords
- replaces FTS5 operator characters with spaces per token
- OR-joins the remaining content tokens as phrase literals

For pathological inputs (all stopwords, punctuation only), every raw
token is quoted and the phrases are AND-joined, so MATCH always receives
a well-formed expression instead of raising a syntax error.

This is a pure-retrieval-quality fix — the HRR + Jaccard reranking
stages still keep precision high.

Ships with 13 tests covering the sanitizer and retrieval integration.
---
 plugins/memory/holographic/retrieval.py       |  81 ++++++++++-
 .../memory/test_holographic_retrieval.py      | 137 ++++++++++++++++++
 2 files changed, 217 insertions(+), 1 deletion(-)
 create mode 100644 tests/plugins/memory/test_holographic_retrieval.py

diff --git a/plugins/memory/holographic/retrieval.py b/plugins/memory/holographic/retrieval.py
index a673dcef8..6fb6da2b7 100644
--- a/plugins/memory/holographic/retrieval.py
+++ b/plugins/memory/holographic/retrieval.py
@@ -496,7 +496,11 @@ class FactRetriever:
         # We need to join facts_fts with facts to get all columns
         params: list = []
         where_clauses = ["facts_fts MATCH ?"]
-        params.append(query)
+        # FTS5 defaults to AND-between-tokens, which kills recall on
+        # natural-language queries ("what happened with the deployment
+        # rollback"). Sanitize: drop stopwords, OR-join content tokens, so
+        # any significant term can match.
+        params.append(self._sanitize_fts_query(query))
 
         if category:
             where_clauses.append("f.category = ?")
@@ -557,6 +561,81 @@ class FactRetriever:
                 tokens.add(cleaned)
         return tokens
 
+    # Stopwords dropped before FTS5 OR-expansion. Short English function
+    # words that carry no retrieval signal and force false-negative AND
+    # matches when left in the query.
+    _FTS_STOPWORDS = frozenset({
+        "a", "about", "above", "after", "again", "all", "am", "an", "and",
+        "any", "are", "as", "at", "be", "because", "been", "before", "being",
+        "between", "both", "but", "by", "can", "could", "did", "do", "does",
+        "doing", "don", "down", "during", "each", "few", "for", "from",
+        "further", "had", "has", "have", "having", "he", "her", "here",
+        "hers", "herself", "him", "himself", "his", "how", "i", "if", "in",
+        "into", "is", "it", "its", "itself", "just", "me", "more", "most",
+        "my", "myself", "no", "nor", "not", "now", "of", "off", "on", "once",
+        "only", "or", "other", "our", "ours", "ourselves", "out", "over",
+        "own", "same", "she", "should", "so", "some", "such", "than", "that",
+        "the", "their", "theirs", "them", "themselves", "then", "there",
+        "these", "they", "this", "those", "through", "to", "too", "under",
+        "until", "up", "very", "was", "we", "were", "what", "when", "where",
+        "which", "while", "who", "whom", "why", "will", "with", "would",
+        "you", "your", "yours", "yourself", "yourselves",
+    })
+
+    # Punctuation trimmed from both ends of each whitespace-separated token.
+    _FTS_EDGE_PUNCT = ".,;:!?\"'()[]{}#@<>"
+
+    # FTS5 syntax characters are mapped to spaces rather than deleted, so
+    # "length-probe" becomes the phrase "length probe" instead of the
+    # fused term "lengthprobe", which does not occur in the text.
+    _FTS_SPECIAL_TO_SPACE = str.maketrans(dict.fromkeys('"()*^:-+', " "))
+
+    @classmethod
+    def _sanitize_fts_query(cls, query: str) -> str:
+        """Convert a natural-language query to an FTS5-safe MATCH expression.
+
+        FTS5 treats a multi-word MATCH argument as AND-joined by default,
+        which tanks recall on prose queries. This helper:
+        - splits the query on whitespace and lowercases each token
+        - trims edge punctuation and replaces FTS5 syntax characters
+          with spaces
+        - drops stopwords and short (<2 char) tokens
+        - OR-joins the survivors, each as a quoted phrase literal
+
+        If nothing survives (all stopwords, punctuation only), every raw
+        token is quoted instead and the phrases are space-joined, which
+        FTS5 reads as an implicit AND. That keeps the "all tokens must
+        match" meaning of the raw query without letting characters such
+        as '"' or '(' or a bare AND/OR/NOT keyword reach MATCH unquoted.
+
+        Args:
+            query: Free-text query as typed by the caller.
+
+        Returns:
+            An FTS5 MATCH expression, or "" when ``query`` is empty or
+            contains only whitespace.
+        """
+        if not query:
+            return ""
+        raw_tokens = query.split()
+        phrases: list[str] = []
+        for raw in raw_tokens:
+            trimmed = raw.lower().strip(cls._FTS_EDGE_PUNCT)
+            # Collapse the spaces left behind by replaced characters.
+            cleaned = " ".join(
+                trimmed.translate(cls._FTS_SPECIAL_TO_SPACE).split()
+            )
+            if len(cleaned) < 2 or cleaned in cls._FTS_STOPWORDS:
+                continue
+            phrases.append(f'"{cleaned}"')
+        if phrases:
+            return " OR ".join(phrases)
+        # Fallback: quote each raw token, doubling embedded double quotes
+        # (the FTS5 string escape), so the expression is always well formed.
+        return " ".join(
+            '"' + raw.replace('"', '""') + '"' for raw in raw_tokens
+        )
+
     @staticmethod
     def _jaccard_similarity(set_a: set, set_b: set) -> float:
         """Jaccard similarity coefficient: |A ∩ B| / |A ∪ B|."""
diff --git a/tests/plugins/memory/test_holographic_retrieval.py b/tests/plugins/memory/test_holographic_retrieval.py
new file mode 100644
index 000000000..cd533b5d8
--- /dev/null
+++ b/tests/plugins/memory/test_holographic_retrieval.py
@@ -0,0 +1,137 @@
+"""Tests for FactRetriever FTS5 query sanitization.
+
+These tests cover the fix where raw natural-language queries passed to
+FTS5 MATCH were AND-joined by default, dropping recall to zero on any
+multi-word prose query. The sanitizer drops stopwords and OR-joins the
+remaining content tokens as phrase literals.
+"""
+from __future__ import annotations
+
+import re
+
+import pytest
+
+pytest.importorskip("numpy")  # retrieval module imports numpy indirectly
+
+from plugins.memory.holographic.retrieval import FactRetriever
+from plugins.memory.holographic.store import MemoryStore
+
+# Matches output built only from quoted phrases joined by " " or " OR ".
+_QUOTED_ONLY = re.compile(r'"(?:[^"]|"")*"(?: (?:OR )?"(?:[^"]|"")*")*')
+
+
+# ---------------------------------------------------------------------------
+# _sanitize_fts_query — unit tests (no DB required)
+# ---------------------------------------------------------------------------
+
+@pytest.mark.parametrize(
+    "query,expected_tokens",
+    [
+        # stopwords dropped
+        ("what happened with the deployment rollback", {"happened", "deployment", "rollback"}),
+        # single content word passes through
+        ("compaction", {"compaction"}),
+        # FTS5 operator characters become spaces inside the phrase
+        ("context: length-probe", {"context", "length probe"}),
+        # trailing punctuation stripped by tokenizer
+        ("hello, world!", {"hello", "world"}),
+    ],
+)
+def test_sanitize_fts_query_extracts_content_tokens(query, expected_tokens):
+    result = FactRetriever._sanitize_fts_query(query)
+    # OR-joined phrase literals: `"tok1" OR "tok2" OR ...`
+    # Extract the tokens between quotes, order-independent.
+    matches = re.findall(r'"([^"]+)"', result)
+    assert set(matches) == expected_tokens, f"got {result!r}"
+
+
+@pytest.mark.parametrize(
+    "query,expected",
+    [
+        # empty or whitespace-only input -> empty output
+        ("", ""),
+        ("   ", ""),
+        # all stopwords -> every raw token quoted, implicit AND
+        ("the and of", '"the" "and" "of"'),
+        # a bare AND would be an FTS5 syntax error if passed through raw
+        ("the AND", '"the" "AND"'),
+        # embedded double quote is escaped by doubling
+        ('"', '""""'),
+    ],
+)
+def test_sanitize_fts_query_fallback_is_quoted(query, expected):
+    assert FactRetriever._sanitize_fts_query(query) == expected
+
+
+def test_sanitize_fts_query_never_crashes_on_fts5_specials():
+    """Queries with FTS5 operator characters must not produce malformed SQL."""
+    problematic = [
+        'test " query',
+        "test * query",
+        "test (a OR b) query",
+        "test^2 query",
+        "test:colon query",
+        "test-hyphen query",
+        "a" * 1000,  # long query
+    ]
+    for q in problematic:
+        result = FactRetriever._sanitize_fts_query(q)
+        # Nothing may reach MATCH outside a quoted phrase literal.
+        assert _QUOTED_ONLY.fullmatch(result), f"got {result!r}"
+
+
+# ---------------------------------------------------------------------------
+# Integration test — actually run _fts_candidates against an in-memory DB
+# ---------------------------------------------------------------------------
+
+@pytest.fixture
+def retriever_with_facts(tmp_path):
+    """MemoryStore seeded with a few facts for retrieval tests."""
+    db_path = tmp_path / "test_facts.db"
+    store = MemoryStore(str(db_path))
+    store.add_fact(
+        content="The Thursday deployment rollback failed because of stale migration state.",
+        category="project",
+    )
+    store.add_fact(
+        content="Compaction settings tuned to 0.85 threshold.",
+        category="tool",
+    )
+    store.add_fact(
+        content="Venice.ai advertises availableContextTokens inside model_spec.",
+        category="tool",
+    )
+    retriever = FactRetriever(store=store)
+    yield retriever
+    store.close()
+
+
+def test_prefetch_recovers_prose_query(retriever_with_facts):
+    """A natural-language query should now match the relevant fact.
+
+    Before the sanitizer fix, 'what happened with the deployment rollback'
+    returned zero hits because FTS5 required every token to co-occur.
+    """
+    results = retriever_with_facts.search(
+        "what happened with the deployment rollback"
+    )
+    assert len(results) >= 1
+    # The top hit should be the deployment rollback fact
+    assert "deployment rollback" in results[0]["content"].lower()
+
+
+def test_prefetch_single_keyword_still_works(retriever_with_facts):
+    """Single-term queries (pre-fix working case) remain working."""
+    results = retriever_with_facts.search("compaction")
+    assert len(results) >= 1
+    assert "compaction" in results[0]["content"].lower()
+
+
+def test_prefetch_stopword_only_query_empty(retriever_with_facts):
+    """Pure stopword queries return zero results but don't crash."""
+    # Pass to _sanitize_fts_query directly first so we know what happens
+    assert FactRetriever._sanitize_fts_query("the and of") == '"the" "and" "of"'
+    # search() handles the likely-zero-hit case gracefully
+    results = retriever_with_facts.search("the and of")
+    # Either zero results or it errored-gracefully to [] — both are fine
+    assert isinstance(results, list)