fix(searcher): guard against empty ChromaDB query results (#195) (#865)

Fixes #195. When ChromaDB returns no documents (empty palace, or wing/room filter that excludes everything), it returns the shape: {"documents": [], "metadatas": [], "distances": []} Indexing `results["documents"][0]` blindly raises IndexError instead of the expected 'no results' response. Affected: searcher.search(), searcher.search_memories() (drawer + closet branches plus the total_before_filter aggregate), and Layer3.search() / Layer3.search_raw(). Adds a tiny private helper `searcher._first_or_empty(results, key)` that safely extracts the inner list, returning [] for any of: missing key, empty outer list, [None], or [[]]. layers.py imports the same helper to avoid duplicating the guard. Tests: tests/test_empty_chromadb_results.py covers all observed shapes plus a documentation-style test that pins the original IndexError so future readers understand why the helper exists.
2026-04-15 09:26:38 +02:00
parent 54a386d925
commit 6a73eb2e20
3 changed files with 79 additions and 17 deletions
@@ -30,6 +30,20 @@ class SearchError(Exception):
 _TOKEN_RE = re.compile(r"\w{2,}", re.UNICODE)


+def _first_or_empty(results: dict, key: str) -> list:
+    """Return the first inner list of a ChromaDB query result, or [].
+
+    ChromaDB returns shapes like ``{"documents": [["a", "b"]], ...}`` for a
+    successful query, but ``{"documents": [], ...}`` (empty outer list) when
+    the collection is empty or the filter excludes everything. Indexing
+    ``[0]`` blindly raises IndexError in that case (issue #195).
+    """
+    outer = results.get(key)
+    if not outer:
+        return []
+    return outer[0] or []
+
+
 def _tokenize(text: str) -> list:
    """Lowercase + strip to alphanumeric tokens of length ≥ 2."""
    return _TOKEN_RE.findall(text.lower())
@@ -251,9 +265,9 @@ def search(query: str, palace_path: str, wing: str = None, room: str = None, n_r
        print(f"\n  Search error: {e}")
        raise SearchError(f"Search error: {e}") from e

-    docs = results["documents"][0]
-    metas = results["metadatas"][0]
-    dists = results["distances"][0]
+    docs = _first_or_empty(results, "documents")
+    metas = _first_or_empty(results, "metadatas")
+    dists = _first_or_empty(results, "distances")

    if not docs:
        print(f'\n  No results found for: "{query}"')
@@ -353,9 +367,9 @@ def search_memories(
        closet_results = closets_col.query(**ckwargs)
        for rank, (cdoc, cmeta, cdist) in enumerate(
            zip(
-                closet_results["documents"][0],
-                closet_results["metadatas"][0],
-                closet_results["distances"][0],
+                _first_or_empty(closet_results, "documents"),
+                _first_or_empty(closet_results, "metadatas"),
+                _first_or_empty(closet_results, "distances"),
            )
        ):
            source = cmeta.get("source_file", "")
@@ -372,9 +386,9 @@ def search_memories(

    scored: list = []
    for doc, meta, dist in zip(
-        drawer_results["documents"][0],
-        drawer_results["metadatas"][0],
-        drawer_results["distances"][0],
+        _first_or_empty(drawer_results, "documents"),
+        _first_or_empty(drawer_results, "metadatas"),
+        _first_or_empty(drawer_results, "distances"),
    ):
        # Filter on raw distance before rounding to avoid precision loss.
        if max_distance > 0.0 and dist > max_distance:
@@ -482,6 +496,6 @@ def search_memories(
    return {
        "query": query,
        "filters": {"wing": wing, "room": room},
-        "total_before_filter": len(drawer_results["documents"][0]),
+        "total_before_filter": len(_first_or_empty(drawer_results, "documents")),
        "results": hits,
    }