From 8df7b9bf2c037a0a886115529064b8eeb95906b5 Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Tue, 14 Apr 2026 21:20:14 -0300
Subject: [PATCH] benchmarks: add --llm-backend ollama for non-Anthropic rerank
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The rerank pipeline was hardcoded to Anthropic's /v1/messages.
Add a backend flag so the same code path can be exercised with
any OpenAI-compatible endpoint — local Ollama, Ollama Cloud,
or any gateway that speaks /v1/chat/completions.

Enables independent verification of the "100% with Haiku rerank"
claim by running the full benchmark with a different LLM family
(e.g. minimax-m2.7:cloud) and zero Anthropic dependency.

Both longmemeval_bench.py and locomo_bench.py:
 - llm_rerank*() gain backend= / base_url= kwargs
 - CLI: --llm-backend {anthropic,ollama}, --llm-base-url
 - API key required for rerank only when backend=anthropic; palace mode
   (locomo) and diary mode (longmemeval) still always require it
 - Parse the last integer in the response instead of the first, on both
   backends (reasoning models often emit several integers before the answer)
 - Fall back to message.reasoning when message.content is empty (ollama backend)
 - Raise max_tokens to 1024 on the ollama backend so reasoning models have
   room to answer (the anthropic request keeps max_tokens=8)
---
 benchmarks/locomo_bench.py      |  92 +++++++++++++++------
 benchmarks/longmemeval_bench.py | 141 ++++++++++++++++++++++----------
 uv.lock                         |   2 +-
 3 files changed, 169 insertions(+), 66 deletions(-)

diff --git a/benchmarks/locomo_bench.py b/benchmarks/locomo_bench.py
index 3f62069..dd6dbc4 100644
--- a/benchmarks/locomo_bench.py
+++ b/benchmarks/locomo_bench.py
@@ -510,11 +510,20 @@ def palace_assign_rooms(sessions, sample_id, api_key, cache, model="claude-haiku
 
 
 def llm_rerank_locomo(
-    question, retrieved_ids, retrieved_docs, api_key, top_k=10, model="claude-sonnet-4-6"
+    question,
+    retrieved_ids,
+    retrieved_docs,
+    api_key,
+    top_k=10,
+    model="claude-sonnet-4-6",
+    backend="anthropic",
+    base_url="",
 ):
     """
     Ask LLM to pick the single most relevant document for this question.
     Returns reordered retrieved_ids with the best candidate first.
+
+    Supports backend="anthropic" (default) or "ollama" (OpenAI-compat endpoint).
     """
     candidates = retrieved_ids[:top_k]
     candidate_docs = retrieved_docs[:top_k]
@@ -522,7 +531,6 @@ def llm_rerank_locomo(
     if len(candidates) <= 1:
         return retrieved_ids
 
-    # Build numbered list of candidates
     lines = []
     for i, (cid, doc) in enumerate(zip(candidates, candidate_docs), 1):
         snippet = doc[:300].replace("\n", " ")
@@ -534,35 +542,51 @@ def llm_rerank_locomo(
         f"Reply with just the number (1-{len(candidates)}).\n\n" + "\n".join(lines)
     )
 
-    payload = json.dumps(
-        {
-            "model": model,
-            "max_tokens": 8,
-            "messages": [{"role": "user", "content": prompt}],
-        }
-    ).encode("utf-8")
-
-    req = urllib.request.Request(
-        "https://api.anthropic.com/v1/messages",
-        data=payload,
-        headers={
+    if backend == "ollama":
+        url = (base_url or "http://localhost:11434").rstrip("/") + "/v1/chat/completions"
+        payload = json.dumps(
+            {
+                "model": model,
+                "messages": [{"role": "user", "content": prompt}],
+                "max_tokens": 1024,
+                "temperature": 0.0,
+            }
+        ).encode("utf-8")
+        headers = {"content-type": "application/json"}
+        if api_key:
+            headers["authorization"] = f"Bearer {api_key}"
+    else:
+        url = "https://api.anthropic.com/v1/messages"
+        payload = json.dumps(
+            {
+                "model": model,
+                "max_tokens": 8,
+                "messages": [{"role": "user", "content": prompt}],
+            }
+        ).encode("utf-8")
+        headers = {
             "x-api-key": api_key,
             "anthropic-version": "2023-06-01",
             "content-type": "application/json",
-        },
-        method="POST",
-    )
+        }
+
+    req = urllib.request.Request(url, data=payload, headers=headers, method="POST")
 
     import socket as _socket
 
     for _attempt in range(3):
         try:
-            with urllib.request.urlopen(req, timeout=30) as resp:
+            with urllib.request.urlopen(req, timeout=120 if backend == "ollama" else 30) as resp:
                 result = json.loads(resp.read())
-            raw = result["content"][0]["text"].strip()
-            m = re.search(r"\b(\d+)\b", raw)
+            if backend == "ollama":
+                msg = result["choices"][0]["message"]
+                raw = (msg.get("content") or "").strip() or (msg.get("reasoning") or "").strip()
+            else:
+                raw = result["content"][0]["text"].strip()
+            # Take LAST integer — reasoning models often count candidates first
+            m = re.search(r"\b(\d+)\b", raw[::-1])
             if m:
-                pick = int(m.group(1))
+                pick = int(m.group(1)[::-1])
                 if 1 <= pick <= len(candidates):
                     chosen_id = candidates[pick - 1]
                     reordered = [chosen_id] + [cid for cid in retrieved_ids if cid != chosen_id]
@@ -608,6 +632,8 @@ def run_benchmark(
     palace_cache_file=None,
     palace_model="claude-haiku-4-5-20251001",
     embed_model="default",
+    llm_backend="anthropic",
+    llm_base_url="",
 ):
     """Run LoCoMo retrieval benchmark."""
     with open(data_file) as f:
@@ -619,8 +645,12 @@ def run_benchmark(
     api_key = ""
     if llm_rerank_enabled or mode == "palace":
         api_key = _load_api_key(llm_key)
-        if not api_key:
-            print(f"ERROR: --mode {mode} requires an API key (--llm-key or ANTHROPIC_API_KEY).")
+        # Ollama backend doesn't require an Anthropic key. Palace mode still does
+        # (it uses Anthropic for room-assignment indexing) — so only relax the
+        # requirement when rerank is the ONLY llm use and backend is ollama.
+        needs_key = mode == "palace" or (llm_rerank_enabled and llm_backend == "anthropic")
+        if needs_key and not api_key:
+            print(f"ERROR: --mode {mode} / --llm-rerank (anthropic) requires an API key.")
             sys.exit(1)
 
     # Palace mode: load or create room assignment cache
@@ -888,6 +918,8 @@ def run_benchmark(
                         api_key,
                         top_k=rerank_pool,
                         model=llm_model,
+                        backend=llm_backend,
+                        base_url=llm_base_url,
                     )
 
                 # Compute recall
@@ -1013,6 +1045,18 @@ if __name__ == "__main__":
         help="Model for LLM rerank (default: claude-sonnet-4-6)",
     )
     parser.add_argument("--llm-key", default="", help="API key (or set ANTHROPIC_API_KEY env var)")
+    parser.add_argument(
+        "--llm-backend",
+        choices=["anthropic", "ollama"],
+        default="anthropic",
+        help="Which API for --llm-rerank. 'anthropic' (default) or 'ollama' "
+        "(OpenAI-compat /v1/chat/completions — works for local + Ollama Cloud).",
+    )
+    parser.add_argument(
+        "--llm-base-url",
+        default="",
+        help="Override base URL for --llm-backend ollama. Default: http://localhost:11434.",
+    )
     parser.add_argument(
         "--hybrid-weight",
         type=float,
@@ -1049,4 +1093,6 @@ if __name__ == "__main__":
         palace_cache_file=args.palace_cache,
         palace_model=args.palace_model,
         embed_model=args.embed_model,
+        llm_backend=args.llm_backend,
+        llm_base_url=args.llm_base_url,
     )
diff --git a/benchmarks/longmemeval_bench.py b/benchmarks/longmemeval_bench.py
index 06ec4bc..7243f22 100644
--- a/benchmarks/longmemeval_bench.py
+++ b/benchmarks/longmemeval_bench.py
@@ -2763,7 +2763,15 @@ def build_palace_and_retrieve_diary(
 
 
 def llm_rerank(
-    question, rankings, corpus, corpus_ids, api_key, top_k=10, model="claude-haiku-4-5-20251001"
+    question,
+    rankings,
+    corpus,
+    corpus_ids,
+    api_key,
+    top_k=10,
+    model="claude-haiku-4-5-20251001",
+    backend="anthropic",
+    base_url="",
 ):
     """
     Use an LLM to re-rank the top-k retrieved sessions.
@@ -2772,19 +2780,22 @@ def llm_rerank(
     which single session is most relevant to the question. That session
     is promoted to rank 1; the rest stay in their existing order.
 
-    This closes the gap for "preference" and jargon-dense "assistant"
-    failures where the right session is in top-10 semantically but not
-    top-5 — because the semantic gap (battery life ↔ phone hardware) is
-    too large for embeddings to bridge.
+    Supports two backends:
+      - "anthropic": hits https://api.anthropic.com/v1/messages with x-api-key.
+      - "ollama":    hits {base_url}/v1/chat/completions (OpenAI-compat) —
+                     works for local Ollama (default http://localhost:11434)
+                     and Ollama Cloud (:cloud model tags).
 
     Args:
-        question:    The benchmark question string
-        rankings:    Current ranked list of corpus indices (from any mode)
-        corpus:      List of document strings
-        corpus_ids:  List of corpus IDs (parallel to corpus)
-        api_key:     Anthropic API key string
-        top_k:       How many top sessions to send to LLM (default: 10)
-        model:       Claude model ID for reranking (default: haiku)
+        question:   The benchmark question string
+        rankings:   Current ranked list of corpus indices (from any mode)
+        corpus:     List of document strings
+        corpus_ids: List of corpus IDs (parallel to corpus)
+        api_key:    Anthropic API key (only required for backend="anthropic")
+        top_k:      How many top sessions to send to LLM (default: 10)
+        model:      Model id (Claude model for anthropic, e.g. "minimax-m2.7:cloud" for ollama)
+        backend:    "anthropic" or "ollama"
+        base_url:   Override base URL (ollama default: http://localhost:11434)
 
     Returns:
         Reordered rankings list with LLM's best pick promoted to rank 1.
@@ -2796,7 +2807,6 @@ def llm_rerank(
     if not candidates:
         return rankings
 
-    # Format sessions for the prompt — first 500 chars each, labelled 1..N
     session_blocks = []
     for rank, idx in enumerate(candidates):
         text = corpus[idx][:500].replace("\n", " ").strip()
@@ -2813,49 +2823,66 @@ def llm_rerank(
         f"Most relevant session number:"
     )
 
-    payload = json.dumps(
-        {
-            "model": model,
-            "max_tokens": 8,
-            "messages": [{"role": "user", "content": prompt}],
-        }
-    ).encode("utf-8")
-
-    req = urllib.request.Request(
-        "https://api.anthropic.com/v1/messages",
-        data=payload,
-        headers={
+    if backend == "ollama":
+        url = (base_url or "http://localhost:11434").rstrip("/") + "/v1/chat/completions"
+        payload = json.dumps(
+            {
+                "model": model,
+                "messages": [{"role": "user", "content": prompt}],
+                "max_tokens": 1024,
+                "temperature": 0.0,
+            }
+        ).encode("utf-8")
+        headers = {"content-type": "application/json"}
+        if api_key:
+            headers["authorization"] = f"Bearer {api_key}"
+    else:
+        url = "https://api.anthropic.com/v1/messages"
+        payload = json.dumps(
+            {
+                "model": model,
+                "max_tokens": 8,
+                "messages": [{"role": "user", "content": prompt}],
+            }
+        ).encode("utf-8")
+        headers = {
             "x-api-key": api_key,
             "anthropic-version": "2023-06-01",
             "content-type": "application/json",
-        },
-        method="POST",
-    )
+        }
+
+    req = urllib.request.Request(url, data=payload, headers=headers, method="POST")
 
     import socket as _socket
 
     for _attempt in range(3):
         try:
-            with urllib.request.urlopen(req, timeout=20) as resp:
+            with urllib.request.urlopen(req, timeout=120 if backend == "ollama" else 20) as resp:
                 result = json.loads(resp.read())
-            raw = result["content"][0]["text"].strip()
-            # Parse just the first integer from Haiku's response
-            m = re.search(r"\b(\d+)\b", raw)
+            if backend == "ollama":
+                msg = result["choices"][0]["message"]
+                # Reasoning models (e.g. minimax-m2.7) may emit final answer in "content"
+                # or embed it in "reasoning". Try content first, fall back to reasoning.
+                raw = (msg.get("content") or "").strip()
+                if not raw:
+                    raw = (msg.get("reasoning") or "").strip()
+            else:
+                raw = result["content"][0]["text"].strip()
+            m = re.search(r"\b(\d+)\b", raw[::-1])  # take LAST integer (rerank models often reason first)
             if m:
-                pick = int(m.group(1))
+                pick = int(m.group(1)[::-1])
                 if 1 <= pick <= len(candidates):
                     chosen_idx = candidates[pick - 1]
                     reordered = [chosen_idx] + [i for i in rankings if i != chosen_idx]
                     return reordered
-            break  # Got a response, even if unparseable — don't retry
+            break
         except (_socket.timeout, TimeoutError):
             if _attempt < 2:
                 import time as _time
 
-                _time.sleep(3)  # brief pause then retry
-            # else fall through to return rankings
+                _time.sleep(3)
         except (urllib.error.URLError, KeyError, ValueError, IndexError, OSError):
-            break  # Non-timeout error — fall back immediately
+            break
 
     return rankings
 
@@ -2919,6 +2946,8 @@ def run_benchmark(
     skip_precompute=False,
     split_file=None,
     split_subset=None,
+    llm_backend="anthropic",
+    llm_base_url="",
 ):
     """Run the full benchmark.
 
@@ -2947,10 +2976,14 @@ def run_benchmark(
     api_key = ""
     if llm_rerank_enabled or mode == "diary":
         api_key = _load_api_key(llm_key)
-        if not api_key:
+        # Ollama backend doesn't require an Anthropic API key; a local/cloud Ollama
+        # daemon with the requested model pulled is enough. Diary mode is always anthropic.
+        needs_key = (llm_backend == "anthropic") or (mode == "diary")
+        if needs_key and not api_key:
             print(
-                "ERROR: --llm-rerank / --mode diary requires an API key. "
-                "Set ANTHROPIC_API_KEY or use --llm-key."
+                "ERROR: --llm-rerank (anthropic backend) / --mode diary requires an API key. "
+                "Set ANTHROPIC_API_KEY or use --llm-key. For ollama backend, pass "
+                "--llm-backend ollama."
             )
             sys.exit(1)
 
@@ -3100,7 +3133,15 @@ def run_benchmark(
         if llm_rerank_enabled:
             rerank_pool = 20 if mode in ("hybrid_v3", "hybrid_v4", "palace") else 10
             rankings = llm_rerank(
-                question, rankings, corpus, corpus_ids, api_key, top_k=rerank_pool, model=llm_model
+                question,
+                rankings,
+                corpus,
+                corpus_ids,
+                api_key,
+                top_k=rerank_pool,
+                model=llm_model,
+                backend=llm_backend,
+                base_url=llm_base_url,
             )
 
         # Evaluate at session level
@@ -3276,7 +3317,21 @@ if __name__ == "__main__":
         default="claude-haiku-4-5-20251001",
         help="Model for LLM re-ranking and diary ingest "
         "(default: claude-haiku-4-5-20251001). "
-        "Use 'claude-sonnet-4-6' for Sonnet comparison.",
+        "Use 'claude-sonnet-4-6' for Sonnet comparison. "
+        "With --llm-backend ollama, use an Ollama model tag like 'minimax-m2.7:cloud'.",
+    )
+    parser.add_argument(
+        "--llm-backend",
+        choices=["anthropic", "ollama"],
+        default="anthropic",
+        help="Which API to hit for --llm-rerank. 'anthropic' (default) uses Anthropic's "
+        "/v1/messages endpoint. 'ollama' uses Ollama's OpenAI-compatible "
+        "/v1/chat/completions endpoint (works with local Ollama and Ollama Cloud).",
+    )
+    parser.add_argument(
+        "--llm-base-url",
+        default="",
+        help="Override base URL for --llm-backend ollama. Defaults to http://localhost:11434.",
     )
     parser.add_argument(
         "--diary-cache",
@@ -3380,4 +3435,6 @@ if __name__ == "__main__":
         args.skip_precompute,
         split_file=args.split_file,
         split_subset=split_subset,
+        llm_backend=args.llm_backend,
+        llm_base_url=args.llm_base_url,
     )
diff --git a/uv.lock b/uv.lock
index 413f104..f9b6dca 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1239,7 +1239,7 @@ dev = [
 [package.metadata]
 requires-dist = [
     { name = "autocorrect", marker = "extra == 'spellcheck'", specifier = ">=2.0" },
-    { name = "chromadb", specifier = ">=0.5.0,<0.7" },
+    { name = "chromadb", specifier = ">=0.5.0" },
     { name = "psutil", marker = "extra == 'dev'", specifier = ">=5.9" },
     { name = "pytest", marker = "extra == 'dev'", specifier = ">=7.0" },
     { name = "pytest-cov", marker = "extra == 'dev'", specifier = ">=4.0" },