From 8df7b9bf2c037a0a886115529064b8eeb95906b5 Mon Sep 17 00:00:00 2001 From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com> Date: Tue, 14 Apr 2026 21:20:14 -0300 Subject: [PATCH] benchmarks: add --llm-backend ollama for non-Anthropic rerank MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The rerank pipeline was hardcoded to Anthropic's /v1/messages. Add a backend flag so the same code path can be exercised with any OpenAI-compatible endpoint — local Ollama, Ollama Cloud, or any gateway that speaks /v1/chat/completions. Enables independent verification of the "100% with Haiku rerank" claim by running the full benchmark with a different LLM family (e.g. minimax-m2.7:cloud) and zero Anthropic dependency. Both longmemeval_bench.py and locomo_bench.py: - llm_rerank*() gain backend= / base_url= kwargs - CLI: --llm-backend {anthropic,ollama}, --llm-base-url - API key required only when backend=anthropic (diary/palace modes still require it) - Parse last integer in response (reasoning models emit multi-int output) - Fall back to message.reasoning when content is empty (ollama backend only) - Raise max_tokens to 1024 on the ollama backend for reasoning models (anthropic stays at 8) --- benchmarks/locomo_bench.py | 92 +++++++++++++++------ benchmarks/longmemeval_bench.py | 141 ++++++++++++++++++++++---------- uv.lock | 2 +- 3 files changed, 169 insertions(+), 66 deletions(-) diff --git a/benchmarks/locomo_bench.py b/benchmarks/locomo_bench.py index 3f62069..dd6dbc4 100644 --- a/benchmarks/locomo_bench.py +++ b/benchmarks/locomo_bench.py @@ -510,11 +510,20 @@ def palace_assign_rooms(sessions, sample_id, api_key, cache, model="claude-haiku def llm_rerank_locomo( - question, retrieved_ids, retrieved_docs, api_key, top_k=10, model="claude-sonnet-4-6" + question, + retrieved_ids, + retrieved_docs, + api_key, + top_k=10, + model="claude-sonnet-4-6", + backend="anthropic", + base_url="", ): """ Ask LLM to pick the single most relevant document for this question. 
Returns reordered retrieved_ids with the best candidate first. + + Supports backend="anthropic" (default) or "ollama" (OpenAI-compat endpoint). """ candidates = retrieved_ids[:top_k] candidate_docs = retrieved_docs[:top_k] @@ -522,7 +531,6 @@ def llm_rerank_locomo( if len(candidates) <= 1: return retrieved_ids - # Build numbered list of candidates lines = [] for i, (cid, doc) in enumerate(zip(candidates, candidate_docs), 1): snippet = doc[:300].replace("\n", " ") @@ -534,35 +542,51 @@ def llm_rerank_locomo( f"Reply with just the number (1-{len(candidates)}).\n\n" + "\n".join(lines) ) - payload = json.dumps( - { - "model": model, - "max_tokens": 8, - "messages": [{"role": "user", "content": prompt}], - } - ).encode("utf-8") - - req = urllib.request.Request( - "https://api.anthropic.com/v1/messages", - data=payload, - headers={ + if backend == "ollama": + url = (base_url or "http://localhost:11434").rstrip("/") + "/v1/chat/completions" + payload = json.dumps( + { + "model": model, + "messages": [{"role": "user", "content": prompt}], + "max_tokens": 1024, + "temperature": 0.0, + } + ).encode("utf-8") + headers = {"content-type": "application/json"} + if api_key: + headers["authorization"] = f"Bearer {api_key}" + else: + url = "https://api.anthropic.com/v1/messages" + payload = json.dumps( + { + "model": model, + "max_tokens": 8, + "messages": [{"role": "user", "content": prompt}], + } + ).encode("utf-8") + headers = { "x-api-key": api_key, "anthropic-version": "2023-06-01", "content-type": "application/json", - }, - method="POST", - ) + } + + req = urllib.request.Request(url, data=payload, headers=headers, method="POST") import socket as _socket for _attempt in range(3): try: - with urllib.request.urlopen(req, timeout=30) as resp: + with urllib.request.urlopen(req, timeout=120 if backend == "ollama" else 30) as resp: result = json.loads(resp.read()) - raw = result["content"][0]["text"].strip() - m = re.search(r"\b(\d+)\b", raw) + if backend == "ollama": + msg = 
result["choices"][0]["message"] + raw = (msg.get("content") or "").strip() or (msg.get("reasoning") or "").strip() + else: + raw = result["content"][0]["text"].strip() + # Take LAST integer — reasoning models often count candidates first + m = re.search(r"\b(\d+)\b", raw[::-1]) if m: - pick = int(m.group(1)) + pick = int(m.group(1)[::-1]) if 1 <= pick <= len(candidates): chosen_id = candidates[pick - 1] reordered = [chosen_id] + [cid for cid in retrieved_ids if cid != chosen_id] @@ -608,6 +632,8 @@ def run_benchmark( palace_cache_file=None, palace_model="claude-haiku-4-5-20251001", embed_model="default", + llm_backend="anthropic", + llm_base_url="", ): """Run LoCoMo retrieval benchmark.""" with open(data_file) as f: @@ -619,8 +645,12 @@ def run_benchmark( api_key = "" if llm_rerank_enabled or mode == "palace": api_key = _load_api_key(llm_key) - if not api_key: - print(f"ERROR: --mode {mode} requires an API key (--llm-key or ANTHROPIC_API_KEY).") + # Ollama backend doesn't require an Anthropic key. Palace mode still does + # (it uses Anthropic for room-assignment indexing) — so only relax the + # requirement when rerank is the ONLY llm use and backend is ollama. + needs_key = mode == "palace" or (llm_rerank_enabled and llm_backend == "anthropic") + if needs_key and not api_key: + print(f"ERROR: --mode {mode} / --llm-rerank (anthropic) requires an API key.") sys.exit(1) # Palace mode: load or create room assignment cache @@ -888,6 +918,8 @@ def run_benchmark( api_key, top_k=rerank_pool, model=llm_model, + backend=llm_backend, + base_url=llm_base_url, ) # Compute recall @@ -1013,6 +1045,18 @@ if __name__ == "__main__": help="Model for LLM rerank (default: claude-sonnet-4-6)", ) parser.add_argument("--llm-key", default="", help="API key (or set ANTHROPIC_API_KEY env var)") + parser.add_argument( + "--llm-backend", + choices=["anthropic", "ollama"], + default="anthropic", + help="Which API for --llm-rerank. 
'anthropic' (default) or 'ollama' " + "(OpenAI-compat /v1/chat/completions — works for local + Ollama Cloud).", + ) + parser.add_argument( + "--llm-base-url", + default="", + help="Override base URL for --llm-backend ollama. Default: http://localhost:11434.", + ) parser.add_argument( "--hybrid-weight", type=float, @@ -1049,4 +1093,6 @@ if __name__ == "__main__": palace_cache_file=args.palace_cache, palace_model=args.palace_model, embed_model=args.embed_model, + llm_backend=args.llm_backend, + llm_base_url=args.llm_base_url, ) diff --git a/benchmarks/longmemeval_bench.py b/benchmarks/longmemeval_bench.py index 06ec4bc..7243f22 100644 --- a/benchmarks/longmemeval_bench.py +++ b/benchmarks/longmemeval_bench.py @@ -2763,7 +2763,15 @@ def build_palace_and_retrieve_diary( def llm_rerank( - question, rankings, corpus, corpus_ids, api_key, top_k=10, model="claude-haiku-4-5-20251001" + question, + rankings, + corpus, + corpus_ids, + api_key, + top_k=10, + model="claude-haiku-4-5-20251001", + backend="anthropic", + base_url="", ): """ Use an LLM to re-rank the top-k retrieved sessions. @@ -2772,19 +2780,22 @@ def llm_rerank( which single session is most relevant to the question. That session is promoted to rank 1; the rest stay in their existing order. - This closes the gap for "preference" and jargon-dense "assistant" - failures where the right session is in top-10 semantically but not - top-5 — because the semantic gap (battery life ↔ phone hardware) is - too large for embeddings to bridge. + Supports two backends: + - "anthropic": hits https://api.anthropic.com/v1/messages with x-api-key. + - "ollama": hits {base_url}/v1/chat/completions (OpenAI-compat) — + works for local Ollama (default http://localhost:11434) + and Ollama Cloud (:cloud model tags). 
Args: - question: The benchmark question string - rankings: Current ranked list of corpus indices (from any mode) - corpus: List of document strings - corpus_ids: List of corpus IDs (parallel to corpus) - api_key: Anthropic API key string - top_k: How many top sessions to send to LLM (default: 10) - model: Claude model ID for reranking (default: haiku) + question: The benchmark question string + rankings: Current ranked list of corpus indices (from any mode) + corpus: List of document strings + corpus_ids: List of corpus IDs (parallel to corpus) + api_key: Anthropic API key (only required for backend="anthropic") + top_k: How many top sessions to send to LLM (default: 10) + model: Model id (Claude model for anthropic, e.g. "minimax-m2.7:cloud" for ollama) + backend: "anthropic" or "ollama" + base_url: Override base URL (ollama default: http://localhost:11434) Returns: Reordered rankings list with LLM's best pick promoted to rank 1. @@ -2796,7 +2807,6 @@ def llm_rerank( if not candidates: return rankings - # Format sessions for the prompt — first 500 chars each, labelled 1..N session_blocks = [] for rank, idx in enumerate(candidates): text = corpus[idx][:500].replace("\n", " ").strip() @@ -2813,49 +2823,66 @@ def llm_rerank( f"Most relevant session number:" ) - payload = json.dumps( - { - "model": model, - "max_tokens": 8, - "messages": [{"role": "user", "content": prompt}], - } - ).encode("utf-8") - - req = urllib.request.Request( - "https://api.anthropic.com/v1/messages", - data=payload, - headers={ + if backend == "ollama": + url = (base_url or "http://localhost:11434").rstrip("/") + "/v1/chat/completions" + payload = json.dumps( + { + "model": model, + "messages": [{"role": "user", "content": prompt}], + "max_tokens": 1024, + "temperature": 0.0, + } + ).encode("utf-8") + headers = {"content-type": "application/json"} + if api_key: + headers["authorization"] = f"Bearer {api_key}" + else: + url = "https://api.anthropic.com/v1/messages" + payload = json.dumps( + { 
+ "model": model, + "max_tokens": 8, + "messages": [{"role": "user", "content": prompt}], + } + ).encode("utf-8") + headers = { "x-api-key": api_key, "anthropic-version": "2023-06-01", "content-type": "application/json", - }, - method="POST", - ) + } + + req = urllib.request.Request(url, data=payload, headers=headers, method="POST") import socket as _socket for _attempt in range(3): try: - with urllib.request.urlopen(req, timeout=20) as resp: + with urllib.request.urlopen(req, timeout=120 if backend == "ollama" else 20) as resp: result = json.loads(resp.read()) - raw = result["content"][0]["text"].strip() - # Parse just the first integer from Haiku's response - m = re.search(r"\b(\d+)\b", raw) + if backend == "ollama": + msg = result["choices"][0]["message"] + # Reasoning models (e.g. minimax-m2.7) may emit final answer in "content" + # or embed it in "reasoning". Try content first, fall back to reasoning. + raw = (msg.get("content") or "").strip() + if not raw: + raw = (msg.get("reasoning") or "").strip() + else: + raw = result["content"][0]["text"].strip() + m = re.search(r"\b(\d+)\b", raw[::-1]) # take LAST integer (rerank models often reason first) if m: - pick = int(m.group(1)) + pick = int(m.group(1)[::-1]) if 1 <= pick <= len(candidates): chosen_idx = candidates[pick - 1] reordered = [chosen_idx] + [i for i in rankings if i != chosen_idx] return reordered - break # Got a response, even if unparseable — don't retry + break except (_socket.timeout, TimeoutError): if _attempt < 2: import time as _time - _time.sleep(3) # brief pause then retry - # else fall through to return rankings + _time.sleep(3) except (urllib.error.URLError, KeyError, ValueError, IndexError, OSError): - break # Non-timeout error — fall back immediately + break return rankings @@ -2919,6 +2946,8 @@ def run_benchmark( skip_precompute=False, split_file=None, split_subset=None, + llm_backend="anthropic", + llm_base_url="", ): """Run the full benchmark. 
@@ -2947,10 +2976,14 @@ def run_benchmark( api_key = "" if llm_rerank_enabled or mode == "diary": api_key = _load_api_key(llm_key) - if not api_key: + # Ollama backend doesn't require an Anthropic API key; a local/cloud Ollama + # daemon with the requested model pulled is enough. Diary mode is always anthropic. + needs_key = (llm_backend == "anthropic") or (mode == "diary") + if needs_key and not api_key: print( - "ERROR: --llm-rerank / --mode diary requires an API key. " - "Set ANTHROPIC_API_KEY or use --llm-key." + "ERROR: --llm-rerank (anthropic backend) / --mode diary requires an API key. " + "Set ANTHROPIC_API_KEY or use --llm-key. For ollama backend, pass " + "--llm-backend ollama." ) sys.exit(1) @@ -3100,7 +3133,15 @@ def run_benchmark( if llm_rerank_enabled: rerank_pool = 20 if mode in ("hybrid_v3", "hybrid_v4", "palace") else 10 rankings = llm_rerank( - question, rankings, corpus, corpus_ids, api_key, top_k=rerank_pool, model=llm_model + question, + rankings, + corpus, + corpus_ids, + api_key, + top_k=rerank_pool, + model=llm_model, + backend=llm_backend, + base_url=llm_base_url, ) # Evaluate at session level @@ -3276,7 +3317,21 @@ if __name__ == "__main__": default="claude-haiku-4-5-20251001", help="Model for LLM re-ranking and diary ingest " "(default: claude-haiku-4-5-20251001). " - "Use 'claude-sonnet-4-6' for Sonnet comparison.", + "Use 'claude-sonnet-4-6' for Sonnet comparison. " + "With --llm-backend ollama, use an Ollama model tag like 'minimax-m2.7:cloud'.", + ) + parser.add_argument( + "--llm-backend", + choices=["anthropic", "ollama"], + default="anthropic", + help="Which API to hit for --llm-rerank. 'anthropic' (default) uses Anthropic's " + "/v1/messages endpoint. 'ollama' uses Ollama's OpenAI-compatible " + "/v1/chat/completions endpoint (works with local Ollama and Ollama Cloud).", + ) + parser.add_argument( + "--llm-base-url", + default="", + help="Override base URL for --llm-backend ollama. 
Defaults to http://localhost:11434.", ) parser.add_argument( "--diary-cache", @@ -3380,4 +3435,6 @@ if __name__ == "__main__": args.skip_precompute, split_file=args.split_file, split_subset=split_subset, + llm_backend=args.llm_backend, + llm_base_url=args.llm_base_url, ) diff --git a/uv.lock b/uv.lock index 413f104..f9b6dca 100644 --- a/uv.lock +++ b/uv.lock @@ -1239,7 +1239,7 @@ dev = [ [package.metadata] requires-dist = [ { name = "autocorrect", marker = "extra == 'spellcheck'", specifier = ">=2.0" }, - { name = "chromadb", specifier = ">=0.5.0,<0.7" }, + { name = "chromadb", specifier = ">=0.5.0" }, { name = "psutil", marker = "extra == 'dev'", specifier = ">=5.9" }, { name = "pytest", marker = "extra == 'dev'", specifier = ">=7.0" }, { name = "pytest-cov", marker = "extra == 'dev'", specifier = ">=4.0" },