benchmarks: add --llm-backend ollama for non-Anthropic rerank

The rerank pipeline was hardcoded to Anthropic's /v1/messages. Add a backend flag so the same code path can be exercised with any OpenAI-compatible endpoint — local Ollama, Ollama Cloud, or any gateway that speaks /v1/chat/completions.

This enables independent verification of the "100% with Haiku rerank" claim by running the full benchmark with a different LLM family (e.g. minimax-m2.7:cloud) and zero Anthropic dependency.

Changes, applied to both longmemeval_bench.py and locomo_bench.py:
- llm_rerank*() gain backend= / base_url= kwargs
- CLI: --llm-backend {anthropic,ollama}, --llm-base-url
- API key required only when backend=anthropic (diary/palace modes still require it)
- Parse the last integer in the response (reasoning models emit multi-int output)
- Fall back to message.reasoning when content is empty
- Raise max_tokens to 1024 for reasoning models
2026-04-14 21:20:14 -03:00
parent 4aa7e1eebd
commit 8df7b9bf2c
3 changed files with 169 additions and 66 deletions
@@ -510,11 +510,20 @@ def palace_assign_rooms(sessions, sample_id, api_key, cache, model="claude-haiku


 def llm_rerank_locomo(
-    question, retrieved_ids, retrieved_docs, api_key, top_k=10, model="claude-sonnet-4-6"
+    question,
+    retrieved_ids,
+    retrieved_docs,
+    api_key,
+    top_k=10,
+    model="claude-sonnet-4-6",
+    backend="anthropic",
+    base_url="",
 ):
    """
    Ask LLM to pick the single most relevant document for this question.
    Returns reordered retrieved_ids with the best candidate first.
+
+    Supports backend="anthropic" (default) or "ollama" (OpenAI-compat endpoint).
    """
    candidates = retrieved_ids[:top_k]
    candidate_docs = retrieved_docs[:top_k]
@@ -522,7 +531,6 @@ def llm_rerank_locomo(
    if len(candidates) <= 1:
        return retrieved_ids

-    # Build numbered list of candidates
    lines = []
    for i, (cid, doc) in enumerate(zip(candidates, candidate_docs), 1):
        snippet = doc[:300].replace("\n", " ")
@@ -534,35 +542,51 @@ def llm_rerank_locomo(
        f"Reply with just the number (1-{len(candidates)}).\n\n" + "\n".join(lines)
    )

-    payload = json.dumps(
-        {
-            "model": model,
-            "max_tokens": 8,
-            "messages": [{"role": "user", "content": prompt}],
-        }
-    ).encode("utf-8")
-
-    req = urllib.request.Request(
-        "https://api.anthropic.com/v1/messages",
-        data=payload,
-        headers={
+    if backend == "ollama":
+        url = (base_url or "http://localhost:11434").rstrip("/") + "/v1/chat/completions"
+        payload = json.dumps(
+            {
+                "model": model,
+                "messages": [{"role": "user", "content": prompt}],
+                "max_tokens": 1024,
+                "temperature": 0.0,
+            }
+        ).encode("utf-8")
+        headers = {"content-type": "application/json"}
+        if api_key:
+            headers["authorization"] = f"Bearer {api_key}"
+    else:
+        url = "https://api.anthropic.com/v1/messages"
+        payload = json.dumps(
+            {
+                "model": model,
+                "max_tokens": 8,
+                "messages": [{"role": "user", "content": prompt}],
+            }
+        ).encode("utf-8")
+        headers = {
            "x-api-key": api_key,
            "anthropic-version": "2023-06-01",
            "content-type": "application/json",
-        },
-        method="POST",
-    )
+        }
+
+    req = urllib.request.Request(url, data=payload, headers=headers, method="POST")

    import socket as _socket

    for _attempt in range(3):
        try:
-            with urllib.request.urlopen(req, timeout=30) as resp:
+            with urllib.request.urlopen(req, timeout=120 if backend == "ollama" else 30) as resp:
                result = json.loads(resp.read())
-            raw = result["content"][0]["text"].strip()
-            m = re.search(r"\b(\d+)\b", raw)
+            if backend == "ollama":
+                msg = result["choices"][0]["message"]
+                raw = (msg.get("content") or "").strip() or (msg.get("reasoning") or "").strip()
+            else:
+                raw = result["content"][0]["text"].strip()
+            # Take LAST integer — reasoning models often count candidates first
+            m = re.search(r"\b(\d+)\b", raw[::-1])
            if m:
-                pick = int(m.group(1))
+                pick = int(m.group(1)[::-1])
                if 1 <= pick <= len(candidates):
                    chosen_id = candidates[pick - 1]
                    reordered = [chosen_id] + [cid for cid in retrieved_ids if cid != chosen_id]
@@ -608,6 +632,8 @@ def run_benchmark(
    palace_cache_file=None,
    palace_model="claude-haiku-4-5-20251001",
    embed_model="default",
+    llm_backend="anthropic",
+    llm_base_url="",
 ):
    """Run LoCoMo retrieval benchmark."""
    with open(data_file) as f:
@@ -619,8 +645,12 @@ def run_benchmark(
    api_key = ""
    if llm_rerank_enabled or mode == "palace":
        api_key = _load_api_key(llm_key)
-        if not api_key:
-            print(f"ERROR: --mode {mode} requires an API key (--llm-key or ANTHROPIC_API_KEY).")
+        # Ollama backend doesn't require an Anthropic key. Palace mode still does
+        # (it uses Anthropic for room-assignment indexing) — so only relax the
+        # requirement when rerank is the ONLY llm use and backend is ollama.
+        needs_key = mode == "palace" or (llm_rerank_enabled and llm_backend == "anthropic")
+        if needs_key and not api_key:
+            print(f"ERROR: --mode {mode} / --llm-rerank (anthropic) requires an API key.")
            sys.exit(1)

    # Palace mode: load or create room assignment cache
@@ -888,6 +918,8 @@ def run_benchmark(
                        api_key,
                        top_k=rerank_pool,
                        model=llm_model,
+                        backend=llm_backend,
+                        base_url=llm_base_url,
                    )

                # Compute recall
@@ -1013,6 +1045,18 @@ if __name__ == "__main__":
        help="Model for LLM rerank (default: claude-sonnet-4-6)",
    )
    parser.add_argument("--llm-key", default="", help="API key (or set ANTHROPIC_API_KEY env var)")
+    parser.add_argument(
+        "--llm-backend",
+        choices=["anthropic", "ollama"],
+        default="anthropic",
+        help="Which API for --llm-rerank. 'anthropic' (default) or 'ollama' "
+        "(OpenAI-compat /v1/chat/completions — works for local + Ollama Cloud).",
+    )
+    parser.add_argument(
+        "--llm-base-url",
+        default="",
+        help="Override base URL for --llm-backend ollama. Default: http://localhost:11434.",
+    )
    parser.add_argument(
        "--hybrid-weight",
        type=float,
@@ -1049,4 +1093,6 @@ if __name__ == "__main__":
        palace_cache_file=args.palace_cache,
        palace_model=args.palace_model,
        embed_model=args.embed_model,
+        llm_backend=args.llm_backend,
+        llm_base_url=args.llm_base_url,
    )