Merge pull request #895 from MemPalace/bench/v3.3.0-verification

benchmarks: v3.3.0 reproduction results + Ollama rerank backend
2026-04-14 22:06:30 -03:00
parent 29bc868c89 61d02e10fe
commit db4c52e8be
11 changed files with 331421 additions and 65 deletions
@@ -0,0 +1,508 @@
 {
  "dev": [
    "cc06de0d",
    "f9e8c073",
    "b320f3f8",
    "a89d7624",
    "311778f1",
    "gpt4_59c863d7",
    "bbf86515",
    "099778bb",
    "e831120c",
    "dcfa8644",
    "8fb83627",
    "e66b632c",
    "gpt4_7fce9456",
    "55241a1f",
    "352ab8bd",
    "f4f1d8a4",
    "830ce83f",
    "2311e44b",
    "09ba9854",
    "gpt4_a1b77f9c",
    "07741c45",
    "gpt4_70e84552",
    "b46e15ee",
    "6071bd76",
    "6f9b354f",
    "1d4da289",
    "gpt4_8279ba02",
    "6456829e_abs",
    "0db4c65d",
    "d6062bb9",
    "60bf93ed_abs",
    "d3ab962e",
    "87f22b4a",
    "e01b8e2f",
    "gpt4_7ddcf75f",
    "8ebdbe50",
    "26bdc477",
    "29f2956b_abs",
    "2311e44b_abs",
    "75f70248",
    "852ce960",
    "f0e564bc",
    "fca70973",
    "3c1045c8",
    "18bc8abd",
    "afdc33df",
    "54026fce",
    "b9cfe692",
    "6456829e",
    "e6041065"
  ],
  "held_out": [
    "gpt4_15e38248",
    "gpt4_2ba83207",
    "2133c1b5_abs",
    "gpt4_8279ba03",
    "76d63226",
    "1192316e",
    "gpt4_fa19884d",
    "gpt4_372c3eed_abs",
    "1a8a66a6",
    "gpt4_fe651585",
    "e25c3b8d",
    "945e3d21",
    "86b68151",
    "1c0ddc50",
    "1e043500",
    "d682f1a2",
    "gpt4_b5700ca0",
    "91b15a6e",
    "ce6d2d27",
    "f523d9fe",
    "7024f17c",
    "8752c811",
    "gpt4_f420262d",
    "d01c6aa8",
    "4b24c848",
    "7e974930",
    "3fdac837",
    "gpt4_b4a80587",
    "c18a7dc8",
    "80ec1f4f_abs",
    "7527f7e2",
    "6ade9755",
    "89941a94",
    "gpt4_1d80365e",
    "2133c1b5",
    "06db6396",
    "gpt4_88806d6e",
    "88432d0a",
    "3ba21379",
    "0862e8bf",
    "aae3761f",
    "5025383b",
    "gpt4_e061b84f",
    "73d42213",
    "4bc144e2",
    "gpt4_5501fe77",
    "00ca467f",
    "dfde3500",
    "01493427",
    "b6025781",
    "a96c20ee_abs",
    "982b5123_abs",
    "gpt4_fa19884c",
    "gpt4_1a1dc16d",
    "28dc39ac",
    "gpt4_2d58bcd6",
    "51c32626",
    "c4ea545c",
    "1da05512",
    "gpt4_385a5000",
    "577d4d32",
    "72e3ee87",
    "f4f1d8a4_abs",
    "9d25d4e0",
    "b29f3365",
    "b759caee",
    "10e09553",
    "1d4e3b97",
    "d52b4f67",
    "gpt4_e072b769",
    "58ef2f1c",
    "6e984301",
    "41275add",
    "gpt4_59149c77",
    "2ebe6c90",
    "1cea1afa",
    "gpt4_1e4a8aec",
    "6c49646a",
    "8a2466db",
    "gpt4_65aabe59",
    "gpt4_93159ced",
    "51a45a95",
    "af8d2e46",
    "561fabcd",
    "370a8ff4",
    "gpt4_d84a3211",
    "gpt4_7a0daae1",
    "2a1811e2",
    "gpt4_78cf46a3",
    "1568498a",
    "6b7dfb22",
    "6ae235be",
    "bc8a6e93_abs",
    "681a1674",
    "06878be2",
    "1a1907b4",
    "0e4e4c46",
    "gpt4_85da3956",
    "gpt4_f420262c",
    "2bf43736",
    "bc149d6b",
    "09d032c9",
    "5c40ec5b",
    "eac54adc",
    "993da5e2",
    "71a3fd6b",
    "gpt4_0b2f1d21",
    "ad7109d1",
    "4c36ccef",
    "c8c3f81d",
    "edced276_abs",
    "0bc8ad92",
    "gpt4_468eb064",
    "2ebe6c92",
    "cc6d1ec1",
    "4dfccbf8",
    "95228167",
    "ba358f49",
    "45dc21b6",
    "db467c8c",
    "720133ac",
    "67e0d0f2",
    "cc5ded98",
    "726462e0",
    "4100d0a0",
    "3a704032",
    "gpt4_7ca326fa",
    "ec81a493",
    "618f13b2",
    "58470ed2",
    "gpt4_4fc4f797",
    "60036106",
    "157a136e",
    "6222b6eb",
    "69fee5aa",
    "19b5f2b3_abs",
    "gpt4_d12ceb0e",
    "51b23612",
    "2318644b",
    "3fe836c9",
    "gpt4_7de946e7",
    "71017277",
    "f0853d11",
    "dc439ea3",
    "gpt4_2f91af09",
    "9a707b81",
    "bc8a6e93",
    "c14c00dd",
    "8979f9ec",
    "cf22b7bf",
    "gpt4_ec93e27f",
    "gpt4_468eb063",
    "41698283",
    "1de5cff2",
    "21d02d0d",
    "c7cf7dfd",
    "gpt4_ab202e7f",
    "dccbc061",
    "078150f1",
    "e3038f8c",
    "gpt4_c27434e8_abs",
    "2698e78f",
    "031748ae_abs",
    "gpt4_59149c78",
    "c8f1aeed",
    "184da446",
    "gpt4_b5700ca9",
    "89527b6b",
    "0977f2af",
    "853b0a1d",
    "a346bb18",
    "3249768e",
    "gpt4_2f8be40d",
    "gpt4_93159ced_abs",
    "eeda8a6d",
    "7a8d0b71",
    "95bcc1c8",
    "gpt4_2487a7cb",
    "85fa3a3f",
    "7e00a6cb",
    "e3fc4d6e",
    "59524333",
    "37f165cf",
    "0ddfec37",
    "60bf93ed",
    "d7c942c3",
    "80ec1f4f",
    "ceb54acb",
    "9aaed6a3",
    "gpt4_4929293a",
    "ed4ddc30",
    "545bd2b5",
    "2788b940",
    "ef9cf60a",
    "gpt4_7f6b06db",
    "0ea62687",
    "3d86fd0a",
    "3e321797",
    "d24813b1",
    "38146c39",
    "efc3f7c2",
    "7401057b",
    "5809eb10",
    "28bcfaac",
    "1903aded",
    "gpt4_194be4b3",
    "gpt4_e414231f",
    "0ddfec37_abs",
    "c2ac3c61",
    "gpt4_4ef30696",
    "1f2b8d4f",
    "0f05491a",
    "8550ddae",
    "8077ef71",
    "b86304ba",
    "e61a7584",
    "8cf51dda",
    "gpt4_2f584639",
    "08e075c7",
    "5d3d2817",
    "7405e8b1",
    "a3045048",
    "gpt4_731e37d7",
    "c8090214_abs",
    "36580ce8",
    "ba358f49_abs",
    "gpt4_d6585ce8",
    "e56a43b9",
    "2c63a862",
    "gpt4_5438fa52",
    "07b6f563",
    "gpt4_31ff4165",
    "0bb5a684",
    "71315a70",
    "gpt4_cd90e484",
    "gpt4_8c8961ae",
    "gpt4_fe651585_abs",
    "36b9f61e",
    "gpt4_b0863698",
    "gpt4_1d4ab0c9",
    "15745da0_abs",
    "0862e8bf_abs",
    "bcbe585f",
    "a2f3aa27",
    "gpt4_6dc9b45b",
    "ccb36322",
    "f685340e",
    "9ea5eabc",
    "gpt4_372c3eed",
    "37d43f65",
    "bf659f65",
    "b0479f84",
    "gpt4_213fd887",
    "e4e14d04",
    "f8c5f88b",
    "gpt4_18c2b244",
    "a11281a2",
    "gpt4_2655b836",
    "e47becba",
    "gpt4_74aed68e",
    "gpt4_af6db32f",
    "6cb6f249",
    "77eafa52",
    "gpt4_93f6379c",
    "e8a79c70",
    "7a87bd0c",
    "gpt4_6ed717ea",
    "d6233ab6",
    "c19f7a0b",
    "gpt4_61e13b3c",
    "d23cf73b",
    "gpt4_1e4a8aeb",
    "ba61f0b9",
    "118b2229",
    "488d3006",
    "c4a1ceb8",
    "8e91e7d9",
    "42ec0761",
    "65240037",
    "fea54f57",
    "c8090214",
    "b01defab",
    "6aeb4375_abs",
    "faba32e5",
    "c5e8278d",
    "gpt4_e414231e",
    "eeda8a6d_abs",
    "gpt4_8e165409",
    "af082822",
    "22d2cb42",
    "92a0aa75",
    "1c549ce4",
    "25e5aa4f",
    "gpt4_68e94288",
    "4baee567",
    "18dcd5a5",
    "dad224aa",
    "gpt4_f2262a51",
    "29f2956b",
    "21436231",
    "19b5f2b3",
    "gpt4_1916e0ea",
    "gpt4_45189cb4",
    "0a995998",
    "b6019101",
    "9bbe84a2",
    "61f8c8f8",
    "9a707b82",
    "8cf4d046",
    "eac54add",
    "75832dbd",
    "gpt4_98f46fc6",
    "d596882b",
    "88432d0a_abs",
    "16c90bf4",
    "f685340e_abs",
    "b5ef892d",
    "gpt4_f49edff3",
    "gpt4_483dd43c",
    "bb7c3b45",
    "gpt4_7abb270c",
    "gpt4_9a159967",
    "07741c44",
    "4d6b87c8",
    "6aeb4375",
    "gpt4_d6585ce9",
    "60472f9c",
    "caf9ead2",
    "32260d93",
    "60159905",
    "0a34ad58",
    "a40e080f",
    "10d9b85a",
    "a06e4cfe",
    "4f54b7c9",
    "6613b389",
    "70b3e69b",
    "gpt4_7bc6cf22",
    "gpt4_0a05b494",
    "778164c6",
    "195a1a1b",
    "8464fc84",
    "b46e15ed",
    "603deb26",
    "eaca4986",
    "2698e78f_abs",
    "gpt4_21adecb5",
    "2e6d26dc",
    "5831f84d",
    "08f4fc43",
    "3f1e9474",
    "c9f37c46",
    "gpt4_2f56ae70",
    "1b9b7252",
    "35a27287",
    "gpt4_d31cdae3",
    "129d1232",
    "4adc0475",
    "27016adc",
    "46a3abf7",
    "9ee3ecd6",
    "982b5123",
    "09ba9854_abs",
    "0e5e2d1a",
    "e9327a54",
    "86f00804",
    "e982271f",
    "7161e7e2",
    "57f827a0",
    "6a27ffc2",
    "edced276",
    "gpt4_d9af6064",
    "75499fd8",
    "60d45044",
    "gpt4_70e84552_abs",
    "2ce6a0f2",
    "gpt4_4929293b",
    "a1cc6108",
    "gpt4_5dcc0aab",
    "a3838d2b",
    "c7dc5443",
    "505af2f5",
    "gpt4_68e94287",
    "15745da0",
    "0100672e",
    "a82c026e",
    "5e1b23de",
    "71017276",
    "89941a93",
    "6b168ec8",
    "affe2881",
    "0edc2aef",
    "gpt4_2312f94c",
    "a4996e51",
    "c6853660",
    "ef66a6e5",
    "8a137a7f",
    "a96c20ee",
    "fca762bc",
    "ac031881",
    "d905b33f",
    "e493bb7c",
    "a9f6b44c",
    "dd2973ad",
    "8aef76bc",
    "f35224e0",
    "8b9d4367",
    "gpt4_c27434e8",
    "gpt4_a56e767c",
    "eace081b",
    "5a4f22c0",
    "58bf7951",
    "c4f10528",
    "50635ada",
    "06f04340",
    "0bc8ad93",
    "e5ba910e_abs",
    "5a7937c8",
    "a3332713",
    "4388e9dd",
    "8c18457d",
    "gpt4_2c50253f",
    "6a1eabeb",
    "b3c15d39",
    "gpt4_e061b84g",
    "3b6f954b",
    "gpt4_76048e76",
    "4dfccbf7",
    "2b8f3739",
    "d851d5ba",
    "4fd1909e",
    "94f70d80",
    "66f24dbb",
    "a08a253f",
    "6e984302",
    "001be529",
    "gpt4_a2d1d1f6",
    "cc539528",
    "e48988bc",
    "gpt4_4cd9eba1",
    "8e9d538c",
    "a1eacc2a",
    "6d550036",
    "gpt4_e05b82a6",
    "81507db6",
    "caf03d32",
    "031748ae",
    "c960da58",
    "1faac195",
    "gpt4_4edbafa2"
  ],
  "seed": 42,
  "dev_size": 50
 }
@@ -510,11 +510,20 @@ def palace_assign_rooms(sessions, sample_id, api_key, cache, model="claude-haiku
 def llm_rerank_locomo(
-    question, retrieved_ids, retrieved_docs, api_key, top_k=10, model="claude-sonnet-4-6"
+    question,
    retrieved_ids,
    retrieved_docs,
    api_key,
    top_k=10,
    model="claude-sonnet-4-6",
    backend="anthropic",
    base_url="",
 ):
    """
    Ask LLM to pick the single most relevant document for this question.
    Returns reordered retrieved_ids with the best candidate first.
    Supports backend="anthropic" (default) or "ollama" (OpenAI-compat endpoint).
    """
    candidates = retrieved_ids[:top_k]
    candidate_docs = retrieved_docs[:top_k]
@@ -522,7 +531,6 @@ def llm_rerank_locomo(
    if len(candidates) <= 1:
        return retrieved_ids
    # Build numbered list of candidates
    lines = []
    for i, (cid, doc) in enumerate(zip(candidates, candidate_docs), 1):
        snippet = doc[:300].replace("\n", " ")
@@ -534,35 +542,51 @@ def llm_rerank_locomo(
        f"Reply with just the number (1-{len(candidates)}).\n\n" + "\n".join(lines)
    )
-    payload = json.dumps(
+    if backend == "ollama":
-        {
+        url = (base_url or "http://localhost:11434").rstrip("/") + "/v1/chat/completions"
-            "model": model,
+        payload = json.dumps(
-            "max_tokens": 8,
+            {
-            "messages": [{"role": "user", "content": prompt}],
+                "model": model,
-        }
+                "messages": [{"role": "user", "content": prompt}],
-    ).encode("utf-8")
+                "max_tokens": 1024,
-
+                "temperature": 0.0,
-    req = urllib.request.Request(
+            }
-        "https://api.anthropic.com/v1/messages",
+        ).encode("utf-8")
-        data=payload,
+        headers = {"content-type": "application/json"}
-        headers={
+        if api_key:
            headers["authorization"] = f"Bearer {api_key}"
    else:
        url = "https://api.anthropic.com/v1/messages"
        payload = json.dumps(
            {
                "model": model,
                "max_tokens": 8,
                "messages": [{"role": "user", "content": prompt}],
            }
        ).encode("utf-8")
        headers = {
            "x-api-key": api_key,
            "anthropic-version": "2023-06-01",
            "content-type": "application/json",
-        },
+        }
-        method="POST",
+
-    )
+    req = urllib.request.Request(url, data=payload, headers=headers, method="POST")
    import socket as _socket
    for _attempt in range(3):
        try:
-            with urllib.request.urlopen(req, timeout=30) as resp:
+            with urllib.request.urlopen(req, timeout=120 if backend == "ollama" else 30) as resp:
                result = json.loads(resp.read())
-            raw = result["content"][0]["text"].strip()
+            if backend == "ollama":
-            m = re.search(r"\b(\d+)\b", raw)
+                msg = result["choices"][0]["message"]
                raw = (msg.get("content") or "").strip() or (msg.get("reasoning") or "").strip()
            else:
                raw = result["content"][0]["text"].strip()
            # Take LAST integer — reasoning models often count candidates first
            m = re.search(r"\b(\d+)\b", raw[::-1])
            if m:
-                pick = int(m.group(1))
+                pick = int(m.group(1)[::-1])
                if 1 <= pick <= len(candidates):
                    chosen_id = candidates[pick - 1]
                    reordered = [chosen_id] + [cid for cid in retrieved_ids if cid != chosen_id]
@@ -608,6 +632,8 @@ def run_benchmark(
    palace_cache_file=None,
    palace_model="claude-haiku-4-5-20251001",
    embed_model="default",
    llm_backend="anthropic",
    llm_base_url="",
 ):
    """Run LoCoMo retrieval benchmark."""
    with open(data_file) as f:
@@ -619,8 +645,12 @@ def run_benchmark(
    api_key = ""
    if llm_rerank_enabled or mode == "palace":
        api_key = _load_api_key(llm_key)
-        if not api_key:
+        # Ollama backend doesn't require an Anthropic key. Palace mode still does
-            print(f"ERROR: --mode {mode} requires an API key (--llm-key or ANTHROPIC_API_KEY).")
+        # (it uses Anthropic for room-assignment indexing) — so only relax the
        # requirement when rerank is the ONLY llm use and backend is ollama.
        needs_key = mode == "palace" or (llm_rerank_enabled and llm_backend == "anthropic")
        if needs_key and not api_key:
            print(f"ERROR: --mode {mode} / --llm-rerank (anthropic) requires an API key.")
            sys.exit(1)
    # Palace mode: load or create room assignment cache
@@ -888,6 +918,8 @@ def run_benchmark(
                        api_key,
                        top_k=rerank_pool,
                        model=llm_model,
                        backend=llm_backend,
                        base_url=llm_base_url,
                    )
                # Compute recall
@@ -1013,6 +1045,18 @@ if __name__ == "__main__":
        help="Model for LLM rerank (default: claude-sonnet-4-6)",
    )
    parser.add_argument("--llm-key", default="", help="API key (or set ANTHROPIC_API_KEY env var)")
    parser.add_argument(
        "--llm-backend",
        choices=["anthropic", "ollama"],
        default="anthropic",
        help="Which API for --llm-rerank. 'anthropic' (default) or 'ollama' "
        "(OpenAI-compat /v1/chat/completions — works for local + Ollama Cloud).",
    )
    parser.add_argument(
        "--llm-base-url",
        default="",
        help="Override base URL for --llm-backend ollama. Default: http://localhost:11434.",
    )
    parser.add_argument(
        "--hybrid-weight",
        type=float,
@@ -1049,4 +1093,6 @@ if __name__ == "__main__":
        palace_cache_file=args.palace_cache,
        palace_model=args.palace_model,
        embed_model=args.embed_model,
        llm_backend=args.llm_backend,
        llm_base_url=args.llm_base_url,
    )
@@ -2763,7 +2763,15 @@ def build_palace_and_retrieve_diary(
 def llm_rerank(
-    question, rankings, corpus, corpus_ids, api_key, top_k=10, model="claude-haiku-4-5-20251001"
+    question,
    rankings,
    corpus,
    corpus_ids,
    api_key,
    top_k=10,
    model="claude-haiku-4-5-20251001",
    backend="anthropic",
    base_url="",
 ):
    """
    Use an LLM to re-rank the top-k retrieved sessions.
@@ -2772,19 +2780,22 @@ def llm_rerank(
    which single session is most relevant to the question. That session
    is promoted to rank 1; the rest stay in their existing order.
-    This closes the gap for "preference" and jargon-dense "assistant"
+    Supports two backends:
-    failures where the right session is in top-10 semantically but not
+      - "anthropic": hits https://api.anthropic.com/v1/messages with x-api-key.
-    top-5 — because the semantic gap (battery life ↔ phone hardware) is
+      - "ollama":    hits {base_url}/v1/chat/completions (OpenAI-compat) —
-    too large for embeddings to bridge.
+                     works for local Ollama (default http://localhost:11434)
                     and Ollama Cloud (:cloud model tags).
    Args:
-        question:    The benchmark question string
+        question:   The benchmark question string
-        rankings:    Current ranked list of corpus indices (from any mode)
+        rankings:   Current ranked list of corpus indices (from any mode)
-        corpus:      List of document strings
+        corpus:     List of document strings
-        corpus_ids:  List of corpus IDs (parallel to corpus)
+        corpus_ids: List of corpus IDs (parallel to corpus)
-        api_key:     Anthropic API key string
+        api_key:    Anthropic API key (only required for backend="anthropic")
-        top_k:       How many top sessions to send to LLM (default: 10)
+        top_k:      How many top sessions to send to LLM (default: 10)
-        model:       Claude model ID for reranking (default: haiku)
+        model:      Model id (Claude model for anthropic, e.g. "minimax-m2.7:cloud" for ollama)
        backend:    "anthropic" or "ollama"
        base_url:   Override base URL (ollama default: http://localhost:11434)
    Returns:
        Reordered rankings list with LLM's best pick promoted to rank 1.
@@ -2796,7 +2807,6 @@ def llm_rerank(
    if not candidates:
        return rankings
    # Format sessions for the prompt — first 500 chars each, labelled 1..N
    session_blocks = []
    for rank, idx in enumerate(candidates):
        text = corpus[idx][:500].replace("\n", " ").strip()
@@ -2813,49 +2823,68 @@ def llm_rerank(
        f"Most relevant session number:"
    )
-    payload = json.dumps(
+    if backend == "ollama":
-        {
+        url = (base_url or "http://localhost:11434").rstrip("/") + "/v1/chat/completions"
-            "model": model,
+        payload = json.dumps(
-            "max_tokens": 8,
+            {
-            "messages": [{"role": "user", "content": prompt}],
+                "model": model,
-        }
+                "messages": [{"role": "user", "content": prompt}],
-    ).encode("utf-8")
+                "max_tokens": 1024,
-
+                "temperature": 0.0,
-    req = urllib.request.Request(
+            }
-        "https://api.anthropic.com/v1/messages",
+        ).encode("utf-8")
-        data=payload,
+        headers = {"content-type": "application/json"}
-        headers={
+        if api_key:
            headers["authorization"] = f"Bearer {api_key}"
    else:
        url = "https://api.anthropic.com/v1/messages"
        payload = json.dumps(
            {
                "model": model,
                "max_tokens": 8,
                "messages": [{"role": "user", "content": prompt}],
            }
        ).encode("utf-8")
        headers = {
            "x-api-key": api_key,
            "anthropic-version": "2023-06-01",
            "content-type": "application/json",
-        },
+        }
-        method="POST",
+
-    )
+    req = urllib.request.Request(url, data=payload, headers=headers, method="POST")
    import socket as _socket
    for _attempt in range(3):
        try:
-            with urllib.request.urlopen(req, timeout=20) as resp:
+            with urllib.request.urlopen(req, timeout=120 if backend == "ollama" else 20) as resp:
                result = json.loads(resp.read())
-            raw = result["content"][0]["text"].strip()
+            if backend == "ollama":
-            # Parse just the first integer from Haiku's response
+                msg = result["choices"][0]["message"]
-            m = re.search(r"\b(\d+)\b", raw)
+                # Reasoning models (e.g. minimax-m2.7) may emit final answer in "content"
                # or embed it in "reasoning". Try content first, fall back to reasoning.
                raw = (msg.get("content") or "").strip()
                if not raw:
                    raw = (msg.get("reasoning") or "").strip()
            else:
                raw = result["content"][0]["text"].strip()
            m = re.search(
                r"\b(\d+)\b", raw[::-1]
            )  # take LAST integer (rerank models often reason first)
            if m:
-                pick = int(m.group(1))
+                pick = int(m.group(1)[::-1])
                if 1 <= pick <= len(candidates):
                    chosen_idx = candidates[pick - 1]
                    reordered = [chosen_idx] + [i for i in rankings if i != chosen_idx]
                    return reordered
-            break  # Got a response, even if unparseable — don't retry
+            break
        except (_socket.timeout, TimeoutError):
            if _attempt < 2:
                import time as _time
-                _time.sleep(3)  # brief pause then retry
+                _time.sleep(3)
            # else fall through to return rankings
        except (urllib.error.URLError, KeyError, ValueError, IndexError, OSError):
-            break  # Non-timeout error — fall back immediately
+            break
    return rankings
@@ -2919,6 +2948,8 @@ def run_benchmark(
    skip_precompute=False,
    split_file=None,
    split_subset=None,
    llm_backend="anthropic",
    llm_base_url="",
 ):
    """Run the full benchmark.
@@ -2947,10 +2978,14 @@ def run_benchmark(
    api_key = ""
    if llm_rerank_enabled or mode == "diary":
        api_key = _load_api_key(llm_key)
-        if not api_key:
+        # Ollama backend doesn't require an Anthropic API key; a local/cloud Ollama
        # daemon with the requested model pulled is enough. Diary mode is always anthropic.
        needs_key = (llm_backend == "anthropic") or (mode == "diary")
        if needs_key and not api_key:
            print(
-                "ERROR: --llm-rerank / --mode diary requires an API key. "
+                "ERROR: --llm-rerank (anthropic backend) / --mode diary requires an API key. "
-                "Set ANTHROPIC_API_KEY or use --llm-key."
+                "Set ANTHROPIC_API_KEY or use --llm-key. For ollama backend, pass "
                "--llm-backend ollama."
            )
            sys.exit(1)
@@ -3100,7 +3135,15 @@ def run_benchmark(
        if llm_rerank_enabled:
            rerank_pool = 20 if mode in ("hybrid_v3", "hybrid_v4", "palace") else 10
            rankings = llm_rerank(
-                question, rankings, corpus, corpus_ids, api_key, top_k=rerank_pool, model=llm_model
+                question,
                rankings,
                corpus,
                corpus_ids,
                api_key,
                top_k=rerank_pool,
                model=llm_model,
                backend=llm_backend,
                base_url=llm_base_url,
            )
        # Evaluate at session level
@@ -3276,7 +3319,21 @@ if __name__ == "__main__":
        default="claude-haiku-4-5-20251001",
        help="Model for LLM re-ranking and diary ingest "
        "(default: claude-haiku-4-5-20251001). "
-        "Use 'claude-sonnet-4-6' for Sonnet comparison.",
+        "Use 'claude-sonnet-4-6' for Sonnet comparison. "
        "With --llm-backend ollama, use an Ollama model tag like 'minimax-m2.7:cloud'.",
    )
    parser.add_argument(
        "--llm-backend",
        choices=["anthropic", "ollama"],
        default="anthropic",
        help="Which API to hit for --llm-rerank. 'anthropic' (default) uses Anthropic's "
        "/v1/messages endpoint. 'ollama' uses Ollama's OpenAI-compatible "
        "/v1/chat/completions endpoint (works with local Ollama and Ollama Cloud).",
    )
    parser.add_argument(
        "--llm-base-url",
        default="",
        help="Override base URL for --llm-backend ollama. Defaults to http://localhost:11434.",
    )
    parser.add_argument(
        "--diary-cache",
@@ -3380,4 +3437,6 @@ if __name__ == "__main__":
        args.skip_precompute,
        split_file=args.split_file,
        split_subset=split_subset,
        llm_backend=args.llm_backend,
        llm_base_url=args.llm_base_url,
    )