Merge pull request #895 from MemPalace/bench/v3.3.0-verification

benchmarks: v3.3.0 reproduction results + Ollama rerank backend
2026-04-14 22:06:30 -03:00
parent 29bc868c89 61d02e10fe
commit db4c52e8be
11 changed files with 331421 additions and 65 deletions
@@ -0,0 +1,508 @@
+{
+  "dev": [
+    "cc06de0d",
+    "f9e8c073",
+    "b320f3f8",
+    "a89d7624",
+    "311778f1",
+    "gpt4_59c863d7",
+    "bbf86515",
+    "099778bb",
+    "e831120c",
+    "dcfa8644",
+    "8fb83627",
+    "e66b632c",
+    "gpt4_7fce9456",
+    "55241a1f",
+    "352ab8bd",
+    "f4f1d8a4",
+    "830ce83f",
+    "2311e44b",
+    "09ba9854",
+    "gpt4_a1b77f9c",
+    "07741c45",
+    "gpt4_70e84552",
+    "b46e15ee",
+    "6071bd76",
+    "6f9b354f",
+    "1d4da289",
+    "gpt4_8279ba02",
+    "6456829e_abs",
+    "0db4c65d",
+    "d6062bb9",
+    "60bf93ed_abs",
+    "d3ab962e",
+    "87f22b4a",
+    "e01b8e2f",
+    "gpt4_7ddcf75f",
+    "8ebdbe50",
+    "26bdc477",
+    "29f2956b_abs",
+    "2311e44b_abs",
+    "75f70248",
+    "852ce960",
+    "f0e564bc",
+    "fca70973",
+    "3c1045c8",
+    "18bc8abd",
+    "afdc33df",
+    "54026fce",
+    "b9cfe692",
+    "6456829e",
+    "e6041065"
+  ],
+  "held_out": [
+    "gpt4_15e38248",
+    "gpt4_2ba83207",
+    "2133c1b5_abs",
+    "gpt4_8279ba03",
+    "76d63226",
+    "1192316e",
+    "gpt4_fa19884d",
+    "gpt4_372c3eed_abs",
+    "1a8a66a6",
+    "gpt4_fe651585",
+    "e25c3b8d",
+    "945e3d21",
+    "86b68151",
+    "1c0ddc50",
+    "1e043500",
+    "d682f1a2",
+    "gpt4_b5700ca0",
+    "91b15a6e",
+    "ce6d2d27",
+    "f523d9fe",
+    "7024f17c",
+    "8752c811",
+    "gpt4_f420262d",
+    "d01c6aa8",
+    "4b24c848",
+    "7e974930",
+    "3fdac837",
+    "gpt4_b4a80587",
+    "c18a7dc8",
+    "80ec1f4f_abs",
+    "7527f7e2",
+    "6ade9755",
+    "89941a94",
+    "gpt4_1d80365e",
+    "2133c1b5",
+    "06db6396",
+    "gpt4_88806d6e",
+    "88432d0a",
+    "3ba21379",
+    "0862e8bf",
+    "aae3761f",
+    "5025383b",
+    "gpt4_e061b84f",
+    "73d42213",
+    "4bc144e2",
+    "gpt4_5501fe77",
+    "00ca467f",
+    "dfde3500",
+    "01493427",
+    "b6025781",
+    "a96c20ee_abs",
+    "982b5123_abs",
+    "gpt4_fa19884c",
+    "gpt4_1a1dc16d",
+    "28dc39ac",
+    "gpt4_2d58bcd6",
+    "51c32626",
+    "c4ea545c",
+    "1da05512",
+    "gpt4_385a5000",
+    "577d4d32",
+    "72e3ee87",
+    "f4f1d8a4_abs",
+    "9d25d4e0",
+    "b29f3365",
+    "b759caee",
+    "10e09553",
+    "1d4e3b97",
+    "d52b4f67",
+    "gpt4_e072b769",
+    "58ef2f1c",
+    "6e984301",
+    "41275add",
+    "gpt4_59149c77",
+    "2ebe6c90",
+    "1cea1afa",
+    "gpt4_1e4a8aec",
+    "6c49646a",
+    "8a2466db",
+    "gpt4_65aabe59",
+    "gpt4_93159ced",
+    "51a45a95",
+    "af8d2e46",
+    "561fabcd",
+    "370a8ff4",
+    "gpt4_d84a3211",
+    "gpt4_7a0daae1",
+    "2a1811e2",
+    "gpt4_78cf46a3",
+    "1568498a",
+    "6b7dfb22",
+    "6ae235be",
+    "bc8a6e93_abs",
+    "681a1674",
+    "06878be2",
+    "1a1907b4",
+    "0e4e4c46",
+    "gpt4_85da3956",
+    "gpt4_f420262c",
+    "2bf43736",
+    "bc149d6b",
+    "09d032c9",
+    "5c40ec5b",
+    "eac54adc",
+    "993da5e2",
+    "71a3fd6b",
+    "gpt4_0b2f1d21",
+    "ad7109d1",
+    "4c36ccef",
+    "c8c3f81d",
+    "edced276_abs",
+    "0bc8ad92",
+    "gpt4_468eb064",
+    "2ebe6c92",
+    "cc6d1ec1",
+    "4dfccbf8",
+    "95228167",
+    "ba358f49",
+    "45dc21b6",
+    "db467c8c",
+    "720133ac",
+    "67e0d0f2",
+    "cc5ded98",
+    "726462e0",
+    "4100d0a0",
+    "3a704032",
+    "gpt4_7ca326fa",
+    "ec81a493",
+    "618f13b2",
+    "58470ed2",
+    "gpt4_4fc4f797",
+    "60036106",
+    "157a136e",
+    "6222b6eb",
+    "69fee5aa",
+    "19b5f2b3_abs",
+    "gpt4_d12ceb0e",
+    "51b23612",
+    "2318644b",
+    "3fe836c9",
+    "gpt4_7de946e7",
+    "71017277",
+    "f0853d11",
+    "dc439ea3",
+    "gpt4_2f91af09",
+    "9a707b81",
+    "bc8a6e93",
+    "c14c00dd",
+    "8979f9ec",
+    "cf22b7bf",
+    "gpt4_ec93e27f",
+    "gpt4_468eb063",
+    "41698283",
+    "1de5cff2",
+    "21d02d0d",
+    "c7cf7dfd",
+    "gpt4_ab202e7f",
+    "dccbc061",
+    "078150f1",
+    "e3038f8c",
+    "gpt4_c27434e8_abs",
+    "2698e78f",
+    "031748ae_abs",
+    "gpt4_59149c78",
+    "c8f1aeed",
+    "184da446",
+    "gpt4_b5700ca9",
+    "89527b6b",
+    "0977f2af",
+    "853b0a1d",
+    "a346bb18",
+    "3249768e",
+    "gpt4_2f8be40d",
+    "gpt4_93159ced_abs",
+    "eeda8a6d",
+    "7a8d0b71",
+    "95bcc1c8",
+    "gpt4_2487a7cb",
+    "85fa3a3f",
+    "7e00a6cb",
+    "e3fc4d6e",
+    "59524333",
+    "37f165cf",
+    "0ddfec37",
+    "60bf93ed",
+    "d7c942c3",
+    "80ec1f4f",
+    "ceb54acb",
+    "9aaed6a3",
+    "gpt4_4929293a",
+    "ed4ddc30",
+    "545bd2b5",
+    "2788b940",
+    "ef9cf60a",
+    "gpt4_7f6b06db",
+    "0ea62687",
+    "3d86fd0a",
+    "3e321797",
+    "d24813b1",
+    "38146c39",
+    "efc3f7c2",
+    "7401057b",
+    "5809eb10",
+    "28bcfaac",
+    "1903aded",
+    "gpt4_194be4b3",
+    "gpt4_e414231f",
+    "0ddfec37_abs",
+    "c2ac3c61",
+    "gpt4_4ef30696",
+    "1f2b8d4f",
+    "0f05491a",
+    "8550ddae",
+    "8077ef71",
+    "b86304ba",
+    "e61a7584",
+    "8cf51dda",
+    "gpt4_2f584639",
+    "08e075c7",
+    "5d3d2817",
+    "7405e8b1",
+    "a3045048",
+    "gpt4_731e37d7",
+    "c8090214_abs",
+    "36580ce8",
+    "ba358f49_abs",
+    "gpt4_d6585ce8",
+    "e56a43b9",
+    "2c63a862",
+    "gpt4_5438fa52",
+    "07b6f563",
+    "gpt4_31ff4165",
+    "0bb5a684",
+    "71315a70",
+    "gpt4_cd90e484",
+    "gpt4_8c8961ae",
+    "gpt4_fe651585_abs",
+    "36b9f61e",
+    "gpt4_b0863698",
+    "gpt4_1d4ab0c9",
+    "15745da0_abs",
+    "0862e8bf_abs",
+    "bcbe585f",
+    "a2f3aa27",
+    "gpt4_6dc9b45b",
+    "ccb36322",
+    "f685340e",
+    "9ea5eabc",
+    "gpt4_372c3eed",
+    "37d43f65",
+    "bf659f65",
+    "b0479f84",
+    "gpt4_213fd887",
+    "e4e14d04",
+    "f8c5f88b",
+    "gpt4_18c2b244",
+    "a11281a2",
+    "gpt4_2655b836",
+    "e47becba",
+    "gpt4_74aed68e",
+    "gpt4_af6db32f",
+    "6cb6f249",
+    "77eafa52",
+    "gpt4_93f6379c",
+    "e8a79c70",
+    "7a87bd0c",
+    "gpt4_6ed717ea",
+    "d6233ab6",
+    "c19f7a0b",
+    "gpt4_61e13b3c",
+    "d23cf73b",
+    "gpt4_1e4a8aeb",
+    "ba61f0b9",
+    "118b2229",
+    "488d3006",
+    "c4a1ceb8",
+    "8e91e7d9",
+    "42ec0761",
+    "65240037",
+    "fea54f57",
+    "c8090214",
+    "b01defab",
+    "6aeb4375_abs",
+    "faba32e5",
+    "c5e8278d",
+    "gpt4_e414231e",
+    "eeda8a6d_abs",
+    "gpt4_8e165409",
+    "af082822",
+    "22d2cb42",
+    "92a0aa75",
+    "1c549ce4",
+    "25e5aa4f",
+    "gpt4_68e94288",
+    "4baee567",
+    "18dcd5a5",
+    "dad224aa",
+    "gpt4_f2262a51",
+    "29f2956b",
+    "21436231",
+    "19b5f2b3",
+    "gpt4_1916e0ea",
+    "gpt4_45189cb4",
+    "0a995998",
+    "b6019101",
+    "9bbe84a2",
+    "61f8c8f8",
+    "9a707b82",
+    "8cf4d046",
+    "eac54add",
+    "75832dbd",
+    "gpt4_98f46fc6",
+    "d596882b",
+    "88432d0a_abs",
+    "16c90bf4",
+    "f685340e_abs",
+    "b5ef892d",
+    "gpt4_f49edff3",
+    "gpt4_483dd43c",
+    "bb7c3b45",
+    "gpt4_7abb270c",
+    "gpt4_9a159967",
+    "07741c44",
+    "4d6b87c8",
+    "6aeb4375",
+    "gpt4_d6585ce9",
+    "60472f9c",
+    "caf9ead2",
+    "32260d93",
+    "60159905",
+    "0a34ad58",
+    "a40e080f",
+    "10d9b85a",
+    "a06e4cfe",
+    "4f54b7c9",
+    "6613b389",
+    "70b3e69b",
+    "gpt4_7bc6cf22",
+    "gpt4_0a05b494",
+    "778164c6",
+    "195a1a1b",
+    "8464fc84",
+    "b46e15ed",
+    "603deb26",
+    "eaca4986",
+    "2698e78f_abs",
+    "gpt4_21adecb5",
+    "2e6d26dc",
+    "5831f84d",
+    "08f4fc43",
+    "3f1e9474",
+    "c9f37c46",
+    "gpt4_2f56ae70",
+    "1b9b7252",
+    "35a27287",
+    "gpt4_d31cdae3",
+    "129d1232",
+    "4adc0475",
+    "27016adc",
+    "46a3abf7",
+    "9ee3ecd6",
+    "982b5123",
+    "09ba9854_abs",
+    "0e5e2d1a",
+    "e9327a54",
+    "86f00804",
+    "e982271f",
+    "7161e7e2",
+    "57f827a0",
+    "6a27ffc2",
+    "edced276",
+    "gpt4_d9af6064",
+    "75499fd8",
+    "60d45044",
+    "gpt4_70e84552_abs",
+    "2ce6a0f2",
+    "gpt4_4929293b",
+    "a1cc6108",
+    "gpt4_5dcc0aab",
+    "a3838d2b",
+    "c7dc5443",
+    "505af2f5",
+    "gpt4_68e94287",
+    "15745da0",
+    "0100672e",
+    "a82c026e",
+    "5e1b23de",
+    "71017276",
+    "89941a93",
+    "6b168ec8",
+    "affe2881",
+    "0edc2aef",
+    "gpt4_2312f94c",
+    "a4996e51",
+    "c6853660",
+    "ef66a6e5",
+    "8a137a7f",
+    "a96c20ee",
+    "fca762bc",
+    "ac031881",
+    "d905b33f",
+    "e493bb7c",
+    "a9f6b44c",
+    "dd2973ad",
+    "8aef76bc",
+    "f35224e0",
+    "8b9d4367",
+    "gpt4_c27434e8",
+    "gpt4_a56e767c",
+    "eace081b",
+    "5a4f22c0",
+    "58bf7951",
+    "c4f10528",
+    "50635ada",
+    "06f04340",
+    "0bc8ad93",
+    "e5ba910e_abs",
+    "5a7937c8",
+    "a3332713",
+    "4388e9dd",
+    "8c18457d",
+    "gpt4_2c50253f",
+    "6a1eabeb",
+    "b3c15d39",
+    "gpt4_e061b84g",
+    "3b6f954b",
+    "gpt4_76048e76",
+    "4dfccbf7",
+    "2b8f3739",
+    "d851d5ba",
+    "4fd1909e",
+    "94f70d80",
+    "66f24dbb",
+    "a08a253f",
+    "6e984302",
+    "001be529",
+    "gpt4_a2d1d1f6",
+    "cc539528",
+    "e48988bc",
+    "gpt4_4cd9eba1",
+    "8e9d538c",
+    "a1eacc2a",
+    "6d550036",
+    "gpt4_e05b82a6",
+    "81507db6",
+    "caf03d32",
+    "031748ae",
+    "c960da58",
+    "1faac195",
+    "gpt4_4edbafa2"
+  ],
+  "seed": 42,
+  "dev_size": 50
+}
@@ -510,11 +510,20 @@ def palace_assign_rooms(sessions, sample_id, api_key, cache, model="claude-haiku


 def llm_rerank_locomo(
-    question, retrieved_ids, retrieved_docs, api_key, top_k=10, model="claude-sonnet-4-6"
+    question,
+    retrieved_ids,
+    retrieved_docs,
+    api_key,
+    top_k=10,
+    model="claude-sonnet-4-6",
+    backend="anthropic",
+    base_url="",
 ):
    """
    Ask LLM to pick the single most relevant document for this question.
    Returns reordered retrieved_ids with the best candidate first.
+
+    Supports backend="anthropic" (default) or "ollama" (OpenAI-compat endpoint).
    """
    candidates = retrieved_ids[:top_k]
    candidate_docs = retrieved_docs[:top_k]
@@ -522,7 +531,6 @@ def llm_rerank_locomo(
    if len(candidates) <= 1:
        return retrieved_ids

-    # Build numbered list of candidates
    lines = []
    for i, (cid, doc) in enumerate(zip(candidates, candidate_docs), 1):
        snippet = doc[:300].replace("\n", " ")
@@ -534,35 +542,51 @@ def llm_rerank_locomo(
        f"Reply with just the number (1-{len(candidates)}).\n\n" + "\n".join(lines)
    )

-    payload = json.dumps(
-        {
-            "model": model,
-            "max_tokens": 8,
-            "messages": [{"role": "user", "content": prompt}],
-        }
-    ).encode("utf-8")
-
-    req = urllib.request.Request(
-        "https://api.anthropic.com/v1/messages",
-        data=payload,
-        headers={
+    if backend == "ollama":
+        url = (base_url or "http://localhost:11434").rstrip("/") + "/v1/chat/completions"
+        payload = json.dumps(
+            {
+                "model": model,
+                "messages": [{"role": "user", "content": prompt}],
+                "max_tokens": 1024,
+                "temperature": 0.0,
+            }
+        ).encode("utf-8")
+        headers = {"content-type": "application/json"}
+        if api_key:
+            headers["authorization"] = f"Bearer {api_key}"
+    else:
+        url = "https://api.anthropic.com/v1/messages"
+        payload = json.dumps(
+            {
+                "model": model,
+                "max_tokens": 8,
+                "messages": [{"role": "user", "content": prompt}],
+            }
+        ).encode("utf-8")
+        headers = {
            "x-api-key": api_key,
            "anthropic-version": "2023-06-01",
            "content-type": "application/json",
-        },
-        method="POST",
-    )
+        }
+
+    req = urllib.request.Request(url, data=payload, headers=headers, method="POST")

    import socket as _socket

    for _attempt in range(3):
        try:
-            with urllib.request.urlopen(req, timeout=30) as resp:
+            with urllib.request.urlopen(req, timeout=120 if backend == "ollama" else 30) as resp:
                result = json.loads(resp.read())
-            raw = result["content"][0]["text"].strip()
-            m = re.search(r"\b(\d+)\b", raw)
+            if backend == "ollama":
+                msg = result["choices"][0]["message"]
+                raw = (msg.get("content") or "").strip() or (msg.get("reasoning") or "").strip()
+            else:
+                raw = result["content"][0]["text"].strip()
+            # Take LAST integer — reasoning models often count candidates first
+            m = re.search(r"\b(\d+)\b", raw[::-1])
            if m:
-                pick = int(m.group(1))
+                pick = int(m.group(1)[::-1])
                if 1 <= pick <= len(candidates):
                    chosen_id = candidates[pick - 1]
                    reordered = [chosen_id] + [cid for cid in retrieved_ids if cid != chosen_id]
@@ -608,6 +632,8 @@ def run_benchmark(
    palace_cache_file=None,
    palace_model="claude-haiku-4-5-20251001",
    embed_model="default",
+    llm_backend="anthropic",
+    llm_base_url="",
 ):
    """Run LoCoMo retrieval benchmark."""
    with open(data_file) as f:
@@ -619,8 +645,12 @@ def run_benchmark(
    api_key = ""
    if llm_rerank_enabled or mode == "palace":
        api_key = _load_api_key(llm_key)
-        if not api_key:
-            print(f"ERROR: --mode {mode} requires an API key (--llm-key or ANTHROPIC_API_KEY).")
+        # Ollama backend doesn't require an Anthropic key. Palace mode still does
+        # (it uses Anthropic for room-assignment indexing) — so only relax the
+        # requirement when rerank is the ONLY llm use and backend is ollama.
+        needs_key = mode == "palace" or (llm_rerank_enabled and llm_backend == "anthropic")
+        if needs_key and not api_key:
+            print(f"ERROR: --mode {mode} / --llm-rerank (anthropic) requires an API key.")
            sys.exit(1)

    # Palace mode: load or create room assignment cache
@@ -888,6 +918,8 @@ def run_benchmark(
                        api_key,
                        top_k=rerank_pool,
                        model=llm_model,
+                        backend=llm_backend,
+                        base_url=llm_base_url,
                    )

                # Compute recall
@@ -1013,6 +1045,18 @@ if __name__ == "__main__":
        help="Model for LLM rerank (default: claude-sonnet-4-6)",
    )
    parser.add_argument("--llm-key", default="", help="API key (or set ANTHROPIC_API_KEY env var)")
+    parser.add_argument(
+        "--llm-backend",
+        choices=["anthropic", "ollama"],
+        default="anthropic",
+        help="Which API for --llm-rerank. 'anthropic' (default) or 'ollama' "
+        "(OpenAI-compat /v1/chat/completions — works for local + Ollama Cloud).",
+    )
+    parser.add_argument(
+        "--llm-base-url",
+        default="",
+        help="Override base URL for --llm-backend ollama. Default: http://localhost:11434.",
+    )
    parser.add_argument(
        "--hybrid-weight",
        type=float,
@@ -1049,4 +1093,6 @@ if __name__ == "__main__":
        palace_cache_file=args.palace_cache,
        palace_model=args.palace_model,
        embed_model=args.embed_model,
+        llm_backend=args.llm_backend,
+        llm_base_url=args.llm_base_url,
    )
@@ -2763,7 +2763,15 @@ def build_palace_and_retrieve_diary(


 def llm_rerank(
-    question, rankings, corpus, corpus_ids, api_key, top_k=10, model="claude-haiku-4-5-20251001"
+    question,
+    rankings,
+    corpus,
+    corpus_ids,
+    api_key,
+    top_k=10,
+    model="claude-haiku-4-5-20251001",
+    backend="anthropic",
+    base_url="",
 ):
    """
    Use an LLM to re-rank the top-k retrieved sessions.
@@ -2772,19 +2780,22 @@ def llm_rerank(
    which single session is most relevant to the question. That session
    is promoted to rank 1; the rest stay in their existing order.

-    This closes the gap for "preference" and jargon-dense "assistant"
-    failures where the right session is in top-10 semantically but not
-    top-5 — because the semantic gap (battery life ↔ phone hardware) is
-    too large for embeddings to bridge.
+    Supports two backends:
+      - "anthropic": hits https://api.anthropic.com/v1/messages with x-api-key.
+      - "ollama":    hits {base_url}/v1/chat/completions (OpenAI-compat) —
+                     works for local Ollama (default http://localhost:11434)
+                     and Ollama Cloud (:cloud model tags).

    Args:
-        question:    The benchmark question string
-        rankings:    Current ranked list of corpus indices (from any mode)
-        corpus:      List of document strings
-        corpus_ids:  List of corpus IDs (parallel to corpus)
-        api_key:     Anthropic API key string
-        top_k:       How many top sessions to send to LLM (default: 10)
-        model:       Claude model ID for reranking (default: haiku)
+        question:   The benchmark question string
+        rankings:   Current ranked list of corpus indices (from any mode)
+        corpus:     List of document strings
+        corpus_ids: List of corpus IDs (parallel to corpus)
+        api_key:    Required for backend="anthropic"; optional Bearer token for "ollama"
+        top_k:      How many top sessions to send to LLM (default: 10)
+        model:      Claude model id (anthropic) or Ollama tag, e.g. "minimax-m2.7:cloud"
+        backend:    "anthropic" or "ollama"
+        base_url:   Override base URL (ollama default: http://localhost:11434)

    Returns:
        Reordered rankings list with LLM's best pick promoted to rank 1.
@@ -2796,7 +2807,6 @@ def llm_rerank(
    if not candidates:
        return rankings

-    # Format sessions for the prompt — first 500 chars each, labelled 1..N
    session_blocks = []
    for rank, idx in enumerate(candidates):
        text = corpus[idx][:500].replace("\n", " ").strip()
@@ -2813,49 +2823,68 @@ def llm_rerank(
        f"Most relevant session number:"
    )

-    payload = json.dumps(
-        {
-            "model": model,
-            "max_tokens": 8,
-            "messages": [{"role": "user", "content": prompt}],
-        }
-    ).encode("utf-8")
-
-    req = urllib.request.Request(
-        "https://api.anthropic.com/v1/messages",
-        data=payload,
-        headers={
+    if backend == "ollama":
+        url = (base_url or "http://localhost:11434").rstrip("/") + "/v1/chat/completions"
+        payload = json.dumps(
+            {
+                "model": model,
+                "messages": [{"role": "user", "content": prompt}],
+                "max_tokens": 1024,
+                "temperature": 0.0,
+            }
+        ).encode("utf-8")
+        headers = {"content-type": "application/json"}
+        if api_key:
+            headers["authorization"] = f"Bearer {api_key}"
+    else:
+        url = "https://api.anthropic.com/v1/messages"
+        payload = json.dumps(
+            {
+                "model": model,
+                "max_tokens": 8,
+                "messages": [{"role": "user", "content": prompt}],
+            }
+        ).encode("utf-8")
+        headers = {
            "x-api-key": api_key,
            "anthropic-version": "2023-06-01",
            "content-type": "application/json",
-        },
-        method="POST",
-    )
+        }
+
+    req = urllib.request.Request(url, data=payload, headers=headers, method="POST")

    import socket as _socket

    for _attempt in range(3):
        try:
-            with urllib.request.urlopen(req, timeout=20) as resp:
+            with urllib.request.urlopen(req, timeout=120 if backend == "ollama" else 20) as resp:
                result = json.loads(resp.read())
-            raw = result["content"][0]["text"].strip()
-            # Parse just the first integer from Haiku's response
-            m = re.search(r"\b(\d+)\b", raw)
+            if backend == "ollama":
+                msg = result["choices"][0]["message"]
+                # Reasoning models (e.g. minimax-m2.7) may emit final answer in "content"
+                # or embed it in "reasoning". Try content first, fall back to reasoning.
+                raw = (msg.get("content") or "").strip()
+                if not raw:
+                    raw = (msg.get("reasoning") or "").strip()
+            else:
+                raw = result["content"][0]["text"].strip()
+            m = re.search(
+                r"\b(\d+)\b", raw[::-1]
+            )  # take LAST integer (rerank models often reason first)
            if m:
-                pick = int(m.group(1))
+                pick = int(m.group(1)[::-1])
                if 1 <= pick <= len(candidates):
                    chosen_idx = candidates[pick - 1]
                    reordered = [chosen_idx] + [i for i in rankings if i != chosen_idx]
                    return reordered
-            break  # Got a response, even if unparseable — don't retry
+            break
        except (_socket.timeout, TimeoutError):
            if _attempt < 2:
                import time as _time

-                _time.sleep(3)  # brief pause then retry
-            # else fall through to return rankings
+                _time.sleep(3)
        except (urllib.error.URLError, KeyError, ValueError, IndexError, OSError):
-            break  # Non-timeout error — fall back immediately
+            break

    return rankings

@@ -2919,6 +2948,8 @@ def run_benchmark(
    skip_precompute=False,
    split_file=None,
    split_subset=None,
+    llm_backend="anthropic",
+    llm_base_url="",
 ):
    """Run the full benchmark.

@@ -2947,10 +2978,14 @@ def run_benchmark(
    api_key = ""
    if llm_rerank_enabled or mode == "diary":
        api_key = _load_api_key(llm_key)
-        if not api_key:
+        # Ollama backend doesn't require an Anthropic API key; a local/cloud Ollama
+        # daemon with the requested model pulled is enough. Diary mode is always anthropic.
+        needs_key = (llm_backend == "anthropic") or (mode == "diary")
+        if needs_key and not api_key:
            print(
-                "ERROR: --llm-rerank / --mode diary requires an API key. "
-                "Set ANTHROPIC_API_KEY or use --llm-key."
+                "ERROR: --llm-rerank (anthropic backend) / --mode diary requires an API key. "
+                "Set ANTHROPIC_API_KEY or use --llm-key. For ollama backend, pass "
+                "--llm-backend ollama."
            )
            sys.exit(1)

@@ -3100,7 +3135,15 @@ def run_benchmark(
        if llm_rerank_enabled:
            rerank_pool = 20 if mode in ("hybrid_v3", "hybrid_v4", "palace") else 10
            rankings = llm_rerank(
-                question, rankings, corpus, corpus_ids, api_key, top_k=rerank_pool, model=llm_model
+                question,
+                rankings,
+                corpus,
+                corpus_ids,
+                api_key,
+                top_k=rerank_pool,
+                model=llm_model,
+                backend=llm_backend,
+                base_url=llm_base_url,
            )

        # Evaluate at session level
@@ -3276,7 +3319,21 @@ if __name__ == "__main__":
        default="claude-haiku-4-5-20251001",
        help="Model for LLM re-ranking and diary ingest "
        "(default: claude-haiku-4-5-20251001). "
-        "Use 'claude-sonnet-4-6' for Sonnet comparison.",
+        "Use 'claude-sonnet-4-6' for Sonnet comparison. "
+        "With --llm-backend ollama, use an Ollama model tag like 'minimax-m2.7:cloud'.",
+    )
+    parser.add_argument(
+        "--llm-backend",
+        choices=["anthropic", "ollama"],
+        default="anthropic",
+        help="Which API to hit for --llm-rerank. 'anthropic' (default) uses Anthropic's "
+        "/v1/messages endpoint. 'ollama' uses Ollama's OpenAI-compatible "
+        "/v1/chat/completions endpoint (works with local Ollama and Ollama Cloud).",
+    )
+    parser.add_argument(
+        "--llm-base-url",
+        default="",
+        help="Override base URL for --llm-backend ollama. Defaults to http://localhost:11434.",
    )
    parser.add_argument(
        "--diary-cache",
@@ -3380,4 +3437,6 @@ if __name__ == "__main__":
        args.skip_precompute,
        split_file=args.split_file,
        split_subset=split_subset,
+        llm_backend=args.llm_backend,
+        llm_base_url=args.llm_base_url,
    )