Merge pull request #895 from MemPalace/bench/v3.3.0-verification

benchmarks: v3.3.0 reproduction results + Ollama rerank backend
This commit is contained in:
Igor Lins e Silva
2026-04-14 22:06:30 -03:00
committed by GitHub
11 changed files with 331421 additions and 65 deletions
+508
View File
@@ -0,0 +1,508 @@
{
"dev": [
"cc06de0d",
"f9e8c073",
"b320f3f8",
"a89d7624",
"311778f1",
"gpt4_59c863d7",
"bbf86515",
"099778bb",
"e831120c",
"dcfa8644",
"8fb83627",
"e66b632c",
"gpt4_7fce9456",
"55241a1f",
"352ab8bd",
"f4f1d8a4",
"830ce83f",
"2311e44b",
"09ba9854",
"gpt4_a1b77f9c",
"07741c45",
"gpt4_70e84552",
"b46e15ee",
"6071bd76",
"6f9b354f",
"1d4da289",
"gpt4_8279ba02",
"6456829e_abs",
"0db4c65d",
"d6062bb9",
"60bf93ed_abs",
"d3ab962e",
"87f22b4a",
"e01b8e2f",
"gpt4_7ddcf75f",
"8ebdbe50",
"26bdc477",
"29f2956b_abs",
"2311e44b_abs",
"75f70248",
"852ce960",
"f0e564bc",
"fca70973",
"3c1045c8",
"18bc8abd",
"afdc33df",
"54026fce",
"b9cfe692",
"6456829e",
"e6041065"
],
"held_out": [
"gpt4_15e38248",
"gpt4_2ba83207",
"2133c1b5_abs",
"gpt4_8279ba03",
"76d63226",
"1192316e",
"gpt4_fa19884d",
"gpt4_372c3eed_abs",
"1a8a66a6",
"gpt4_fe651585",
"e25c3b8d",
"945e3d21",
"86b68151",
"1c0ddc50",
"1e043500",
"d682f1a2",
"gpt4_b5700ca0",
"91b15a6e",
"ce6d2d27",
"f523d9fe",
"7024f17c",
"8752c811",
"gpt4_f420262d",
"d01c6aa8",
"4b24c848",
"7e974930",
"3fdac837",
"gpt4_b4a80587",
"c18a7dc8",
"80ec1f4f_abs",
"7527f7e2",
"6ade9755",
"89941a94",
"gpt4_1d80365e",
"2133c1b5",
"06db6396",
"gpt4_88806d6e",
"88432d0a",
"3ba21379",
"0862e8bf",
"aae3761f",
"5025383b",
"gpt4_e061b84f",
"73d42213",
"4bc144e2",
"gpt4_5501fe77",
"00ca467f",
"dfde3500",
"01493427",
"b6025781",
"a96c20ee_abs",
"982b5123_abs",
"gpt4_fa19884c",
"gpt4_1a1dc16d",
"28dc39ac",
"gpt4_2d58bcd6",
"51c32626",
"c4ea545c",
"1da05512",
"gpt4_385a5000",
"577d4d32",
"72e3ee87",
"f4f1d8a4_abs",
"9d25d4e0",
"b29f3365",
"b759caee",
"10e09553",
"1d4e3b97",
"d52b4f67",
"gpt4_e072b769",
"58ef2f1c",
"6e984301",
"41275add",
"gpt4_59149c77",
"2ebe6c90",
"1cea1afa",
"gpt4_1e4a8aec",
"6c49646a",
"8a2466db",
"gpt4_65aabe59",
"gpt4_93159ced",
"51a45a95",
"af8d2e46",
"561fabcd",
"370a8ff4",
"gpt4_d84a3211",
"gpt4_7a0daae1",
"2a1811e2",
"gpt4_78cf46a3",
"1568498a",
"6b7dfb22",
"6ae235be",
"bc8a6e93_abs",
"681a1674",
"06878be2",
"1a1907b4",
"0e4e4c46",
"gpt4_85da3956",
"gpt4_f420262c",
"2bf43736",
"bc149d6b",
"09d032c9",
"5c40ec5b",
"eac54adc",
"993da5e2",
"71a3fd6b",
"gpt4_0b2f1d21",
"ad7109d1",
"4c36ccef",
"c8c3f81d",
"edced276_abs",
"0bc8ad92",
"gpt4_468eb064",
"2ebe6c92",
"cc6d1ec1",
"4dfccbf8",
"95228167",
"ba358f49",
"45dc21b6",
"db467c8c",
"720133ac",
"67e0d0f2",
"cc5ded98",
"726462e0",
"4100d0a0",
"3a704032",
"gpt4_7ca326fa",
"ec81a493",
"618f13b2",
"58470ed2",
"gpt4_4fc4f797",
"60036106",
"157a136e",
"6222b6eb",
"69fee5aa",
"19b5f2b3_abs",
"gpt4_d12ceb0e",
"51b23612",
"2318644b",
"3fe836c9",
"gpt4_7de946e7",
"71017277",
"f0853d11",
"dc439ea3",
"gpt4_2f91af09",
"9a707b81",
"bc8a6e93",
"c14c00dd",
"8979f9ec",
"cf22b7bf",
"gpt4_ec93e27f",
"gpt4_468eb063",
"41698283",
"1de5cff2",
"21d02d0d",
"c7cf7dfd",
"gpt4_ab202e7f",
"dccbc061",
"078150f1",
"e3038f8c",
"gpt4_c27434e8_abs",
"2698e78f",
"031748ae_abs",
"gpt4_59149c78",
"c8f1aeed",
"184da446",
"gpt4_b5700ca9",
"89527b6b",
"0977f2af",
"853b0a1d",
"a346bb18",
"3249768e",
"gpt4_2f8be40d",
"gpt4_93159ced_abs",
"eeda8a6d",
"7a8d0b71",
"95bcc1c8",
"gpt4_2487a7cb",
"85fa3a3f",
"7e00a6cb",
"e3fc4d6e",
"59524333",
"37f165cf",
"0ddfec37",
"60bf93ed",
"d7c942c3",
"80ec1f4f",
"ceb54acb",
"9aaed6a3",
"gpt4_4929293a",
"ed4ddc30",
"545bd2b5",
"2788b940",
"ef9cf60a",
"gpt4_7f6b06db",
"0ea62687",
"3d86fd0a",
"3e321797",
"d24813b1",
"38146c39",
"efc3f7c2",
"7401057b",
"5809eb10",
"28bcfaac",
"1903aded",
"gpt4_194be4b3",
"gpt4_e414231f",
"0ddfec37_abs",
"c2ac3c61",
"gpt4_4ef30696",
"1f2b8d4f",
"0f05491a",
"8550ddae",
"8077ef71",
"b86304ba",
"e61a7584",
"8cf51dda",
"gpt4_2f584639",
"08e075c7",
"5d3d2817",
"7405e8b1",
"a3045048",
"gpt4_731e37d7",
"c8090214_abs",
"36580ce8",
"ba358f49_abs",
"gpt4_d6585ce8",
"e56a43b9",
"2c63a862",
"gpt4_5438fa52",
"07b6f563",
"gpt4_31ff4165",
"0bb5a684",
"71315a70",
"gpt4_cd90e484",
"gpt4_8c8961ae",
"gpt4_fe651585_abs",
"36b9f61e",
"gpt4_b0863698",
"gpt4_1d4ab0c9",
"15745da0_abs",
"0862e8bf_abs",
"bcbe585f",
"a2f3aa27",
"gpt4_6dc9b45b",
"ccb36322",
"f685340e",
"9ea5eabc",
"gpt4_372c3eed",
"37d43f65",
"bf659f65",
"b0479f84",
"gpt4_213fd887",
"e4e14d04",
"f8c5f88b",
"gpt4_18c2b244",
"a11281a2",
"gpt4_2655b836",
"e47becba",
"gpt4_74aed68e",
"gpt4_af6db32f",
"6cb6f249",
"77eafa52",
"gpt4_93f6379c",
"e8a79c70",
"7a87bd0c",
"gpt4_6ed717ea",
"d6233ab6",
"c19f7a0b",
"gpt4_61e13b3c",
"d23cf73b",
"gpt4_1e4a8aeb",
"ba61f0b9",
"118b2229",
"488d3006",
"c4a1ceb8",
"8e91e7d9",
"42ec0761",
"65240037",
"fea54f57",
"c8090214",
"b01defab",
"6aeb4375_abs",
"faba32e5",
"c5e8278d",
"gpt4_e414231e",
"eeda8a6d_abs",
"gpt4_8e165409",
"af082822",
"22d2cb42",
"92a0aa75",
"1c549ce4",
"25e5aa4f",
"gpt4_68e94288",
"4baee567",
"18dcd5a5",
"dad224aa",
"gpt4_f2262a51",
"29f2956b",
"21436231",
"19b5f2b3",
"gpt4_1916e0ea",
"gpt4_45189cb4",
"0a995998",
"b6019101",
"9bbe84a2",
"61f8c8f8",
"9a707b82",
"8cf4d046",
"eac54add",
"75832dbd",
"gpt4_98f46fc6",
"d596882b",
"88432d0a_abs",
"16c90bf4",
"f685340e_abs",
"b5ef892d",
"gpt4_f49edff3",
"gpt4_483dd43c",
"bb7c3b45",
"gpt4_7abb270c",
"gpt4_9a159967",
"07741c44",
"4d6b87c8",
"6aeb4375",
"gpt4_d6585ce9",
"60472f9c",
"caf9ead2",
"32260d93",
"60159905",
"0a34ad58",
"a40e080f",
"10d9b85a",
"a06e4cfe",
"4f54b7c9",
"6613b389",
"70b3e69b",
"gpt4_7bc6cf22",
"gpt4_0a05b494",
"778164c6",
"195a1a1b",
"8464fc84",
"b46e15ed",
"603deb26",
"eaca4986",
"2698e78f_abs",
"gpt4_21adecb5",
"2e6d26dc",
"5831f84d",
"08f4fc43",
"3f1e9474",
"c9f37c46",
"gpt4_2f56ae70",
"1b9b7252",
"35a27287",
"gpt4_d31cdae3",
"129d1232",
"4adc0475",
"27016adc",
"46a3abf7",
"9ee3ecd6",
"982b5123",
"09ba9854_abs",
"0e5e2d1a",
"e9327a54",
"86f00804",
"e982271f",
"7161e7e2",
"57f827a0",
"6a27ffc2",
"edced276",
"gpt4_d9af6064",
"75499fd8",
"60d45044",
"gpt4_70e84552_abs",
"2ce6a0f2",
"gpt4_4929293b",
"a1cc6108",
"gpt4_5dcc0aab",
"a3838d2b",
"c7dc5443",
"505af2f5",
"gpt4_68e94287",
"15745da0",
"0100672e",
"a82c026e",
"5e1b23de",
"71017276",
"89941a93",
"6b168ec8",
"affe2881",
"0edc2aef",
"gpt4_2312f94c",
"a4996e51",
"c6853660",
"ef66a6e5",
"8a137a7f",
"a96c20ee",
"fca762bc",
"ac031881",
"d905b33f",
"e493bb7c",
"a9f6b44c",
"dd2973ad",
"8aef76bc",
"f35224e0",
"8b9d4367",
"gpt4_c27434e8",
"gpt4_a56e767c",
"eace081b",
"5a4f22c0",
"58bf7951",
"c4f10528",
"50635ada",
"06f04340",
"0bc8ad93",
"e5ba910e_abs",
"5a7937c8",
"a3332713",
"4388e9dd",
"8c18457d",
"gpt4_2c50253f",
"6a1eabeb",
"b3c15d39",
"gpt4_e061b84g",
"3b6f954b",
"gpt4_76048e76",
"4dfccbf7",
"2b8f3739",
"d851d5ba",
"4fd1909e",
"94f70d80",
"66f24dbb",
"a08a253f",
"6e984302",
"001be529",
"gpt4_a2d1d1f6",
"cc539528",
"e48988bc",
"gpt4_4cd9eba1",
"8e9d538c",
"a1eacc2a",
"6d550036",
"gpt4_e05b82a6",
"81507db6",
"caf03d32",
"031748ae",
"c960da58",
"1faac195",
"gpt4_4edbafa2"
],
"seed": 42,
"dev_size": 50
}
+69 -23
View File
@@ -510,11 +510,20 @@ def palace_assign_rooms(sessions, sample_id, api_key, cache, model="claude-haiku
def llm_rerank_locomo( def llm_rerank_locomo(
question, retrieved_ids, retrieved_docs, api_key, top_k=10, model="claude-sonnet-4-6" question,
retrieved_ids,
retrieved_docs,
api_key,
top_k=10,
model="claude-sonnet-4-6",
backend="anthropic",
base_url="",
): ):
""" """
Ask LLM to pick the single most relevant document for this question. Ask LLM to pick the single most relevant document for this question.
Returns reordered retrieved_ids with the best candidate first. Returns reordered retrieved_ids with the best candidate first.
Supports backend="anthropic" (default) or "ollama" (OpenAI-compat endpoint).
""" """
candidates = retrieved_ids[:top_k] candidates = retrieved_ids[:top_k]
candidate_docs = retrieved_docs[:top_k] candidate_docs = retrieved_docs[:top_k]
@@ -522,7 +531,6 @@ def llm_rerank_locomo(
if len(candidates) <= 1: if len(candidates) <= 1:
return retrieved_ids return retrieved_ids
# Build numbered list of candidates
lines = [] lines = []
for i, (cid, doc) in enumerate(zip(candidates, candidate_docs), 1): for i, (cid, doc) in enumerate(zip(candidates, candidate_docs), 1):
snippet = doc[:300].replace("\n", " ") snippet = doc[:300].replace("\n", " ")
@@ -534,35 +542,51 @@ def llm_rerank_locomo(
f"Reply with just the number (1-{len(candidates)}).\n\n" + "\n".join(lines) f"Reply with just the number (1-{len(candidates)}).\n\n" + "\n".join(lines)
) )
payload = json.dumps( if backend == "ollama":
{ url = (base_url or "http://localhost:11434").rstrip("/") + "/v1/chat/completions"
"model": model, payload = json.dumps(
"max_tokens": 8, {
"messages": [{"role": "user", "content": prompt}], "model": model,
} "messages": [{"role": "user", "content": prompt}],
).encode("utf-8") "max_tokens": 1024,
"temperature": 0.0,
req = urllib.request.Request( }
"https://api.anthropic.com/v1/messages", ).encode("utf-8")
data=payload, headers = {"content-type": "application/json"}
headers={ if api_key:
headers["authorization"] = f"Bearer {api_key}"
else:
url = "https://api.anthropic.com/v1/messages"
payload = json.dumps(
{
"model": model,
"max_tokens": 8,
"messages": [{"role": "user", "content": prompt}],
}
).encode("utf-8")
headers = {
"x-api-key": api_key, "x-api-key": api_key,
"anthropic-version": "2023-06-01", "anthropic-version": "2023-06-01",
"content-type": "application/json", "content-type": "application/json",
}, }
method="POST",
) req = urllib.request.Request(url, data=payload, headers=headers, method="POST")
import socket as _socket import socket as _socket
for _attempt in range(3): for _attempt in range(3):
try: try:
with urllib.request.urlopen(req, timeout=30) as resp: with urllib.request.urlopen(req, timeout=120 if backend == "ollama" else 30) as resp:
result = json.loads(resp.read()) result = json.loads(resp.read())
raw = result["content"][0]["text"].strip() if backend == "ollama":
m = re.search(r"\b(\d+)\b", raw) msg = result["choices"][0]["message"]
raw = (msg.get("content") or "").strip() or (msg.get("reasoning") or "").strip()
else:
raw = result["content"][0]["text"].strip()
# Take LAST integer — reasoning models often count candidates first
m = re.search(r"\b(\d+)\b", raw[::-1])
if m: if m:
pick = int(m.group(1)) pick = int(m.group(1)[::-1])
if 1 <= pick <= len(candidates): if 1 <= pick <= len(candidates):
chosen_id = candidates[pick - 1] chosen_id = candidates[pick - 1]
reordered = [chosen_id] + [cid for cid in retrieved_ids if cid != chosen_id] reordered = [chosen_id] + [cid for cid in retrieved_ids if cid != chosen_id]
@@ -608,6 +632,8 @@ def run_benchmark(
palace_cache_file=None, palace_cache_file=None,
palace_model="claude-haiku-4-5-20251001", palace_model="claude-haiku-4-5-20251001",
embed_model="default", embed_model="default",
llm_backend="anthropic",
llm_base_url="",
): ):
"""Run LoCoMo retrieval benchmark.""" """Run LoCoMo retrieval benchmark."""
with open(data_file) as f: with open(data_file) as f:
@@ -619,8 +645,12 @@ def run_benchmark(
api_key = "" api_key = ""
if llm_rerank_enabled or mode == "palace": if llm_rerank_enabled or mode == "palace":
api_key = _load_api_key(llm_key) api_key = _load_api_key(llm_key)
if not api_key: # Ollama backend doesn't require an Anthropic key. Palace mode still does
print(f"ERROR: --mode {mode} requires an API key (--llm-key or ANTHROPIC_API_KEY).") # (it uses Anthropic for room-assignment indexing) — so only relax the
# requirement when rerank is the ONLY llm use and backend is ollama.
needs_key = mode == "palace" or (llm_rerank_enabled and llm_backend == "anthropic")
if needs_key and not api_key:
print(f"ERROR: --mode {mode} / --llm-rerank (anthropic) requires an API key.")
sys.exit(1) sys.exit(1)
# Palace mode: load or create room assignment cache # Palace mode: load or create room assignment cache
@@ -888,6 +918,8 @@ def run_benchmark(
api_key, api_key,
top_k=rerank_pool, top_k=rerank_pool,
model=llm_model, model=llm_model,
backend=llm_backend,
base_url=llm_base_url,
) )
# Compute recall # Compute recall
@@ -1013,6 +1045,18 @@ if __name__ == "__main__":
help="Model for LLM rerank (default: claude-sonnet-4-6)", help="Model for LLM rerank (default: claude-sonnet-4-6)",
) )
parser.add_argument("--llm-key", default="", help="API key (or set ANTHROPIC_API_KEY env var)") parser.add_argument("--llm-key", default="", help="API key (or set ANTHROPIC_API_KEY env var)")
parser.add_argument(
"--llm-backend",
choices=["anthropic", "ollama"],
default="anthropic",
help="Which API for --llm-rerank. 'anthropic' (default) or 'ollama' "
"(OpenAI-compat /v1/chat/completions — works for local + Ollama Cloud).",
)
parser.add_argument(
"--llm-base-url",
default="",
help="Override base URL for --llm-backend ollama. Default: http://localhost:11434.",
)
parser.add_argument( parser.add_argument(
"--hybrid-weight", "--hybrid-weight",
type=float, type=float,
@@ -1049,4 +1093,6 @@ if __name__ == "__main__":
palace_cache_file=args.palace_cache, palace_cache_file=args.palace_cache,
palace_model=args.palace_model, palace_model=args.palace_model,
embed_model=args.embed_model, embed_model=args.embed_model,
llm_backend=args.llm_backend,
llm_base_url=args.llm_base_url,
) )
+101 -42
View File
@@ -2763,7 +2763,15 @@ def build_palace_and_retrieve_diary(
def llm_rerank( def llm_rerank(
question, rankings, corpus, corpus_ids, api_key, top_k=10, model="claude-haiku-4-5-20251001" question,
rankings,
corpus,
corpus_ids,
api_key,
top_k=10,
model="claude-haiku-4-5-20251001",
backend="anthropic",
base_url="",
): ):
""" """
Use an LLM to re-rank the top-k retrieved sessions. Use an LLM to re-rank the top-k retrieved sessions.
@@ -2772,19 +2780,22 @@ def llm_rerank(
which single session is most relevant to the question. That session which single session is most relevant to the question. That session
is promoted to rank 1; the rest stay in their existing order. is promoted to rank 1; the rest stay in their existing order.
This closes the gap for "preference" and jargon-dense "assistant" Supports two backends:
failures where the right session is in top-10 semantically but not - "anthropic": hits https://api.anthropic.com/v1/messages with x-api-key.
top-5 — because the semantic gap (battery life ↔ phone hardware) is - "ollama": hits {base_url}/v1/chat/completions (OpenAI-compat) —
too large for embeddings to bridge. works for local Ollama (default http://localhost:11434)
and Ollama Cloud (:cloud model tags).
Args: Args:
question: The benchmark question string question: The benchmark question string
rankings: Current ranked list of corpus indices (from any mode) rankings: Current ranked list of corpus indices (from any mode)
corpus: List of document strings corpus: List of document strings
corpus_ids: List of corpus IDs (parallel to corpus) corpus_ids: List of corpus IDs (parallel to corpus)
api_key: Anthropic API key string api_key: Anthropic API key (only required for backend="anthropic")
top_k: How many top sessions to send to LLM (default: 10) top_k: How many top sessions to send to LLM (default: 10)
model: Claude model ID for reranking (default: haiku) model: Model id (Claude model for anthropic, e.g. "minimax-m2.7:cloud" for ollama)
backend: "anthropic" or "ollama"
base_url: Override base URL (ollama default: http://localhost:11434)
Returns: Returns:
Reordered rankings list with LLM's best pick promoted to rank 1. Reordered rankings list with LLM's best pick promoted to rank 1.
@@ -2796,7 +2807,6 @@ def llm_rerank(
if not candidates: if not candidates:
return rankings return rankings
# Format sessions for the prompt — first 500 chars each, labelled 1..N
session_blocks = [] session_blocks = []
for rank, idx in enumerate(candidates): for rank, idx in enumerate(candidates):
text = corpus[idx][:500].replace("\n", " ").strip() text = corpus[idx][:500].replace("\n", " ").strip()
@@ -2813,49 +2823,68 @@ def llm_rerank(
f"Most relevant session number:" f"Most relevant session number:"
) )
payload = json.dumps( if backend == "ollama":
{ url = (base_url or "http://localhost:11434").rstrip("/") + "/v1/chat/completions"
"model": model, payload = json.dumps(
"max_tokens": 8, {
"messages": [{"role": "user", "content": prompt}], "model": model,
} "messages": [{"role": "user", "content": prompt}],
).encode("utf-8") "max_tokens": 1024,
"temperature": 0.0,
req = urllib.request.Request( }
"https://api.anthropic.com/v1/messages", ).encode("utf-8")
data=payload, headers = {"content-type": "application/json"}
headers={ if api_key:
headers["authorization"] = f"Bearer {api_key}"
else:
url = "https://api.anthropic.com/v1/messages"
payload = json.dumps(
{
"model": model,
"max_tokens": 8,
"messages": [{"role": "user", "content": prompt}],
}
).encode("utf-8")
headers = {
"x-api-key": api_key, "x-api-key": api_key,
"anthropic-version": "2023-06-01", "anthropic-version": "2023-06-01",
"content-type": "application/json", "content-type": "application/json",
}, }
method="POST",
) req = urllib.request.Request(url, data=payload, headers=headers, method="POST")
import socket as _socket import socket as _socket
for _attempt in range(3): for _attempt in range(3):
try: try:
with urllib.request.urlopen(req, timeout=20) as resp: with urllib.request.urlopen(req, timeout=120 if backend == "ollama" else 20) as resp:
result = json.loads(resp.read()) result = json.loads(resp.read())
raw = result["content"][0]["text"].strip() if backend == "ollama":
# Parse just the first integer from Haiku's response msg = result["choices"][0]["message"]
m = re.search(r"\b(\d+)\b", raw) # Reasoning models (e.g. minimax-m2.7) may emit final answer in "content"
# or embed it in "reasoning". Try content first, fall back to reasoning.
raw = (msg.get("content") or "").strip()
if not raw:
raw = (msg.get("reasoning") or "").strip()
else:
raw = result["content"][0]["text"].strip()
m = re.search(
r"\b(\d+)\b", raw[::-1]
) # take LAST integer (rerank models often reason first)
if m: if m:
pick = int(m.group(1)) pick = int(m.group(1)[::-1])
if 1 <= pick <= len(candidates): if 1 <= pick <= len(candidates):
chosen_idx = candidates[pick - 1] chosen_idx = candidates[pick - 1]
reordered = [chosen_idx] + [i for i in rankings if i != chosen_idx] reordered = [chosen_idx] + [i for i in rankings if i != chosen_idx]
return reordered return reordered
break # Got a response, even if unparseable — don't retry break
except (_socket.timeout, TimeoutError): except (_socket.timeout, TimeoutError):
if _attempt < 2: if _attempt < 2:
import time as _time import time as _time
_time.sleep(3) # brief pause then retry _time.sleep(3)
# else fall through to return rankings
except (urllib.error.URLError, KeyError, ValueError, IndexError, OSError): except (urllib.error.URLError, KeyError, ValueError, IndexError, OSError):
break # Non-timeout error — fall back immediately break
return rankings return rankings
@@ -2919,6 +2948,8 @@ def run_benchmark(
skip_precompute=False, skip_precompute=False,
split_file=None, split_file=None,
split_subset=None, split_subset=None,
llm_backend="anthropic",
llm_base_url="",
): ):
"""Run the full benchmark. """Run the full benchmark.
@@ -2947,10 +2978,14 @@ def run_benchmark(
api_key = "" api_key = ""
if llm_rerank_enabled or mode == "diary": if llm_rerank_enabled or mode == "diary":
api_key = _load_api_key(llm_key) api_key = _load_api_key(llm_key)
if not api_key: # Ollama backend doesn't require an Anthropic API key; a local/cloud Ollama
# daemon with the requested model pulled is enough. Diary mode is always anthropic.
needs_key = (llm_backend == "anthropic") or (mode == "diary")
if needs_key and not api_key:
print( print(
"ERROR: --llm-rerank / --mode diary requires an API key. " "ERROR: --llm-rerank (anthropic backend) / --mode diary requires an API key. "
"Set ANTHROPIC_API_KEY or use --llm-key." "Set ANTHROPIC_API_KEY or use --llm-key. For ollama backend, pass "
"--llm-backend ollama."
) )
sys.exit(1) sys.exit(1)
@@ -3100,7 +3135,15 @@ def run_benchmark(
if llm_rerank_enabled: if llm_rerank_enabled:
rerank_pool = 20 if mode in ("hybrid_v3", "hybrid_v4", "palace") else 10 rerank_pool = 20 if mode in ("hybrid_v3", "hybrid_v4", "palace") else 10
rankings = llm_rerank( rankings = llm_rerank(
question, rankings, corpus, corpus_ids, api_key, top_k=rerank_pool, model=llm_model question,
rankings,
corpus,
corpus_ids,
api_key,
top_k=rerank_pool,
model=llm_model,
backend=llm_backend,
base_url=llm_base_url,
) )
# Evaluate at session level # Evaluate at session level
@@ -3276,7 +3319,21 @@ if __name__ == "__main__":
default="claude-haiku-4-5-20251001", default="claude-haiku-4-5-20251001",
help="Model for LLM re-ranking and diary ingest " help="Model for LLM re-ranking and diary ingest "
"(default: claude-haiku-4-5-20251001). " "(default: claude-haiku-4-5-20251001). "
"Use 'claude-sonnet-4-6' for Sonnet comparison.", "Use 'claude-sonnet-4-6' for Sonnet comparison. "
"With --llm-backend ollama, use an Ollama model tag like 'minimax-m2.7:cloud'.",
)
parser.add_argument(
"--llm-backend",
choices=["anthropic", "ollama"],
default="anthropic",
help="Which API to hit for --llm-rerank. 'anthropic' (default) uses Anthropic's "
"/v1/messages endpoint. 'ollama' uses Ollama's OpenAI-compatible "
"/v1/chat/completions endpoint (works with local Ollama and Ollama Cloud).",
)
parser.add_argument(
"--llm-base-url",
default="",
help="Override base URL for --llm-backend ollama. Defaults to http://localhost:11434.",
) )
parser.add_argument( parser.add_argument(
"--diary-cache", "--diary-cache",
@@ -3380,4 +3437,6 @@ if __name__ == "__main__":
args.skip_precompute, args.skip_precompute,
split_file=args.split_file, split_file=args.split_file,
split_subset=split_subset, split_subset=split_subset,
llm_backend=args.llm_backend,
llm_base_url=args.llm_base_url,
) )
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long