benchmarks: add --llm-backend ollama for non-Anthropic rerank

The rerank pipeline was hardcoded to Anthropic's /v1/messages.
Add a backend flag so the same code path can be exercised with
any OpenAI-compatible endpoint — local Ollama, Ollama Cloud,
or any gateway that speaks /v1/chat/completions.

Enables independent verification of the "100% with Haiku rerank"
claim by running the full benchmark with a different LLM family
(e.g. minimax-m2.7:cloud) and zero Anthropic dependency.

Both longmemeval_bench.py and locomo_bench.py:
 - llm_rerank*() gain backend= / base_url= kwargs
 - CLI: --llm-backend {anthropic,ollama}, --llm-base-url
 - API key required for rerank only when backend=anthropic (diary/palace modes still require it regardless of rerank backend)
 - Parse last integer in response (reasoning models emit multi-int output)
 - Fall back to message.reasoning when content is empty
 - Raise max_tokens to 1024 for reasoning models
This commit is contained in:
Igor Lins e Silva
2026-04-14 21:20:14 -03:00
parent 4aa7e1eebd
commit 8df7b9bf2c
3 changed files with 169 additions and 66 deletions
+69 -23
View File
@@ -510,11 +510,20 @@ def palace_assign_rooms(sessions, sample_id, api_key, cache, model="claude-haiku
def llm_rerank_locomo(
question, retrieved_ids, retrieved_docs, api_key, top_k=10, model="claude-sonnet-4-6"
question,
retrieved_ids,
retrieved_docs,
api_key,
top_k=10,
model="claude-sonnet-4-6",
backend="anthropic",
base_url="",
):
"""
Ask LLM to pick the single most relevant document for this question.
Returns reordered retrieved_ids with the best candidate first.
Supports backend="anthropic" (default) or "ollama" (OpenAI-compat endpoint).
"""
candidates = retrieved_ids[:top_k]
candidate_docs = retrieved_docs[:top_k]
@@ -522,7 +531,6 @@ def llm_rerank_locomo(
if len(candidates) <= 1:
return retrieved_ids
# Build numbered list of candidates
lines = []
for i, (cid, doc) in enumerate(zip(candidates, candidate_docs), 1):
snippet = doc[:300].replace("\n", " ")
@@ -534,35 +542,51 @@ def llm_rerank_locomo(
f"Reply with just the number (1-{len(candidates)}).\n\n" + "\n".join(lines)
)
payload = json.dumps(
{
"model": model,
"max_tokens": 8,
"messages": [{"role": "user", "content": prompt}],
}
).encode("utf-8")
req = urllib.request.Request(
"https://api.anthropic.com/v1/messages",
data=payload,
headers={
if backend == "ollama":
url = (base_url or "http://localhost:11434").rstrip("/") + "/v1/chat/completions"
payload = json.dumps(
{
"model": model,
"messages": [{"role": "user", "content": prompt}],
"max_tokens": 1024,
"temperature": 0.0,
}
).encode("utf-8")
headers = {"content-type": "application/json"}
if api_key:
headers["authorization"] = f"Bearer {api_key}"
else:
url = "https://api.anthropic.com/v1/messages"
payload = json.dumps(
{
"model": model,
"max_tokens": 8,
"messages": [{"role": "user", "content": prompt}],
}
).encode("utf-8")
headers = {
"x-api-key": api_key,
"anthropic-version": "2023-06-01",
"content-type": "application/json",
},
method="POST",
)
}
req = urllib.request.Request(url, data=payload, headers=headers, method="POST")
import socket as _socket
for _attempt in range(3):
try:
with urllib.request.urlopen(req, timeout=30) as resp:
with urllib.request.urlopen(req, timeout=120 if backend == "ollama" else 30) as resp:
result = json.loads(resp.read())
raw = result["content"][0]["text"].strip()
m = re.search(r"\b(\d+)\b", raw)
if backend == "ollama":
msg = result["choices"][0]["message"]
raw = (msg.get("content") or "").strip() or (msg.get("reasoning") or "").strip()
else:
raw = result["content"][0]["text"].strip()
# Take LAST integer — reasoning models often count candidates first
m = re.search(r"\b(\d+)\b", raw[::-1])
if m:
pick = int(m.group(1))
pick = int(m.group(1)[::-1])
if 1 <= pick <= len(candidates):
chosen_id = candidates[pick - 1]
reordered = [chosen_id] + [cid for cid in retrieved_ids if cid != chosen_id]
@@ -608,6 +632,8 @@ def run_benchmark(
palace_cache_file=None,
palace_model="claude-haiku-4-5-20251001",
embed_model="default",
llm_backend="anthropic",
llm_base_url="",
):
"""Run LoCoMo retrieval benchmark."""
with open(data_file) as f:
@@ -619,8 +645,12 @@ def run_benchmark(
api_key = ""
if llm_rerank_enabled or mode == "palace":
api_key = _load_api_key(llm_key)
if not api_key:
print(f"ERROR: --mode {mode} requires an API key (--llm-key or ANTHROPIC_API_KEY).")
# Ollama backend doesn't require an Anthropic key. Palace mode still does
# (it uses Anthropic for room-assignment indexing) — so only relax the
# requirement when rerank is the ONLY llm use and backend is ollama.
needs_key = mode == "palace" or (llm_rerank_enabled and llm_backend == "anthropic")
if needs_key and not api_key:
print(f"ERROR: --mode {mode} / --llm-rerank (anthropic) requires an API key.")
sys.exit(1)
# Palace mode: load or create room assignment cache
@@ -888,6 +918,8 @@ def run_benchmark(
api_key,
top_k=rerank_pool,
model=llm_model,
backend=llm_backend,
base_url=llm_base_url,
)
# Compute recall
@@ -1013,6 +1045,18 @@ if __name__ == "__main__":
help="Model for LLM rerank (default: claude-sonnet-4-6)",
)
parser.add_argument("--llm-key", default="", help="API key (or set ANTHROPIC_API_KEY env var)")
parser.add_argument(
"--llm-backend",
choices=["anthropic", "ollama"],
default="anthropic",
help="Which API for --llm-rerank. 'anthropic' (default) or 'ollama' "
"(OpenAI-compat /v1/chat/completions — works for local + Ollama Cloud).",
)
parser.add_argument(
"--llm-base-url",
default="",
help="Override base URL for --llm-backend ollama. Default: http://localhost:11434.",
)
parser.add_argument(
"--hybrid-weight",
type=float,
@@ -1049,4 +1093,6 @@ if __name__ == "__main__":
palace_cache_file=args.palace_cache,
palace_model=args.palace_model,
embed_model=args.embed_model,
llm_backend=args.llm_backend,
llm_base_url=args.llm_base_url,
)