benchmarks: add --llm-backend ollama for non-Anthropic rerank
The rerank pipeline was hardcoded to Anthropic's /v1/messages.
Add a backend flag so the same code path can be exercised with
any OpenAI-compatible endpoint — local Ollama, Ollama Cloud,
or any gateway that speaks /v1/chat/completions.
Enables independent verification of the "100% with Haiku rerank"
claim by running the full benchmark with a different LLM family
(e.g. minimax-m2.7:cloud) and zero Anthropic dependency.
Both longmemeval_bench.py and locomo_bench.py:
- llm_rerank*() gain backend= / base_url= kwargs
- CLI: --llm-backend {anthropic,ollama}, --llm-base-url
- API key required only when backend=anthropic (diary/palace modes still require it)
- Parse last integer in response (reasoning models emit multi-int output)
- Fallback to message.reasoning when content is empty
- Raise max_tokens to 1024 for reasoning models
This commit is contained in:
+69
-23
@@ -510,11 +510,20 @@ def palace_assign_rooms(sessions, sample_id, api_key, cache, model="claude-haiku
|
||||
|
||||
|
||||
def llm_rerank_locomo(
|
||||
question, retrieved_ids, retrieved_docs, api_key, top_k=10, model="claude-sonnet-4-6"
|
||||
question,
|
||||
retrieved_ids,
|
||||
retrieved_docs,
|
||||
api_key,
|
||||
top_k=10,
|
||||
model="claude-sonnet-4-6",
|
||||
backend="anthropic",
|
||||
base_url="",
|
||||
):
|
||||
"""
|
||||
Ask LLM to pick the single most relevant document for this question.
|
||||
Returns reordered retrieved_ids with the best candidate first.
|
||||
|
||||
Supports backend="anthropic" (default) or "ollama" (OpenAI-compat endpoint).
|
||||
"""
|
||||
candidates = retrieved_ids[:top_k]
|
||||
candidate_docs = retrieved_docs[:top_k]
|
||||
@@ -522,7 +531,6 @@ def llm_rerank_locomo(
|
||||
if len(candidates) <= 1:
|
||||
return retrieved_ids
|
||||
|
||||
# Build numbered list of candidates
|
||||
lines = []
|
||||
for i, (cid, doc) in enumerate(zip(candidates, candidate_docs), 1):
|
||||
snippet = doc[:300].replace("\n", " ")
|
||||
@@ -534,35 +542,51 @@ def llm_rerank_locomo(
|
||||
f"Reply with just the number (1-{len(candidates)}).\n\n" + "\n".join(lines)
|
||||
)
|
||||
|
||||
payload = json.dumps(
|
||||
{
|
||||
"model": model,
|
||||
"max_tokens": 8,
|
||||
"messages": [{"role": "user", "content": prompt}],
|
||||
}
|
||||
).encode("utf-8")
|
||||
|
||||
req = urllib.request.Request(
|
||||
"https://api.anthropic.com/v1/messages",
|
||||
data=payload,
|
||||
headers={
|
||||
if backend == "ollama":
|
||||
url = (base_url or "http://localhost:11434").rstrip("/") + "/v1/chat/completions"
|
||||
payload = json.dumps(
|
||||
{
|
||||
"model": model,
|
||||
"messages": [{"role": "user", "content": prompt}],
|
||||
"max_tokens": 1024,
|
||||
"temperature": 0.0,
|
||||
}
|
||||
).encode("utf-8")
|
||||
headers = {"content-type": "application/json"}
|
||||
if api_key:
|
||||
headers["authorization"] = f"Bearer {api_key}"
|
||||
else:
|
||||
url = "https://api.anthropic.com/v1/messages"
|
||||
payload = json.dumps(
|
||||
{
|
||||
"model": model,
|
||||
"max_tokens": 8,
|
||||
"messages": [{"role": "user", "content": prompt}],
|
||||
}
|
||||
).encode("utf-8")
|
||||
headers = {
|
||||
"x-api-key": api_key,
|
||||
"anthropic-version": "2023-06-01",
|
||||
"content-type": "application/json",
|
||||
},
|
||||
method="POST",
|
||||
)
|
||||
}
|
||||
|
||||
req = urllib.request.Request(url, data=payload, headers=headers, method="POST")
|
||||
|
||||
import socket as _socket
|
||||
|
||||
for _attempt in range(3):
|
||||
try:
|
||||
with urllib.request.urlopen(req, timeout=30) as resp:
|
||||
with urllib.request.urlopen(req, timeout=120 if backend == "ollama" else 30) as resp:
|
||||
result = json.loads(resp.read())
|
||||
raw = result["content"][0]["text"].strip()
|
||||
m = re.search(r"\b(\d+)\b", raw)
|
||||
if backend == "ollama":
|
||||
msg = result["choices"][0]["message"]
|
||||
raw = (msg.get("content") or "").strip() or (msg.get("reasoning") or "").strip()
|
||||
else:
|
||||
raw = result["content"][0]["text"].strip()
|
||||
# Take LAST integer — reasoning models often count candidates first
|
||||
m = re.search(r"\b(\d+)\b", raw[::-1])
|
||||
if m:
|
||||
pick = int(m.group(1))
|
||||
pick = int(m.group(1)[::-1])
|
||||
if 1 <= pick <= len(candidates):
|
||||
chosen_id = candidates[pick - 1]
|
||||
reordered = [chosen_id] + [cid for cid in retrieved_ids if cid != chosen_id]
|
||||
@@ -608,6 +632,8 @@ def run_benchmark(
|
||||
palace_cache_file=None,
|
||||
palace_model="claude-haiku-4-5-20251001",
|
||||
embed_model="default",
|
||||
llm_backend="anthropic",
|
||||
llm_base_url="",
|
||||
):
|
||||
"""Run LoCoMo retrieval benchmark."""
|
||||
with open(data_file) as f:
|
||||
@@ -619,8 +645,12 @@ def run_benchmark(
|
||||
api_key = ""
|
||||
if llm_rerank_enabled or mode == "palace":
|
||||
api_key = _load_api_key(llm_key)
|
||||
if not api_key:
|
||||
print(f"ERROR: --mode {mode} requires an API key (--llm-key or ANTHROPIC_API_KEY).")
|
||||
# Ollama backend doesn't require an Anthropic key. Palace mode still does
|
||||
# (it uses Anthropic for room-assignment indexing) — so only relax the
|
||||
# requirement when rerank is the ONLY llm use and backend is ollama.
|
||||
needs_key = mode == "palace" or (llm_rerank_enabled and llm_backend == "anthropic")
|
||||
if needs_key and not api_key:
|
||||
print(f"ERROR: --mode {mode} / --llm-rerank (anthropic) requires an API key.")
|
||||
sys.exit(1)
|
||||
|
||||
# Palace mode: load or create room assignment cache
|
||||
@@ -888,6 +918,8 @@ def run_benchmark(
|
||||
api_key,
|
||||
top_k=rerank_pool,
|
||||
model=llm_model,
|
||||
backend=llm_backend,
|
||||
base_url=llm_base_url,
|
||||
)
|
||||
|
||||
# Compute recall
|
||||
@@ -1013,6 +1045,18 @@ if __name__ == "__main__":
|
||||
help="Model for LLM rerank (default: claude-sonnet-4-6)",
|
||||
)
|
||||
parser.add_argument("--llm-key", default="", help="API key (or set ANTHROPIC_API_KEY env var)")
|
||||
parser.add_argument(
|
||||
"--llm-backend",
|
||||
choices=["anthropic", "ollama"],
|
||||
default="anthropic",
|
||||
help="Which API for --llm-rerank. 'anthropic' (default) or 'ollama' "
|
||||
"(OpenAI-compat /v1/chat/completions — works for local + Ollama Cloud).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--llm-base-url",
|
||||
default="",
|
||||
help="Override base URL for --llm-backend ollama. Default: http://localhost:11434.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--hybrid-weight",
|
||||
type=float,
|
||||
@@ -1049,4 +1093,6 @@ if __name__ == "__main__":
|
||||
palace_cache_file=args.palace_cache,
|
||||
palace_model=args.palace_model,
|
||||
embed_model=args.embed_model,
|
||||
llm_backend=args.llm_backend,
|
||||
llm_base_url=args.llm_base_url,
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user