Files
mempalace/tests/benchmarks/test_palace_boost.py
T
Igor Lins e Silva 7b89291334 bench: add scale benchmark suite (94 tests)
Benchmark mempalace at configurable scale (1K–100K drawers) to find
real-world performance limits. Tests cover MCP tool OOM thresholds,
ChromaDB query degradation, search recall@k, mining throughput,
knowledge graph concurrency, memory leak detection, palace boost
quantification, and Layer1 unbounded fetch behavior.

- tests/benchmarks/ with 8 test modules + data generator + report system
- Deterministic data factory with planted needles for recall measurement
- JSON report output with regression detection (--bench-report flag)
- CI benchmark job on PRs at small scale
- psutil added as dev dependency for RSS tracking
2026-04-08 05:06:31 -03:00

173 lines
6.9 KiB
Python

"""
Palace boost validation — does wing/room filtering actually help?
Quantifies the retrieval improvement from the palace spatial metaphor.
Uses planted needles to measure recall with and without filtering
at different scales.
"""
import time
import pytest
from tests.benchmarks.data_generator import PalaceDataGenerator
from tests.benchmarks.report import record_metric
@pytest.mark.benchmark
class TestFilteredVsUnfilteredRecall:
    """Measure how much wing/room filtering improves needle recall."""

    SIZES = [1_000, 2_500, 5_000]

    @pytest.mark.parametrize("n_drawers", SIZES)
    def test_palace_boost_recall(self, n_drawers, tmp_path, bench_scale):
        """Record recall@5 for unfiltered, wing-filtered and wing+room-filtered search.

        A palace with ``n_drawers`` drawers (needles included) is built under
        ``tmp_path``; up to ten needle queries are then issued three ways each.
        Recall per mode and the boost over the unfiltered baseline are written
        to the ``palace_boost`` metric group. Nothing is asserted.
        """
        generator = PalaceDataGenerator(seed=42, scale=bench_scale)
        palace_path = str(tmp_path / "palace")
        populated = generator.populate_palace_directly(
            palace_path, n_drawers=n_drawers, include_needles=True
        )
        _, _, needle_info = populated

        from mempalace.searcher import search_memories

        def top5_has_needle(response):
            # NOTE(review): this matches ANY planted needle marker, not
            # specifically the needle the query targets — confirm intended.
            snippets = [hit["text"] for hit in response.get("results", [])]
            return any("NEEDLE_" in snippet for snippet in snippets[:5])

        n_queries = min(10, len(needle_info))
        hits = {"none": 0, "wing": 0, "room": 0}

        for needle in needle_info[:n_queries]:
            # Baseline: whole palace.
            response = search_memories(
                needle["query"], palace_path=palace_path, n_results=5
            )
            if top5_has_needle(response):
                hits["none"] += 1

            # Restricted to the needle's wing.
            response = search_memories(
                needle["query"],
                palace_path=palace_path,
                wing=needle["wing"],
                n_results=5,
            )
            if top5_has_needle(response):
                hits["wing"] += 1

            # Restricted to the needle's wing and room.
            response = search_memories(
                needle["query"],
                palace_path=palace_path,
                wing=needle["wing"],
                room=needle["room"],
                n_results=5,
            )
            if top5_has_needle(response):
                hits["room"] += 1

        # max(..., 1) keeps the division defined when no needles were planted.
        denominator = max(n_queries, 1)
        recall_none = hits["none"] / denominator
        recall_wing = hits["wing"] / denominator
        recall_room = hits["room"] / denominator

        metrics = (
            (f"recall_unfiltered_at_{n_drawers}", recall_none),
            (f"recall_wing_filtered_at_{n_drawers}", recall_wing),
            (f"recall_room_filtered_at_{n_drawers}", recall_room),
            (f"wing_boost_at_{n_drawers}", recall_wing - recall_none),
            (f"room_boost_at_{n_drawers}", recall_room - recall_none),
        )
        for metric_name, value in metrics:
            record_metric("palace_boost", metric_name, round(value, 3))
@pytest.mark.benchmark
class TestFilterLatencyBenefit:
    """Does filtering reduce query latency by narrowing the search space?"""

    @staticmethod
    def _time_searches(search, query, palace_path, n_runs, **filters):
        """Return per-call wall-clock latencies in ms for ``n_runs`` searches.

        ``filters`` is forwarded verbatim to ``search`` (e.g. ``wing=...``,
        ``room=...``); every call asks for 5 results.
        """
        latencies_ms = []
        for _ in range(n_runs):
            start = time.perf_counter()
            search(query, palace_path=palace_path, n_results=5, **filters)
            latencies_ms.append((time.perf_counter() - start) * 1000)
        return latencies_ms

    def test_filter_speedup(self, tmp_path, bench_scale):
        """Compare latency: no filter vs wing vs wing+room.

        Builds a 5,000-drawer palace (no needles) under ``tmp_path``, runs the
        same query ten times per filter mode and records the mean latency of
        each mode plus the relative speedup over the unfiltered baseline in
        the ``filter_latency`` metric group. Nothing is asserted.
        """
        gen = PalaceDataGenerator(seed=42, scale=bench_scale)
        palace_path = str(tmp_path / "palace")
        gen.populate_palace_directly(palace_path, n_drawers=5_000, include_needles=False)

        from mempalace.searcher import search_memories

        wing = gen.wings[0]
        room = gen.rooms_by_wing[wing][0]
        query = "authentication middleware optimization"
        n_runs = 10

        # One untimed call first, so any one-time first-call cost (store or
        # model initialisation, cold caches) is not charged solely to the
        # unfiltered series, which is measured first. Without this the
        # baseline is inflated and the reported speedups are overstated.
        search_memories(query, palace_path=palace_path, n_results=5)

        latencies_none = self._time_searches(search_memories, query, palace_path, n_runs)
        latencies_wing = self._time_searches(
            search_memories, query, palace_path, n_runs, wing=wing
        )
        latencies_room = self._time_searches(
            search_memories, query, palace_path, n_runs, wing=wing, room=room
        )

        avg_none = sum(latencies_none) / len(latencies_none)
        avg_wing = sum(latencies_wing) / len(latencies_wing)
        avg_room = sum(latencies_room) / len(latencies_room)

        record_metric("filter_latency", "avg_unfiltered_ms", round(avg_none, 1))
        record_metric("filter_latency", "avg_wing_filtered_ms", round(avg_wing, 1))
        record_metric("filter_latency", "avg_room_filtered_ms", round(avg_room, 1))

        # Speedup is relative to the baseline; skip it if the baseline is zero.
        if avg_none > 0:
            record_metric(
                "filter_latency", "wing_speedup_pct", round((1 - avg_wing / avg_none) * 100, 1)
            )
            record_metric(
                "filter_latency", "room_speedup_pct", round((1 - avg_room / avg_none) * 100, 1)
            )
@pytest.mark.benchmark
class TestBoostAtIncreasingScale:
    """Check whether the wing-filter recall boost grows with palace size."""

    def test_boost_scaling(self, tmp_path, bench_scale):
        """Record the wing-filter recall@5 boost at 500, 1,000 and 2,500 drawers.

        For each size a fresh palace is built under ``tmp_path`` and up to
        eight needle queries are run unfiltered and wing-filtered. The
        per-size boosts and a flag saying whether the largest size's boost is
        at least the smallest size's are written to the ``boost_scaling``
        metric group. Nothing is asserted.
        """

        def needle_in_top5(response):
            top_hits = response.get("results", [])[:5]
            return any("NEEDLE_" in hit["text"] for hit in top_hits)

        boosts = []
        for size in (500, 1_000, 2_500):
            generator = PalaceDataGenerator(seed=42, scale=bench_scale)
            palace_path = str(tmp_path / f"palace_{size}")
            populated = generator.populate_palace_directly(
                palace_path, n_drawers=size, include_needles=True
            )
            needle_info = populated[2]

            from mempalace.searcher import search_memories

            n_queries = min(8, len(needle_info))
            baseline_hits = 0
            wing_hits = 0
            for needle in needle_info[:n_queries]:
                response = search_memories(
                    needle["query"], palace_path=palace_path, n_results=5
                )
                if needle_in_top5(response):
                    baseline_hits += 1

                response = search_memories(
                    needle["query"],
                    palace_path=palace_path,
                    wing=needle["wing"],
                    n_results=5,
                )
                if needle_in_top5(response):
                    wing_hits += 1

            # max(..., 1) keeps the division defined when no needles exist.
            denominator = max(n_queries, 1)
            recall_none = baseline_hits / denominator
            recall_filtered = wing_hits / denominator
            boosts.append({"size": size, "boost": recall_filtered - recall_none})

        record_metric("boost_scaling", "boosts_by_size", boosts)

        # The trend needs at least two data points to compare.
        if len(boosts) < 2:
            return
        smallest, largest = boosts[0], boosts[-1]
        trend_positive = largest["boost"] >= smallest["boost"]
        record_metric("boost_scaling", "trend_positive", trend_positive)