From ead2c5d2997755b7277a38963e2030459c237d6b Mon Sep 17 00:00:00 2001 From: Stephen Coogan Date: Sat, 18 Apr 2026 18:50:21 +0100 Subject: [PATCH] fix(miner): use token-boundary matching in detect_room MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Substring checks in path/filename routing caused systemic misrouting in large monorepos — e.g., "views" ⊂ "interviews" sent every file under views/ to the interviews room. Switch to separator-bounded token matching (-, _, ., /) via a _name_matches helper, applied to priority 1 (path parts) and priority 2 (filename). --- mempalace/miner.py | 27 ++++++++++++- tests/test_miner.py | 99 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 123 insertions(+), 3 deletions(-) diff --git a/mempalace/miner.py b/mempalace/miner.py index 6aeddd4..f9c44e2 100644 --- a/mempalace/miner.py +++ b/mempalace/miner.py @@ -8,6 +8,7 @@ Stores verbatim chunks as drawers. No summaries. Ever. """ import os +import re import sys import shlex import hashlib @@ -332,6 +333,28 @@ def load_config(project_dir: str) -> dict: # FILE ROUTING — which room does this file belong to? # ============================================================================= +_TOKEN_SPLIT = re.compile(r"[-_./]+") + + +def _tokens(value: str) -> set: + """Split ``value`` into lowercased tokens bounded by ``-``, ``_``, ``.`` or ``/``.""" + return {t for t in _TOKEN_SPLIT.split(value.lower()) if t} + + +def _name_matches(a: str, b: str) -> bool: + """Return True when ``a`` and ``b`` match as equal strings or as + separator-bounded tokens of each other. + + Prevents incidental substring collisions (e.g., ``"views" in "interviews"``) + that a raw ``in`` check would produce, while preserving the intended + match for real tokens (e.g., ``"frontend"`` in ``"frontend-app"``). 
+ """ + a = a.lower() + b = b.lower() + if a == b: + return True + return b in _tokens(a) or a in _tokens(b) + def detect_room(filepath: Path, content: str, rooms: list, project_path: Path) -> str: """ @@ -351,12 +374,12 @@ def detect_room(filepath: Path, content: str, rooms: list, project_path: Path) - for part in path_parts[:-1]: # skip filename itself for room in rooms: candidates = [room["name"].lower()] + [k.lower() for k in room.get("keywords", [])] - if any(part == c or c in part or part in c for c in candidates): + if any(_name_matches(part, c) for c in candidates): return room["name"] # Priority 2: filename matches room name for room in rooms: - if room["name"].lower() in filename or filename in room["name"].lower(): + if _name_matches(filename, room["name"]): return room["name"] # Priority 3: keyword scoring from room keywords + name diff --git a/tests/test_miner.py b/tests/test_miner.py index 10dd33d..ab053d7 100644 --- a/tests/test_miner.py +++ b/tests/test_miner.py @@ -7,7 +7,7 @@ from pathlib import Path import chromadb import yaml -from mempalace.miner import load_config, mine, scan_project, status +from mempalace.miner import detect_room, load_config, mine, scan_project, status from mempalace.palace import NORMALIZE_VERSION, file_already_mined @@ -491,6 +491,103 @@ def test_file_already_mined_returns_false_for_stale_normalize_version(): shutil.rmtree(tmpdir, ignore_errors=True) +def test_detect_room_uses_token_boundary_matching(tmp_path): + """Path-part routing must not fire on incidental substrings. + + Regression: "views" is a substring of "interviews", so the old + substring check routed every file under views/ into a room keyed + by "interviews". Token-boundary matching prevents this while still + matching real tokens like "frontend" in "frontend-app". 
+ """ + project = tmp_path + rooms = [ + {"name": "billing-page", "keywords": ["billing-page"]}, + {"name": "interviews", "keywords": ["interviews"]}, + {"name": "general", "keywords": []}, + ] + + # views/billing-page/... must NOT route to "interviews" on the "views"⊂"interviews" accident + view_file = project / "views" / "billing-page" / "Foo.test.tsx" + view_file.parent.mkdir(parents=True) + view_file.write_text("content") + assert detect_room(view_file, "content", rooms, project) == "billing-page" + + # data/interviews/... must route to "interviews" via the real token + data_file = project / "data" / "interviews" / "index.ts" + data_file.parent.mkdir(parents=True) + data_file.write_text("content") + assert detect_room(data_file, "content", rooms, project) == "interviews" + + +def test_detect_room_preserves_token_matches(tmp_path): + """Real separator-bounded tokens still match in both directions.""" + project = tmp_path + rooms = [ + {"name": "frontend", "keywords": ["frontend"]}, + {"name": "general", "keywords": []}, + ] + + # path part contains keyword as a token + f1 = project / "frontend-app" / "main.ts" + f1.parent.mkdir(parents=True) + f1.write_text("x") + assert detect_room(f1, "x", rooms, project) == "frontend" + + # keyword contains path part as a token (reverse direction) + rooms2 = [ + {"name": "data-retention", "keywords": ["data-retention"]}, + {"name": "general", "keywords": []}, + ] + f2 = project / "data" / "data-retention" / "policy.ts" + f2.parent.mkdir(parents=True) + f2.write_text("x") + assert detect_room(f2, "x", rooms2, project) == "data-retention" + + +def test_detect_room_matches_keyword_distinct_from_name(tmp_path): + """Regression: PR #145 — path part must match a keyword even when the + room name itself doesn't contain the path part as a token. + + Scenario: a folder named ``docs/`` should route to a room named + ``documentation`` that declares ``"docs"`` as a keyword. 
+ """ + project = tmp_path + rooms = [ + {"name": "documentation", "keywords": ["docs"]}, + {"name": "general", "keywords": []}, + ] + + f = project / "docs" / "readme.md" + f.parent.mkdir(parents=True) + f.write_text("x") + assert detect_room(f, "x", rooms, project) == "documentation" + + +def test_detect_room_filename_match_uses_token_boundary(tmp_path): + """Priority 2 (filename match) must also use token-boundary rules.""" + project = tmp_path + rooms = [ + {"name": "review", "keywords": []}, + {"name": "general", "keywords": []}, + ] + + # "review" is a substring of "reviewmodule" but not a token — should NOT match + f1 = project / "reviewmodule.ts" + f1.write_text("x") + assert detect_room(f1, "x", rooms, project) != "review" + + # "review" IS a token of "review-page" — should match + f2 = project / "review-page.ts" + f2.write_text("x") + assert detect_room(f2, "x", rooms, project) == "review" + + # Dotted filename stems like "Foo.test" split on "." too + rooms3 = [{"name": "foo", "keywords": []}, {"name": "general", "keywords": []}] + f3 = project / "foo.test.ts" + f3.write_text("x") + assert detect_room(f3, "x", rooms3, project) == "foo" + + def test_add_drawer_stamps_normalize_version(tmp_path): """Fresh drawers carry the current schema version so future upgrades work.""" from mempalace.miner import add_drawer