From 6aebf458ff67793fa75cc803697f37ccbd4c9a14 Mon Sep 17 00:00:00 2001 From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com> Date: Fri, 24 Apr 2026 00:20:32 -0300 Subject: [PATCH] fix(entity): reduce noise in regex-based detection The pattern-matching detector had several systematic false positives that crowded the init review with nonsense. Concrete fixes: - CamelCase extraction: add `[A-Z][a-z]+(?:[A-Z][a-z]+|[A-Z]{2,})+` to candidate patterns so `MemPalace`, `ChromaDB`, `OpenAI`, `ChatGPT` are visible. Previously `MemPalace` fragmented into `Mem` + `Palace`. - Dialogue `^NAME:\s` requires >=2 matches to count. A single metadata line like `Created: 2026-04-21` was scoring as dialogue and classifying `Created` as a person. - Versioned/hyphenated pattern tightened to `\b{name}[-_]v?\d+(?:\.\d+)*\b` (version-only). The previous `\b{name}[-v]\w+` matched `context-manager`, `multi-word`, etc. — every hyphenated compound. - Skip LICENSE/COPYING/NOTICE/AUTHORS/PATENTS files during scan. They produce pure-English-prose noise (`Contributor`, `Software`, `Covered`, `Before`). - Extra SKIP_DIRS: `.terraform`, `vendor`, `target`. - Expand stopword list with capitalized participles/descriptors that commonly appear at sentence start: `created`, `updated`, `extracted`, `processed`, `total`, `summary`, `auto`, `multi`, `hybrid`, `context`, `bridge`, `batch`, `local`, `native`, `never`, `before`, `after`, etc. - classify_entity: high-pronoun single-category signal now classifies as person. A diary's main character gets referenced with pronouns, not dialogue markers — requiring two signal categories demoted `Lu` (16 pronoun hits across 30 mentions) to uncertain. Gate on `pronoun_hits >= 5 AND pronoun_hits / frequency >= 0.2` so common sentence-start words (`Never`, `Before`) with incidental proximity stay uncertain. 
--- mempalace/entity_detector.py | 57 +++++++++++++++++++++++++++++------ mempalace/i18n/en.json | 14 +++++++-- tests/test_entity_detector.py | 27 +++++++++++++++++ 3 files changed, 86 insertions(+), 12 deletions(-) diff --git a/mempalace/entity_detector.py b/mempalace/entity_detector.py index 754c65d..2f2aae4 100644 --- a/mempalace/entity_detector.py +++ b/mempalace/entity_detector.py @@ -113,6 +113,23 @@ SKIP_DIRS = { ".next", "coverage", ".mempalace", + ".terraform", + "vendor", + "target", +} + +# Files whose content is boilerplate prose — poisons entity detection. +# Matched by stem (case-insensitive), with or without an extension. +SKIP_FILENAMES = { + "license", + "licence", + "copying", + "copyright", + "notice", + "authors", + "patents", + "third_party_notices", + "third-party-notices", } @@ -193,7 +210,7 @@ def _build_patterns(name: str, languages: tuple = ("en",)) -> dict: "person_verbs": _compile_each(sources["person_verb_patterns"]), "project_verbs": _compile_each(sources["project_verb_patterns"]), "direct": direct_compiled, - "versioned": re.compile(rf"\b{n}[-v]\w+", re.IGNORECASE), + "versioned": re.compile(rf"\b{n}[-_]v?\d+(?:\.\d+)*\b", re.IGNORECASE), "code_ref": re.compile(rf"\b{n}\.(py|js|ts|yaml|yml|json|sh)\b", re.IGNORECASE), } @@ -227,12 +244,19 @@ def score_entity(name: str, text: str, lines: list, languages=("en",)) -> dict: # --- Person signals --- - # Dialogue markers (strong signal) + # Dialogue markers (strong signal). + # The bare `^NAME:\s` colon-prefix pattern matches metadata lines like + # `Created: 2026-04-21`, so we require >= 2 hits for it to count as dialogue + # (real speaker markers repeat; single-line metadata doesn't). 
for rx in patterns["dialogue"]: matches = len(rx.findall(text)) - if matches > 0: - person_score += matches * 3 - person_signals.append(f"dialogue marker ({matches}x)") + if matches == 0: + continue + is_bare_colon = rx.pattern.endswith(r":\s") and not rx.pattern.endswith(r"[:\s]") + if is_bare_colon and matches < 2: + continue + person_score += matches * 3 + person_signals.append(f"dialogue marker ({matches}x)") # Person verbs for rx in patterns["person_verbs"]: @@ -328,17 +352,28 @@ def classify_entity(name: str, frequency: int, scores: dict) -> dict: signal_categories.add("addressed") has_two_signal_types = len(signal_categories) >= 2 - _ = signal_categories - {"pronoun"} # reserved for future thresholds + # Single-category pronoun signal still classifies as person when the + # evidence is overwhelming — a diary's main character is referenced + # with pronouns, not dialogue markers. Require both: many pronoun hits + # AND a high pronoun-to-frequency ratio so common sentence-start words + # (Never, Before, etc.) with incidental pronoun proximity don't qualify. 
+ pronoun_hits = 0 + for s in scores["person_signals"]: + m = re.search(r"pronoun nearby \((\d+)x\)", s) + if m: + pronoun_hits = int(m.group(1)) + break + strong_pronoun_signal = pronoun_hits >= 5 and frequency > 0 and pronoun_hits / frequency >= 0.2 - if person_ratio >= 0.7 and has_two_signal_types and ps >= 5: + if person_ratio >= 0.7 and (has_two_signal_types and ps >= 5 or strong_pronoun_signal): entity_type = "person" confidence = min(0.99, 0.5 + person_ratio * 0.5) signals = scores["person_signals"] or [f"appears {frequency}x"] - elif person_ratio >= 0.7 and (not has_two_signal_types or ps < 5): - # Pronoun-only match — downgrade to uncertain + elif person_ratio >= 0.7: + # Weak single-category person signal — downgrade to uncertain entity_type = "uncertain" confidence = 0.4 - signals = scores["person_signals"] + [f"appears {frequency}x — pronoun-only match"] + signals = scores["person_signals"] + [f"appears {frequency}x — weak person signal"] elif person_ratio <= 0.3: entity_type = "project" confidence = min(0.99, 0.5 + (1 - person_ratio) * 0.5) @@ -560,6 +595,8 @@ def scan_for_detection(project_dir: str, max_files: int = 10) -> list: dirs[:] = [d for d in dirs if d not in SKIP_DIRS] for filename in filenames: filepath = Path(root) / filename + if filepath.stem.lower() in SKIP_FILENAMES: + continue ext = filepath.suffix.lower() if ext in PROSE_EXTENSIONS: prose_files.append(filepath) diff --git a/mempalace/i18n/en.json b/mempalace/i18n/en.json index 6a9dff9..39d9ac1 100644 --- a/mempalace/i18n/en.json +++ b/mempalace/i18n/en.json @@ -42,7 +42,7 @@ "action_pattern": "(?:built|fixed|wrote|added|pushed|measured|tested|reviewed|created|deleted|updated|configured|deployed|migrated)\\s+[\\w\\s]{3,30}" }, "entity": { - "candidate_pattern": "[A-Z][a-z]{1,19}", + "candidate_pattern": "[A-Z][a-z]+(?:[A-Z][a-z]+|[A-Z]{2,})+|[A-Z][a-z]{1,19}", "multi_word_pattern": "[A-Z][a-z]+(?:\\s+[A-Z][a-z]+)+", "person_verb_patterns": [ "\\b{name}\\s+said\\b", @@ -140,7 +140,17 @@ 
"agents", "tools", "others", "guards", "ethics", "regulation", "learning", "thinking", "memory", "language", "intelligence", "technology", "society", "culture", "future", "history", "science", - "model", "models", "network", "networks", "training", "inference" + "model", "models", "network", "networks", "training", "inference", + "created", "updated", "deleted", "added", "removed", "modified", + "extracted", "processed", "generated", "compiled", "launched", "installed", + "deployed", "executed", "loaded", "parsed", "validated", "configured", + "total", "summary", "covered", "included", "pending", "failed", "success", + "ready", "active", "disabled", "enabled", "available", "completed", + "auto", "multi", "mini", "micro", "meta", "super", "hybrid", + "context", "bridge", "batch", "local", "global", "native", "cloud", + "before", "after", "during", "often", "always", "never", + "project", "contributor", "software", + "backend", "frontend", "server", "client", "service", "app", "api" ] } } diff --git a/tests/test_entity_detector.py b/tests/test_entity_detector.py index f006270..afad4d7 100644 --- a/tests/test_entity_detector.py +++ b/tests/test_entity_detector.py @@ -148,6 +148,33 @@ def test_classify_entity_pronoun_only_is_uncertain(): assert result["type"] == "uncertain" +def test_classify_entity_high_pronoun_signal_is_person(): + """A diary's main character hit by many pronouns should still classify + as a person even with only the pronoun signal category. Example from + real data: `Lu` has 16 pronoun hits out of 30 mentions.""" + scores = { + "person_score": 32, + "project_score": 0, + "person_signals": ["pronoun nearby (16x)"], + "project_signals": [], + } + result = classify_entity("Lu", 30, scores) + assert result["type"] == "person" + + +def test_classify_entity_low_pronoun_proximity_is_uncertain(): + """Common sentence-start words (Never, Before) get a few pronouns nearby + incidentally. 
The ratio stays low (<20%), so they stay uncertain.""" + scores = { + "person_score": 4, + "project_score": 0, + "person_signals": ["pronoun nearby (2x)"], + "project_signals": [], + } + result = classify_entity("Never", 21, scores) + assert result["type"] == "uncertain" + + def test_classify_entity_mixed_signals(): scores = { "person_score": 5,