From 6aebf458ff67793fa75cc803697f37ccbd4c9a14 Mon Sep 17 00:00:00 2001
From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com>
Date: Fri, 24 Apr 2026 00:20:32 -0300
Subject: [PATCH] fix(entity): reduce noise in regex-based detection

The pattern-matching detector had several systematic false positives that
crowded the init review with nonsense. Concrete fixes:

- CamelCase extraction: add `[A-Z][a-z]+(?:[A-Z][a-z]+|[A-Z]{2,})+` to
  candidate patterns so `MemPalace`, `ChromaDB`, `OpenAI`, `ChatGPT` are
  visible. Previously `MemPalace` fragmented into `Mem` + `Palace`.
- Dialogue: the bare `^NAME:\s` colon-prefix pattern now needs >=2
  matches to count. A single metadata line like `Created: 2026-04-21`
  was scoring as dialogue and classifying `Created` as a person.
- Versioned/hyphenated pattern tightened to `\b{name}[-_]v?\d+(?:\.\d+)*\b`
  (version-only). The previous `\b{name}[-v]\w+` matched `context-manager`,
  `multi-word`, etc. - every hyphenated compound.
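- Skip LICENSE/LICENCE/COPYING/COPYRIGHT/NOTICE/AUTHORS/PATENTS and
  third-party-notices files during scan, matched case-insensitively by
  file stem (so `LICENSE`, `LICENSE.md`, `license.txt` all match). They
  produce pure-English-prose noise (`Contributor`, `Software`, `Covered`,
  `Before`).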
- Extra SKIP_DIRS: `.terraform`, `vendor`, `target`.
- Expand stopword list with capitalized participles/descriptors that
  commonly appear at sentence start: `created`, `updated`, `extracted`,
  `processed`, `total`, `summary`, `auto`, `multi`, `hybrid`, `context`,
  `bridge`, `batch`, `local`, `native`, `never`, `before`, `after`, etc.
- classify_entity: a strong pronoun-only (single-category) signal now
  classifies as person. A diary's main character gets referenced with
  pronouns, not dialogue markers, so requiring two signal categories
  demoted `Lu` (16 pronoun hits across 30 mentions) to uncertain. The
  new path is gated on `pronoun_hits >= 5 AND pronoun_hits / frequency
  >= 0.2` (on top of the existing `person_ratio >= 0.7` check) so common
  sentence-start words (`Never`, `Before`) with incidental pronoun
  proximity stay uncertain.
---
 mempalace/entity_detector.py  | 57 +++++++++++++++++++++++++++++------
 mempalace/i18n/en.json        | 14 +++++++--
 tests/test_entity_detector.py | 27 +++++++++++++++++
 3 files changed, 86 insertions(+), 12 deletions(-)

diff --git a/mempalace/entity_detector.py b/mempalace/entity_detector.py
index 754c65d..2f2aae4 100644
--- a/mempalace/entity_detector.py
+++ b/mempalace/entity_detector.py
@@ -113,6 +113,23 @@ SKIP_DIRS = {
     ".next",
     "coverage",
     ".mempalace",
+    ".terraform",
+    "vendor",
+    "target",
+}
+
+# Files whose content is boilerplate prose — poisons entity detection.
+# Matched by stem (case-insensitive), with or without an extension.
+SKIP_FILENAMES = {
+    "license",
+    "licence",
+    "copying",
+    "copyright",
+    "notice",
+    "authors",
+    "patents",
+    "third_party_notices",
+    "third-party-notices",
 }
 
 
@@ -193,7 +210,7 @@ def _build_patterns(name: str, languages: tuple = ("en",)) -> dict:
         "person_verbs": _compile_each(sources["person_verb_patterns"]),
         "project_verbs": _compile_each(sources["project_verb_patterns"]),
         "direct": direct_compiled,
-        "versioned": re.compile(rf"\b{n}[-v]\w+", re.IGNORECASE),
+        "versioned": re.compile(rf"\b{n}[-_]v?\d+(?:\.\d+)*\b", re.IGNORECASE),
         "code_ref": re.compile(rf"\b{n}\.(py|js|ts|yaml|yml|json|sh)\b", re.IGNORECASE),
     }
 
@@ -227,12 +244,19 @@ def score_entity(name: str, text: str, lines: list, languages=("en",)) -> dict:
 
     # --- Person signals ---
 
-    # Dialogue markers (strong signal)
+    # Dialogue markers (strong signal).
+    # The bare `^NAME:\s` colon-prefix pattern matches metadata lines like
+    # `Created: 2026-04-21`, so we require >= 2 hits for it to count as dialogue
+    # (real speaker markers repeat; single-line metadata doesn't).
     for rx in patterns["dialogue"]:
         matches = len(rx.findall(text))
-        if matches > 0:
-            person_score += matches * 3
-            person_signals.append(f"dialogue marker ({matches}x)")
+        if matches == 0:
+            continue
+        is_bare_colon = rx.pattern.endswith(r":\s") and not rx.pattern.endswith(r"[:\s]")
+        if is_bare_colon and matches < 2:
+            continue
+        person_score += matches * 3
+        person_signals.append(f"dialogue marker ({matches}x)")
 
     # Person verbs
     for rx in patterns["person_verbs"]:
@@ -328,17 +352,28 @@ def classify_entity(name: str, frequency: int, scores: dict) -> dict:
             signal_categories.add("addressed")
 
     has_two_signal_types = len(signal_categories) >= 2
-    _ = signal_categories - {"pronoun"}  # reserved for future thresholds
+    # Single-category pronoun signal still classifies as person when the
+    # evidence is overwhelming — a diary's main character is referenced
+    # with pronouns, not dialogue markers. Require both: many pronoun hits
+    # AND a high pronoun-to-frequency ratio so common sentence-start words
+    # (Never, Before, etc.) with incidental pronoun proximity don't qualify.
+    pronoun_hits = 0
+    for s in scores["person_signals"]:
+        m = re.search(r"pronoun nearby \((\d+)x\)", s)
+        if m:
+            pronoun_hits = int(m.group(1))
+            break
+    strong_pronoun_signal = pronoun_hits >= 5 and frequency > 0 and pronoun_hits / frequency >= 0.2
 
-    if person_ratio >= 0.7 and has_two_signal_types and ps >= 5:
+    if person_ratio >= 0.7 and (has_two_signal_types and ps >= 5 or strong_pronoun_signal):
         entity_type = "person"
         confidence = min(0.99, 0.5 + person_ratio * 0.5)
         signals = scores["person_signals"] or [f"appears {frequency}x"]
-    elif person_ratio >= 0.7 and (not has_two_signal_types or ps < 5):
-        # Pronoun-only match — downgrade to uncertain
+    elif person_ratio >= 0.7:
+        # Weak single-category person signal — downgrade to uncertain
         entity_type = "uncertain"
         confidence = 0.4
-        signals = scores["person_signals"] + [f"appears {frequency}x — pronoun-only match"]
+        signals = scores["person_signals"] + [f"appears {frequency}x — weak person signal"]
     elif person_ratio <= 0.3:
         entity_type = "project"
         confidence = min(0.99, 0.5 + (1 - person_ratio) * 0.5)
@@ -560,6 +595,8 @@ def scan_for_detection(project_dir: str, max_files: int = 10) -> list:
         dirs[:] = [d for d in dirs if d not in SKIP_DIRS]
         for filename in filenames:
             filepath = Path(root) / filename
+            if filepath.stem.lower() in SKIP_FILENAMES:
+                continue
             ext = filepath.suffix.lower()
             if ext in PROSE_EXTENSIONS:
                 prose_files.append(filepath)
diff --git a/mempalace/i18n/en.json b/mempalace/i18n/en.json
index 6a9dff9..39d9ac1 100644
--- a/mempalace/i18n/en.json
+++ b/mempalace/i18n/en.json
@@ -42,7 +42,7 @@
     "action_pattern": "(?:built|fixed|wrote|added|pushed|measured|tested|reviewed|created|deleted|updated|configured|deployed|migrated)\\s+[\\w\\s]{3,30}"
   },
   "entity": {
-    "candidate_pattern": "[A-Z][a-z]{1,19}",
+    "candidate_pattern": "[A-Z][a-z]+(?:[A-Z][a-z]+|[A-Z]{2,})+|[A-Z][a-z]{1,19}",
     "multi_word_pattern": "[A-Z][a-z]+(?:\\s+[A-Z][a-z]+)+",
     "person_verb_patterns": [
       "\\b{name}\\s+said\\b",
@@ -140,7 +140,17 @@
       "agents", "tools", "others", "guards", "ethics", "regulation",
       "learning", "thinking", "memory", "language", "intelligence",
       "technology", "society", "culture", "future", "history", "science",
-      "model", "models", "network", "networks", "training", "inference"
+      "model", "models", "network", "networks", "training", "inference",
+      "created", "updated", "deleted", "added", "removed", "modified",
+      "extracted", "processed", "generated", "compiled", "launched", "installed",
+      "deployed", "executed", "loaded", "parsed", "validated", "configured",
+      "total", "summary", "covered", "included", "pending", "failed", "success",
+      "ready", "active", "disabled", "enabled", "available", "completed",
+      "auto", "multi", "mini", "micro", "meta", "super", "hybrid",
+      "context", "bridge", "batch", "local", "global", "native", "cloud",
+      "before", "after", "during", "often", "always", "never",
+      "project", "contributor", "software",
+      "backend", "frontend", "server", "client", "service", "app", "api"
     ]
   }
 }
diff --git a/tests/test_entity_detector.py b/tests/test_entity_detector.py
index f006270..afad4d7 100644
--- a/tests/test_entity_detector.py
+++ b/tests/test_entity_detector.py
@@ -148,6 +148,33 @@ def test_classify_entity_pronoun_only_is_uncertain():
     assert result["type"] == "uncertain"
 
 
+def test_classify_entity_high_pronoun_signal_is_person():
+    """A diary's main character hit by many pronouns should still classify
+    as a person even with only the pronoun signal category. Example from
+    real data: `Lu` has 16 pronoun hits out of 30 mentions."""
+    scores = {
+        "person_score": 32,
+        "project_score": 0,
+        "person_signals": ["pronoun nearby (16x)"],
+        "project_signals": [],
+    }
+    result = classify_entity("Lu", 30, scores)
+    assert result["type"] == "person"
+
+
+def test_classify_entity_low_pronoun_proximity_is_uncertain():
+    """Common sentence-start words (Never, Before) get a few pronouns nearby
+    incidentally. The ratio stays low (<20%), so they stay uncertain."""
+    scores = {
+        "person_score": 4,
+        "project_score": 0,
+        "person_signals": ["pronoun nearby (2x)"],
+        "project_signals": [],
+    }
+    result = classify_entity("Never", 21, scores)
+    assert result["type"] == "uncertain"
+
+
 def test_classify_entity_mixed_signals():
     scores = {
         "person_score": 5,