fix(entity): reduce noise in regex-based detection

The pattern-matching detector had several systematic false positives that crowded the init review with nonsense. Concrete fixes: - CamelCase extraction: add `[A-Z][a-z]+(?:[A-Z][a-z]+|[A-Z]{2,})+` to candidate patterns so `MemPalace`, `ChromaDB`, `OpenAI`, `ChatGPT` are extracted as single candidates. Previously `MemPalace` fragmented into `Mem` + `Palace`. - The bare dialogue pattern `^NAME:\s` now requires >= 2 matches to count. A single metadata line like `Created: 2026-04-21` was scoring as dialogue and classifying `Created` as a person. - Versioned/hyphenated pattern tightened to `\b{name}[-_]v?\d+(?:\.\d+)*\b` (version-only). The previous `\b{name}[-v]\w+` matched `context-manager`, `multi-word`, etc. — i.e. every hyphenated compound. - Skip LICENSE/COPYING/NOTICE/AUTHORS/PATENTS files (plus the LICENCE, COPYRIGHT and third-party-notices variants) during scan. They produce pure-English-prose noise (`Contributor`, `Software`, `Covered`, `Before`). - Extra SKIP_DIRS: `.terraform`, `vendor`, `target`. - Expand stopword list with capitalized participles/descriptors that commonly appear at sentence start: `created`, `updated`, `extracted`, `processed`, `total`, `summary`, `auto`, `multi`, `hybrid`, `context`, `bridge`, `batch`, `local`, `native`, `never`, `before`, `after`, etc. - classify_entity: a strong pronoun signal on its own (a single signal category) now classifies as person. A diary's main character gets referenced with pronouns, not dialogue markers — requiring two signal categories demoted `Lu` (16 pronoun hits across 30 mentions) to uncertain. Gate on `pronoun_hits >= 5 AND pronoun_hits / frequency >= 0.2` so common sentence-start words (`Never`, `Before`) with incidental pronoun proximity stay uncertain.
2026-04-24 00:20:32 -03:00
parent 6d252a0de4
commit 6aebf458ff
3 changed files with 86 additions and 12 deletions
@@ -113,6 +113,23 @@ SKIP_DIRS = {
    ".next",
    "coverage",
    ".mempalace",
+    ".terraform",
+    "vendor",
+    "target",
+}
+
+# Files whose content is boilerplate prose — poisons entity detection.
+# Matched by stem (case-insensitive), with or without an extension.
+SKIP_FILENAMES = {
+    "license",
+    "licence",
+    "copying",
+    "copyright",
+    "notice",
+    "authors",
+    "patents",
+    "third_party_notices",
+    "third-party-notices",
 }


@@ -193,7 +210,7 @@ def _build_patterns(name: str, languages: tuple = ("en",)) -> dict:
        "person_verbs": _compile_each(sources["person_verb_patterns"]),
        "project_verbs": _compile_each(sources["project_verb_patterns"]),
        "direct": direct_compiled,
-        "versioned": re.compile(rf"\b{n}[-v]\w+", re.IGNORECASE),
+        "versioned": re.compile(rf"\b{n}[-_]v?\d+(?:\.\d+)*\b", re.IGNORECASE),
        "code_ref": re.compile(rf"\b{n}\.(py|js|ts|yaml|yml|json|sh)\b", re.IGNORECASE),
    }

@@ -227,12 +244,19 @@ def score_entity(name: str, text: str, lines: list, languages=("en",)) -> dict:

    # --- Person signals ---

-    # Dialogue markers (strong signal)
+    # Dialogue markers (strong signal).
+    # The bare `^NAME:\s` colon-prefix pattern matches metadata lines like
+    # `Created: 2026-04-21`, so we require >= 2 hits for it to count as dialogue
+    # (real speaker markers repeat; single-line metadata doesn't).
    for rx in patterns["dialogue"]:
        matches = len(rx.findall(text))
-        if matches > 0:
-            person_score += matches * 3
-            person_signals.append(f"dialogue marker ({matches}x)")
+        if matches == 0:
+            continue
+        is_bare_colon = rx.pattern.endswith(r":\s") and not rx.pattern.endswith(r"[:\s]")
+        if is_bare_colon and matches < 2:
+            continue
+        person_score += matches * 3
+        person_signals.append(f"dialogue marker ({matches}x)")

    # Person verbs
    for rx in patterns["person_verbs"]:
@@ -328,17 +352,28 @@ def classify_entity(name: str, frequency: int, scores: dict) -> dict:
            signal_categories.add("addressed")

    has_two_signal_types = len(signal_categories) >= 2
-    _ = signal_categories - {"pronoun"}  # reserved for future thresholds
+    # Single-category pronoun signal still classifies as person when the
+    # evidence is overwhelming — a diary's main character is referenced
+    # with pronouns, not dialogue markers. Require both: many pronoun hits
+    # AND a high pronoun-to-frequency ratio so common sentence-start words
+    # (Never, Before, etc.) with incidental pronoun proximity don't qualify.
+    pronoun_hits = 0
+    for s in scores["person_signals"]:
+        m = re.search(r"pronoun nearby \((\d+)x\)", s)
+        if m:
+            pronoun_hits = int(m.group(1))
+            break
+    strong_pronoun_signal = pronoun_hits >= 5 and frequency > 0 and pronoun_hits / frequency >= 0.2

-    if person_ratio >= 0.7 and has_two_signal_types and ps >= 5:
+    if person_ratio >= 0.7 and (has_two_signal_types and ps >= 5 or strong_pronoun_signal):
        entity_type = "person"
        confidence = min(0.99, 0.5 + person_ratio * 0.5)
        signals = scores["person_signals"] or [f"appears {frequency}x"]
-    elif person_ratio >= 0.7 and (not has_two_signal_types or ps < 5):
-        # Pronoun-only match — downgrade to uncertain
+    elif person_ratio >= 0.7:
+        # Weak single-category person signal — downgrade to uncertain
        entity_type = "uncertain"
        confidence = 0.4
-        signals = scores["person_signals"] + [f"appears {frequency}x — pronoun-only match"]
+        signals = scores["person_signals"] + [f"appears {frequency}x — weak person signal"]
    elif person_ratio <= 0.3:
        entity_type = "project"
        confidence = min(0.99, 0.5 + (1 - person_ratio) * 0.5)
@@ -560,6 +595,8 @@ def scan_for_detection(project_dir: str, max_files: int = 10) -> list:
        dirs[:] = [d for d in dirs if d not in SKIP_DIRS]
        for filename in filenames:
            filepath = Path(root) / filename
+            if filepath.stem.lower() in SKIP_FILENAMES:
+                continue
            ext = filepath.suffix.lower()
            if ext in PROSE_EXTENSIONS:
                prose_files.append(filepath)
@@ -42,7 +42,7 @@
    "action_pattern": "(?:built|fixed|wrote|added|pushed|measured|tested|reviewed|created|deleted|updated|configured|deployed|migrated)\\s+[\\w\\s]{3,30}"
  },
  "entity": {
-    "candidate_pattern": "[A-Z][a-z]{1,19}",
+    "candidate_pattern": "[A-Z][a-z]+(?:[A-Z][a-z]+|[A-Z]{2,})+|[A-Z][a-z]{1,19}",
    "multi_word_pattern": "[A-Z][a-z]+(?:\\s+[A-Z][a-z]+)+",
    "person_verb_patterns": [
      "\\b{name}\\s+said\\b",
@@ -140,7 +140,17 @@
      "agents", "tools", "others", "guards", "ethics", "regulation",
      "learning", "thinking", "memory", "language", "intelligence",
      "technology", "society", "culture", "future", "history", "science",
-      "model", "models", "network", "networks", "training", "inference"
+      "model", "models", "network", "networks", "training", "inference",
+      "created", "updated", "deleted", "added", "removed", "modified",
+      "extracted", "processed", "generated", "compiled", "launched", "installed",
+      "deployed", "executed", "loaded", "parsed", "validated", "configured",
+      "total", "summary", "covered", "included", "pending", "failed", "success",
+      "ready", "active", "disabled", "enabled", "available", "completed",
+      "auto", "multi", "mini", "micro", "meta", "super", "hybrid",
+      "context", "bridge", "batch", "local", "global", "native", "cloud",
+      "before", "after", "during", "often", "always", "never",
+      "project", "contributor", "software",
+      "backend", "frontend", "server", "client", "service", "app", "api"
    ]
  }
 }
@@ -148,6 +148,33 @@ def test_classify_entity_pronoun_only_is_uncertain():
    assert result["type"] == "uncertain"


+def test_classify_entity_high_pronoun_signal_is_person():
+    """A diary's main character hit by many pronouns should still classify
+    as a person even with only the pronoun signal category. Example from
+    real data: `Lu` has 16 pronoun hits out of 30 mentions."""
+    scores = {
+        "person_score": 32,
+        "project_score": 0,
+        "person_signals": ["pronoun nearby (16x)"],
+        "project_signals": [],
+    }
+    result = classify_entity("Lu", 30, scores)
+    assert result["type"] == "person"
+
+
+def test_classify_entity_low_pronoun_proximity_is_uncertain():
+    """Common sentence-start words (Never, Before) get a few pronouns nearby
+    incidentally. The ratio stays low (<20%), so they stay uncertain."""
+    scores = {
+        "person_score": 4,
+        "project_score": 0,
+        "person_signals": ["pronoun nearby (2x)"],
+        "project_signals": [],
+    }
+    result = classify_entity("Never", 21, scores)
+    assert result["type"] == "uncertain"
+
+
 def test_classify_entity_mixed_signals():
    scores = {
        "person_score": 5,