fix(entity): reduce noise in regex-based detection
The pattern-matching detector had several systematic false positives that
crowded the init review with nonsense. Concrete fixes:
- CamelCase extraction: add `[A-Z][a-z]+(?:[A-Z][a-z]+|[A-Z]{2,})+` to
candidate patterns so `MemPalace`, `ChromaDB`, `OpenAI`, `ChatGPT` are
visible. Previously `MemPalace` fragmented into `Mem` + `Palace`.
- The bare-colon dialogue pattern `^NAME:\s` now requires >=2 matches to
count as a dialogue signal. A single metadata line like
`Created: 2026-04-21` was scoring as dialogue and classifying `Created`
as a person.
- Versioned/hyphenated pattern tightened to `\b{name}[-_]v?\d+(?:\.\d+)*\b`
(version-only). The previous `\b{name}[-v]\w+` matched `context-manager`,
`multi-word`, etc. - every hyphenated compound.
- Skip LICENSE/COPYING/NOTICE/AUTHORS/PATENTS files during scan. They
produce pure-English-prose noise (`Contributor`, `Software`, `Covered`,
`Before`).
- Extra SKIP_DIRS: `.terraform`, `vendor`, `target`.
- Expand stopword list with capitalized participles/descriptors that
commonly appear at sentence start: `created`, `updated`, `extracted`,
`processed`, `total`, `summary`, `auto`, `multi`, `hybrid`, `context`,
`bridge`, `batch`, `local`, `native`, `never`, `before`, `after`, etc.
- classify_entity: a strong pronoun signal alone (a single signal category)
now classifies as person. A diary's main character gets referenced with
pronouns, not dialogue markers, so the old requirement of two signal
categories demoted `Lu` (16 pronoun hits across 30 mentions) to uncertain.
The new gate is `pronoun_hits >= 5 AND pronoun_hits / frequency >= 0.2`, so
common sentence-start words (`Never`, `Before`) with incidental pronoun
proximity stay uncertain.
This commit is contained in:
@@ -113,6 +113,23 @@ SKIP_DIRS = {
|
||||
".next",
|
||||
"coverage",
|
||||
".mempalace",
|
||||
".terraform",
|
||||
"vendor",
|
||||
"target",
|
||||
}
|
||||
|
||||
# Files whose content is boilerplate prose — poisons entity detection.
|
||||
# Matched by stem (case-insensitive), with or without an extension.
|
||||
SKIP_FILENAMES = {
    # Licence texts (both spellings) and related legal boilerplate.
    "licence",
    "license",
    "copying",
    "copyright",
    "patents",
    # Attribution and notice files.
    "authors",
    "notice",
    "third-party-notices",
    "third_party_notices",
}
|
||||
|
||||
|
||||
@@ -193,7 +210,7 @@ def _build_patterns(name: str, languages: tuple = ("en",)) -> dict:
|
||||
"person_verbs": _compile_each(sources["person_verb_patterns"]),
|
||||
"project_verbs": _compile_each(sources["project_verb_patterns"]),
|
||||
"direct": direct_compiled,
|
||||
"versioned": re.compile(rf"\b{n}[-v]\w+", re.IGNORECASE),
|
||||
"versioned": re.compile(rf"\b{n}[-_]v?\d+(?:\.\d+)*\b", re.IGNORECASE),
|
||||
"code_ref": re.compile(rf"\b{n}\.(py|js|ts|yaml|yml|json|sh)\b", re.IGNORECASE),
|
||||
}
|
||||
|
||||
@@ -227,12 +244,19 @@ def score_entity(name: str, text: str, lines: list, languages=("en",)) -> dict:
|
||||
|
||||
# --- Person signals ---
|
||||
|
||||
# Dialogue markers (strong signal)
|
||||
# Dialogue markers (strong signal).
|
||||
# The bare `^NAME:\s` colon-prefix pattern matches metadata lines like
|
||||
# `Created: 2026-04-21`, so we require >= 2 hits for it to count as dialogue
|
||||
# (real speaker markers repeat; single-line metadata doesn't).
|
||||
for rx in patterns["dialogue"]:
|
||||
matches = len(rx.findall(text))
|
||||
if matches > 0:
|
||||
person_score += matches * 3
|
||||
person_signals.append(f"dialogue marker ({matches}x)")
|
||||
if matches == 0:
|
||||
continue
|
||||
is_bare_colon = rx.pattern.endswith(r":\s") and not rx.pattern.endswith(r"[:\s]")
|
||||
if is_bare_colon and matches < 2:
|
||||
continue
|
||||
person_score += matches * 3
|
||||
person_signals.append(f"dialogue marker ({matches}x)")
|
||||
|
||||
# Person verbs
|
||||
for rx in patterns["person_verbs"]:
|
||||
@@ -328,17 +352,28 @@ def classify_entity(name: str, frequency: int, scores: dict) -> dict:
|
||||
signal_categories.add("addressed")
|
||||
|
||||
has_two_signal_types = len(signal_categories) >= 2
|
||||
_ = signal_categories - {"pronoun"} # reserved for future thresholds
|
||||
# Single-category pronoun signal still classifies as person when the
|
||||
# evidence is overwhelming — a diary's main character is referenced
|
||||
# with pronouns, not dialogue markers. Require both: many pronoun hits
|
||||
# AND a high pronoun-to-frequency ratio so common sentence-start words
|
||||
# (Never, Before, etc.) with incidental pronoun proximity don't qualify.
|
||||
pronoun_hits = 0
|
||||
for s in scores["person_signals"]:
|
||||
m = re.search(r"pronoun nearby \((\d+)x\)", s)
|
||||
if m:
|
||||
pronoun_hits = int(m.group(1))
|
||||
break
|
||||
strong_pronoun_signal = pronoun_hits >= 5 and frequency > 0 and pronoun_hits / frequency >= 0.2
|
||||
|
||||
if person_ratio >= 0.7 and has_two_signal_types and ps >= 5:
|
||||
if person_ratio >= 0.7 and (has_two_signal_types and ps >= 5 or strong_pronoun_signal):
|
||||
entity_type = "person"
|
||||
confidence = min(0.99, 0.5 + person_ratio * 0.5)
|
||||
signals = scores["person_signals"] or [f"appears {frequency}x"]
|
||||
elif person_ratio >= 0.7 and (not has_two_signal_types or ps < 5):
|
||||
# Pronoun-only match — downgrade to uncertain
|
||||
elif person_ratio >= 0.7:
|
||||
# Weak single-category person signal — downgrade to uncertain
|
||||
entity_type = "uncertain"
|
||||
confidence = 0.4
|
||||
signals = scores["person_signals"] + [f"appears {frequency}x — pronoun-only match"]
|
||||
signals = scores["person_signals"] + [f"appears {frequency}x — weak person signal"]
|
||||
elif person_ratio <= 0.3:
|
||||
entity_type = "project"
|
||||
confidence = min(0.99, 0.5 + (1 - person_ratio) * 0.5)
|
||||
@@ -560,6 +595,8 @@ def scan_for_detection(project_dir: str, max_files: int = 10) -> list:
|
||||
dirs[:] = [d for d in dirs if d not in SKIP_DIRS]
|
||||
for filename in filenames:
|
||||
filepath = Path(root) / filename
|
||||
if filepath.stem.lower() in SKIP_FILENAMES:
|
||||
continue
|
||||
ext = filepath.suffix.lower()
|
||||
if ext in PROSE_EXTENSIONS:
|
||||
prose_files.append(filepath)
|
||||
|
||||
+12
-2
@@ -42,7 +42,7 @@
|
||||
"action_pattern": "(?:built|fixed|wrote|added|pushed|measured|tested|reviewed|created|deleted|updated|configured|deployed|migrated)\\s+[\\w\\s]{3,30}"
|
||||
},
|
||||
"entity": {
|
||||
"candidate_pattern": "[A-Z][a-z]{1,19}",
|
||||
"candidate_pattern": "[A-Z][a-z]+(?:[A-Z][a-z]+|[A-Z]{2,})+|[A-Z][a-z]{1,19}",
|
||||
"multi_word_pattern": "[A-Z][a-z]+(?:\\s+[A-Z][a-z]+)+",
|
||||
"person_verb_patterns": [
|
||||
"\\b{name}\\s+said\\b",
|
||||
@@ -140,7 +140,17 @@
|
||||
"agents", "tools", "others", "guards", "ethics", "regulation",
|
||||
"learning", "thinking", "memory", "language", "intelligence",
|
||||
"technology", "society", "culture", "future", "history", "science",
|
||||
"model", "models", "network", "networks", "training", "inference"
|
||||
"model", "models", "network", "networks", "training", "inference",
|
||||
"created", "updated", "deleted", "added", "removed", "modified",
|
||||
"extracted", "processed", "generated", "compiled", "launched", "installed",
|
||||
"deployed", "executed", "loaded", "parsed", "validated", "configured",
|
||||
"total", "summary", "covered", "included", "pending", "failed", "success",
|
||||
"ready", "active", "disabled", "enabled", "available", "completed",
|
||||
"auto", "multi", "mini", "micro", "meta", "super", "hybrid",
|
||||
"context", "bridge", "batch", "local", "global", "native", "cloud",
|
||||
"before", "after", "during", "often", "always", "never",
|
||||
"project", "contributor", "software",
|
||||
"backend", "frontend", "server", "client", "service", "app", "api"
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
@@ -148,6 +148,33 @@ def test_classify_entity_pronoun_only_is_uncertain():
|
||||
assert result["type"] == "uncertain"
|
||||
|
||||
|
||||
def test_classify_entity_high_pronoun_signal_is_person():
    """Overwhelming pronoun evidence alone must classify as a person.

    Based on real data: `Lu`, a diary's main character, has 16 pronoun hits
    out of 30 mentions and no other person-signal category.
    """
    pronoun_heavy_scores = dict(
        person_score=32,
        project_score=0,
        person_signals=["pronoun nearby (16x)"],
        project_signals=[],
    )
    classification = classify_entity("Lu", 30, pronoun_heavy_scores)
    assert classification["type"] == "person"
|
||||
|
||||
|
||||
def test_classify_entity_low_pronoun_proximity_is_uncertain():
    """Incidental pronoun proximity must not promote a word to a person.

    Common sentence-start words (Never, Before) pick up a few nearby pronouns
    by chance; here 2 hits over 21 mentions keeps the ratio under 20%, so the
    entity stays uncertain.
    """
    incidental_scores = dict(
        person_score=4,
        project_score=0,
        person_signals=["pronoun nearby (2x)"],
        project_signals=[],
    )
    classification = classify_entity("Never", 21, incidental_scores)
    assert classification["type"] == "uncertain"
|
||||
|
||||
|
||||
def test_classify_entity_mixed_signals():
|
||||
scores = {
|
||||
"person_score": 5,
|
||||
|
||||
Reference in New Issue
Block a user