Optimize entity detection with regex caching and pre-compilation

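- Use functools.lru_cache to cache the compiled patterns built for each entity name.
- Pre-compile the static pronoun patterns into a single case-insensitive regex (PRONOUN_RE).
- Remove the redundant .lower() call on the window text in the score_entity loop; PRONOUN_RE is compiled with re.IGNORECASE instead.

Co-authored-by: igorls <4753812+igorls@users.noreply.github.com>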
2026-04-13 21:35:53 +00:00
parent 95a8d7176a
commit d886a62d8a
1 changed file with 7 additions and 5 deletions
@@ -17,6 +17,7 @@ Usage:

 import re
 import os
+import functools
 from pathlib import Path
 from collections import defaultdict

@@ -60,6 +61,8 @@ PRONOUN_PATTERNS = [
    r"\btheir\b",
 ]

+PRONOUN_RE = re.compile("|".join(PRONOUN_PATTERNS), re.IGNORECASE)
+
 # Person signals — dialogue markers
 DIALOGUE_PATTERNS = [
    r"^>\s*{name}[:\s]",  # > Speaker: ...
@@ -466,6 +469,7 @@ def extract_candidates(text: str) -> dict:
 # ==================== SIGNAL SCORING ====================


+@functools.lru_cache(maxsize=128)
 def _build_patterns(name: str) -> dict:
    """Pre-compile all regex patterns for a single entity name."""
    n = re.escape(name)
@@ -515,11 +519,9 @@ def score_entity(name: str, text: str, lines: list) -> dict:
    name_line_indices = [i for i, line in enumerate(lines) if name_lower in line.lower()]
    pronoun_hits = 0
    for idx in name_line_indices:
-        window_text = " ".join(lines[max(0, idx - 2) : idx + 3]).lower()
-        for pronoun_pattern in PRONOUN_PATTERNS:
-            if re.search(pronoun_pattern, window_text):
-                pronoun_hits += 1
-                break
+        window_text = " ".join(lines[max(0, idx - 2) : idx + 3])
+        if PRONOUN_RE.search(window_text):
+            pronoun_hits += 1
    if pronoun_hits > 0:
        person_score += pronoun_hits * 2
        person_signals.append(f"pronoun nearby ({pronoun_hits}x)")