Optimize entity detection with regex caching and pre-compilation
- Use functools.lru_cache to cache compiled patterns for entity names.
- Pre-compile static pronoun patterns into a single regex.
- Remove redundant .lower() calls in score_entity loop.

Co-authored-by: igorls <4753812+igorls@users.noreply.github.com>
This commit is contained in:
@@ -17,6 +17,7 @@ Usage:
|
|||||||
|
|
||||||
import re
|
import re
|
||||||
import os
|
import os
|
||||||
|
import functools
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
|
|
||||||
@@ -60,6 +61,8 @@ PRONOUN_PATTERNS = [
|
|||||||
r"\btheir\b",
|
r"\btheir\b",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
PRONOUN_RE = re.compile("|".join(PRONOUN_PATTERNS), re.IGNORECASE)
|
||||||
|
|
||||||
# Person signals — dialogue markers
|
# Person signals — dialogue markers
|
||||||
DIALOGUE_PATTERNS = [
|
DIALOGUE_PATTERNS = [
|
||||||
r"^>\s*{name}[:\s]", # > Speaker: ...
|
r"^>\s*{name}[:\s]", # > Speaker: ...
|
||||||
@@ -466,6 +469,7 @@ def extract_candidates(text: str) -> dict:
|
|||||||
# ==================== SIGNAL SCORING ====================
|
# ==================== SIGNAL SCORING ====================
|
||||||
|
|
||||||
|
|
||||||
|
@functools.lru_cache(maxsize=128)
|
||||||
def _build_patterns(name: str) -> dict:
|
def _build_patterns(name: str) -> dict:
|
||||||
"""Pre-compile all regex patterns for a single entity name."""
|
"""Pre-compile all regex patterns for a single entity name."""
|
||||||
n = re.escape(name)
|
n = re.escape(name)
|
||||||
@@ -515,11 +519,9 @@ def score_entity(name: str, text: str, lines: list) -> dict:
|
|||||||
name_line_indices = [i for i, line in enumerate(lines) if name_lower in line.lower()]
|
name_line_indices = [i for i, line in enumerate(lines) if name_lower in line.lower()]
|
||||||
pronoun_hits = 0
|
pronoun_hits = 0
|
||||||
for idx in name_line_indices:
|
for idx in name_line_indices:
|
||||||
window_text = " ".join(lines[max(0, idx - 2) : idx + 3]).lower()
|
window_text = " ".join(lines[max(0, idx - 2) : idx + 3])
|
||||||
for pronoun_pattern in PRONOUN_PATTERNS:
|
if PRONOUN_RE.search(window_text):
|
||||||
if re.search(pronoun_pattern, window_text):
|
pronoun_hits += 1
|
||||||
pronoun_hits += 1
|
|
||||||
break
|
|
||||||
if pronoun_hits > 0:
|
if pronoun_hits > 0:
|
||||||
person_score += pronoun_hits * 2
|
person_score += pronoun_hits * 2
|
||||||
person_signals.append(f"pronoun nearby ({pronoun_hits}x)")
|
person_signals.append(f"pronoun nearby ({pronoun_hits}x)")
|
||||||
|
|||||||
Reference in New Issue
Block a user