fix: enforce atomic topics in closets, extract richer pointers
- upsert_closet replaced by upsert_closet_lines: checks each topic line individually against CLOSET_CHAR_LIMIT. If adding one line WHOLE would exceed the limit, it starts a new closet. Never splits mid-topic.
- build_closet_lines returns a list of atomic lines (not joined text)
- Richer extraction: section headers, more action verbs, up to 3 quotes, up to 12 topics per file
- Each line is complete: topic|entities|→drawer_refs

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
+5
-4
@@ -17,7 +17,7 @@ from collections import defaultdict
|
|||||||
|
|
||||||
from .palace import (
|
from .palace import (
|
||||||
SKIP_DIRS, get_collection, get_closets_collection,
|
SKIP_DIRS, get_collection, get_closets_collection,
|
||||||
file_already_mined, mine_lock, build_closet_text, upsert_closet,
|
file_already_mined, mine_lock, build_closet_lines, upsert_closet_lines,
|
||||||
)
|
)
|
||||||
|
|
||||||
READABLE_EXTENSIONS = {
|
READABLE_EXTENSIONS = {
|
||||||
@@ -471,14 +471,15 @@ def process_file(
|
|||||||
drawers_added += 1
|
drawers_added += 1
|
||||||
|
|
||||||
# Build closet — the searchable index pointing to these drawers
|
# Build closet — the searchable index pointing to these drawers
|
||||||
|
# Each topic line is atomic — never split across closets
|
||||||
if closets_col and drawers_added > 0:
|
if closets_col and drawers_added > 0:
|
||||||
drawer_ids = [
|
drawer_ids = [
|
||||||
f"drawer_{wing}_{room}_{hashlib.sha256((source_file + str(c['chunk_index'])).encode()).hexdigest()[:24]}"
|
f"drawer_{wing}_{room}_{hashlib.sha256((source_file + str(c['chunk_index'])).encode()).hexdigest()[:24]}"
|
||||||
for c in chunks
|
for c in chunks
|
||||||
]
|
]
|
||||||
closet_text = build_closet_text(source_file, drawer_ids, content, wing, room)
|
closet_lines = build_closet_lines(source_file, drawer_ids, content, wing, room)
|
||||||
closet_id = f"closet_{wing}_{room}_{hashlib.sha256(source_file.encode()).hexdigest()[:24]}"
|
closet_id_base = f"closet_{wing}_{room}_{hashlib.sha256(source_file.encode()).hexdigest()[:24]}"
|
||||||
upsert_closet(closets_col, closet_id, closet_text, {
|
upsert_closet_lines(closets_col, closet_id_base, closet_lines, {
|
||||||
"wing": wing,
|
"wing": wing,
|
||||||
"room": room,
|
"room": room,
|
||||||
"source_file": source_file,
|
"source_file": source_file,
|
||||||
|
|||||||
+82
-31
@@ -60,58 +60,109 @@ def get_closets_collection(palace_path: str, create: bool = True):
|
|||||||
CLOSET_CHAR_LIMIT = 1500 # fill closet until ~1500 chars, then start a new one
|
CLOSET_CHAR_LIMIT = 1500 # fill closet until ~1500 chars, then start a new one
|
||||||
|
|
||||||
|
|
||||||
def build_closet_text(source_file, drawer_ids, content, wing, room):
|
def build_closet_lines(source_file, drawer_ids, content, wing, room):
|
||||||
"""Build a compact closet entry from drawer content.
|
"""Build compact closet pointer lines from drawer content.
|
||||||
|
|
||||||
Extracts topics, names, and key quotes into an AAAK-style pointer
|
Returns a LIST of lines (not joined). Each line is one complete topic
|
||||||
that tells the searcher which drawers to open.
|
pointer — never split across closets.
|
||||||
|
|
||||||
|
Format: topic|entities|→drawer_ids
|
||||||
"""
|
"""
|
||||||
import re
|
import re
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
drawer_ref = ",".join(drawer_ids[:3])
|
||||||
|
|
||||||
# Extract proper nouns (capitalized words, 2+ occurrences)
|
# Extract proper nouns (capitalized words, 2+ occurrences)
|
||||||
words = re.findall(r"\b[A-Z][a-z]{2,}\b", content[:5000])
|
words = re.findall(r"\b[A-Z][a-z]{2,}\b", content[:5000])
|
||||||
word_freq = {}
|
word_freq = {}
|
||||||
for w in words:
|
for w in words:
|
||||||
word_freq[w] = word_freq.get(w, 0) + 1
|
word_freq[w] = word_freq.get(w, 0) + 1
|
||||||
entities = sorted([w for w, c in word_freq.items() if c >= 2], key=lambda w: -word_freq[w])[:5]
|
entities = sorted(
|
||||||
|
[w for w, c in word_freq.items() if c >= 2],
|
||||||
|
key=lambda w: -word_freq[w],
|
||||||
|
)[:5]
|
||||||
|
entity_str = ";".join(entities) if entities else ""
|
||||||
|
|
||||||
# Extract key phrases
|
# Extract key phrases — action verbs + context
|
||||||
topics = []
|
topics = []
|
||||||
for pattern in [
|
for pattern in [
|
||||||
r"(?:built|fixed|wrote|added|pushed|tested|created|decided|migrated)\s+[\w\s]{3,30}",
|
r"(?:built|fixed|wrote|added|pushed|tested|created|decided|migrated|reviewed|deployed|configured|removed|updated)\s+[\w\s]{3,40}",
|
||||||
]:
|
]:
|
||||||
topics.extend(re.findall(pattern, content[:5000], re.IGNORECASE))
|
topics.extend(re.findall(pattern, content[:5000], re.IGNORECASE))
|
||||||
topics = list(dict.fromkeys(t.strip().lower() for t in topics))[:8]
|
# Also grab section headers if present
|
||||||
|
for header in re.findall(r"^#{1,3}\s+(.{5,60})$", content[:5000], re.MULTILINE):
|
||||||
|
topics.append(header.strip())
|
||||||
|
# Dedupe preserving order
|
||||||
|
topics = list(dict.fromkeys(t.strip().lower() for t in topics))[:12]
|
||||||
|
|
||||||
# Extract first quote
|
# Extract quotes
|
||||||
quotes = re.findall(r'"([^"]{15,100})"', content[:5000])
|
quotes = re.findall(r'"([^"]{15,150})"', content[:5000])
|
||||||
quote = quotes[0] if quotes else ""
|
|
||||||
|
|
||||||
# Build pointer lines
|
# Build pointer lines — each one is atomic, never split
|
||||||
entity_str = ";".join(entities[:5]) if entities else ""
|
|
||||||
lines = []
|
lines = []
|
||||||
for topic in topics:
|
for topic in topics:
|
||||||
pointer = f"{topic}|{entity_str}|→{','.join(drawer_ids[:3])}"
|
lines.append(f"{topic}|{entity_str}|→{drawer_ref}")
|
||||||
lines.append(pointer)
|
for quote in quotes[:3]:
|
||||||
if quote:
|
lines.append(f'"{quote}"|{entity_str}|→{drawer_ref}')
|
||||||
lines.append(f'"{quote}"|{entity_str}|→{",".join(drawer_ids[:3])}')
|
|
||||||
|
# Always have at least one line
|
||||||
if not lines:
|
if not lines:
|
||||||
lines.append(f"{wing}/{room}|{entity_str}|→{','.join(drawer_ids[:3])}")
|
name = Path(source_file).stem[:40]
|
||||||
|
lines.append(f"{wing}/{room}/{name}|{entity_str}|→{drawer_ref}")
|
||||||
|
|
||||||
return "\n".join(lines)
|
return lines
|
||||||
|
|
||||||
|
|
||||||
def upsert_closet(closets_col, closet_id, closet_text, metadata):
|
def upsert_closet_lines(closets_col, closet_id_base, lines, metadata):
|
||||||
"""Add or update a closet. Respects CLOSET_CHAR_LIMIT."""
|
"""Add topic lines to closets. Never splits a topic mid-line.
|
||||||
try:
|
|
||||||
existing = closets_col.get(ids=[closet_id])
|
If adding a line WHOLE would exceed CLOSET_CHAR_LIMIT, a new closet
|
||||||
if existing.get("ids"):
|
is created. Some closets may have less than 1500 chars — that's fine.
|
||||||
old_text = existing["documents"][0]
|
Every topic is complete and readable.
|
||||||
if len(old_text) + len(closet_text) + 1 <= CLOSET_CHAR_LIMIT:
|
|
||||||
closet_text = old_text + "\n" + closet_text
|
Returns the number of closets written.
|
||||||
# else: start fresh — old closet was full
|
"""
|
||||||
except Exception:
|
closet_num = 1
|
||||||
pass
|
current_lines = []
|
||||||
closets_col.upsert(documents=[closet_text], ids=[closet_id], metadatas=[metadata])
|
current_chars = 0
|
||||||
|
closets_written = 0
|
||||||
|
|
||||||
|
def _flush():
|
||||||
|
nonlocal closets_written
|
||||||
|
if not current_lines:
|
||||||
|
return
|
||||||
|
closet_id = f"{closet_id_base}_{closet_num:02d}"
|
||||||
|
text = "\n".join(current_lines)
|
||||||
|
|
||||||
|
# Check if closet already has content — append if room
|
||||||
|
try:
|
||||||
|
existing = closets_col.get(ids=[closet_id])
|
||||||
|
if existing.get("ids") and existing["documents"][0]:
|
||||||
|
old = existing["documents"][0]
|
||||||
|
if len(old) + len(text) + 1 <= CLOSET_CHAR_LIMIT:
|
||||||
|
text = old + "\n" + text
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
closets_col.upsert(documents=[text], ids=[closet_id], metadatas=[metadata])
|
||||||
|
closets_written += 1
|
||||||
|
|
||||||
|
for line in lines:
|
||||||
|
line_len = len(line)
|
||||||
|
# Would this line fit whole in the current closet?
|
||||||
|
if current_chars > 0 and current_chars + line_len + 1 > CLOSET_CHAR_LIMIT:
|
||||||
|
# Doesn't fit — flush current closet, start new one
|
||||||
|
_flush()
|
||||||
|
closet_num += 1
|
||||||
|
current_lines = []
|
||||||
|
current_chars = 0
|
||||||
|
|
||||||
|
current_lines.append(line)
|
||||||
|
current_chars += line_len + 1 # +1 for newline
|
||||||
|
|
||||||
|
_flush()
|
||||||
|
return closets_written
|
||||||
|
|
||||||
|
|
||||||
@contextlib.contextmanager
|
@contextlib.contextmanager
|
||||||
|
|||||||
Reference in New Issue
Block a user