b524b31839
* fix: restrict file permissions on sensitive palace data On Linux with default umask (022), several files and directories containing personal data were created world-readable. This patch applies chmod 0o700 to directories and 0o600 to files immediately after creation, wrapped in try/except for Windows compatibility. Files hardened: - hooks_cli.py: hook_state/ directory and hook.log - entity_registry.py: entity_registry.json (names, relationships) - knowledge_graph.py: knowledge_graph.sqlite3 parent directory - exporter.py: export output directory and wing subdirectories - config.py: people_map.json (name mappings) - mcp_server.py: WAL file creation uses atomic os.open (TOCTOU fix) Refs: MemPalace/mempalace#809 * fix: avoid redundant chmod calls on hot paths - hooks_cli.py: chmod STATE_DIR and hook.log only on first creation, not on every _log() call (hooks fire on every Stop event) - exporter.py: track created wing dirs to skip redundant makedirs + chmod on the same directory across batches - mcp_server.py: remove redundant _WAL_FILE.chmod after os.open already set mode=0o600 atomically Refs: MemPalace/mempalace#809
174 lines
6.1 KiB
Python
174 lines
6.1 KiB
Python
"""
|
|
exporter.py — Export the palace as a browsable folder of markdown files.

Produces:
    output_dir/
        index.md — table of contents
        wing_name/
            room_name.md — one file per room, drawers as sections

Streams drawers in paginated batches so memory usage stays bounded
regardless of palace size.
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
from collections import defaultdict
|
|
from datetime import datetime
|
|
|
|
from .palace import get_collection
|
|
|
|
|
|
def _safe_path_component(name: str) -> str:
|
|
"""Sanitize a string for use as a directory/file name component."""
|
|
name = re.sub(r'[/\\:*?"<>|]', "_", name)
|
|
name = name.strip(". ")
|
|
return name or "unknown"
|
|
|
|
|
|
def _make_private_dir(path: str) -> None:
    """Create *path* if needed and restrict it to the owning user (0o700)."""
    # Requesting the mode at creation time avoids a window in which a freshly
    # created directory carries the wider, umask-default permissions.
    os.makedirs(path, mode=0o700, exist_ok=True)
    try:
        # Also tightens directories that already existed before this run.
        os.chmod(path, 0o700)
    except (OSError, NotImplementedError):
        # Deliberately best-effort: chmod is limited or unsupported on some
        # platforms (e.g. Windows) and must not abort the export.
        pass


def export_palace(palace_path: str, output_dir: str, format: str = "markdown") -> dict:
    """Export all palace drawers as markdown files organized by wing/room.

    Drawers are fetched in batches of 1000 and appended to their room file
    batch by batch, so memory usage follows the batch size rather than the
    total palace size. After all batches are written, an ``index.md`` table
    of contents is written to *output_dir*.

    Args:
        palace_path: Path handed to ``get_collection`` to open the palace.
        output_dir: Where to write the exported markdown tree. Created if
            missing; permissions are restricted to the owner where supported.
        format: Output format (currently only "markdown"). The value is
            accepted for interface compatibility but is not read here.

    Returns:
        Stats dict: {"wings": N, "rooms": N, "drawers": N}. All zeros when
        the collection is empty, in which case nothing is written.
    """
    # Local import: only needed to build link targets for index.md.
    from urllib.parse import quote

    col = get_collection(palace_path)
    total = col.count()

    if total == 0:
        print(" Palace is empty — nothing to export.")
        return {"wings": 0, "rooms": 0, "drawers": 0}

    _make_private_dir(output_dir)

    # (wing, room) pairs whose heading has already been written.
    opened_rooms: set[tuple[str, str]] = set()
    # Room files already written during this run. The overwrite-vs-append
    # decision is keyed on the real path because distinct wing/room names can
    # sanitize to the same file; keying on the raw names would let the second
    # room truncate the first one's content.
    written_paths: set[str] = set()
    # Wing directories already created and chmoded (skip redundant syscalls).
    created_wing_dirs: set[str] = set()
    # Stats per wing: {wing: {room: count}}
    wing_stats: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int))
    total_drawers = 0

    print(f" Streaming {total} drawers...")
    offset = 0
    while offset < total:
        batch = col.get(limit=1000, offset=offset, include=["documents", "metadatas"])
        if not batch["ids"]:
            # Collection shrank since count(); stop instead of looping forever.
            break

        # Group this batch by wing/room so each room file is opened once per batch.
        batch_grouped: dict[str, dict[str, list]] = defaultdict(lambda: defaultdict(list))
        for doc_id, doc, meta in zip(batch["ids"], batch["documents"], batch["metadatas"]):
            # A record may carry no metadata and/or no document; treat both as empty.
            meta = meta or {}
            wing = meta.get("wing", "unknown")
            room = meta.get("room", "general")
            batch_grouped[wing][room].append(
                {
                    "id": doc_id,
                    "content": doc if doc is not None else "",
                    "source": meta.get("source_file", ""),
                    "filed_at": meta.get("filed_at", ""),
                    "added_by": meta.get("added_by", ""),
                }
            )

        # Write/append each room file
        for wing, rooms in batch_grouped.items():
            wing_dir = os.path.join(output_dir, _safe_path_component(wing))
            if wing_dir not in created_wing_dirs:
                _make_private_dir(wing_dir)
                created_wing_dirs.add(wing_dir)

            for room, drawers in rooms.items():
                room_path = os.path.join(wing_dir, f"{_safe_path_component(room)}.md")
                key = (wing, room)
                # Start the file fresh the first time this run touches it
                # (replacing any stale export), append on every later visit.
                mode = "a" if room_path in written_paths else "w"

                with open(room_path, mode, encoding="utf-8") as f:
                    written_paths.add(room_path)
                    if key not in opened_rooms:
                        f.write(f"# {wing} / {room}\n\n")
                        opened_rooms.add(key)

                    for drawer in drawers:
                        source = drawer["source"] or "unknown"
                        filed = drawer["filed_at"] or "unknown"
                        added_by = drawer["added_by"] or "unknown"

                        f.write(
                            f"## {drawer['id']}\n"
                            f"\n"
                            f"> {_quote_content(drawer['content'])}\n"
                            f"\n"
                            f"| Field | Value |\n"
                            f"|-------|-------|\n"
                            f"| Source | {source} |\n"
                            f"| Filed | {filed} |\n"
                            f"| Added by | {added_by} |\n"
                            f"\n"
                            f"---\n\n"
                        )

                wing_stats[wing][room] += len(drawers)
                total_drawers += len(drawers)

        offset += len(batch["ids"])

    # Build and print stats
    index_rows = []
    for wing in sorted(wing_stats):
        rooms = wing_stats[wing]
        wing_drawer_count = sum(rooms.values())
        index_rows.append((wing, len(rooms), wing_drawer_count))
        print(f" {wing}: {len(rooms)} rooms, {wing_drawer_count} drawers")

    # Write index.md
    today = datetime.now().strftime("%Y-%m-%d")
    index_lines = [
        f"# Palace Export — {today}\n",
        "",
        "| Wing | Rooms | Drawers |",
        "|------|-------|---------|",
    ]
    for wing, room_count, drawer_count in index_rows:
        # Link to the directory name actually used on disk, percent-encoded so
        # names with spaces or other special characters remain valid targets.
        link_target = quote(_safe_path_component(wing))
        index_lines.append(f"| [{wing}]({link_target}/) | {room_count} | {drawer_count} |")
    index_lines.append("")

    index_path = os.path.join(output_dir, "index.md")
    with open(index_path, "w", encoding="utf-8") as f:
        f.write("\n".join(index_lines))

    stats = {
        "wings": len(wing_stats),
        "rooms": sum(r for _, r, _ in index_rows),
        "drawers": total_drawers,
    }
    print(
        f"\n Exported {stats['drawers']} drawers across {stats['wings']} wings, {stats['rooms']} rooms"
    )
    print(f" Output: {output_dir}")
    return stats
|
|
|
|
|
|
def _quote_content(text: str) -> str:
|
|
"""Format content for a markdown blockquote, handling multiline."""
|
|
lines = text.rstrip("\n").split("\n")
|
|
return "\n> ".join(lines)
|