Files
mempalace/mempalace/exporter.py
T
Marcio E. Heiderscheidt b524b31839 fix: restrict file permissions on sensitive palace data (#814)
* fix: restrict file permissions on sensitive palace data

On Linux with default umask (022), several files and directories
containing personal data were created world-readable. This patch
applies chmod 0o700 to directories and 0o600 to files immediately
after creation, wrapped in try/except for Windows compatibility.

Files hardened:
- hooks_cli.py: hook_state/ directory and hook.log
- entity_registry.py: entity_registry.json (names, relationships)
- knowledge_graph.py: knowledge_graph.sqlite3 parent directory
- exporter.py: export output directory and wing subdirectories
- config.py: people_map.json (name mappings)
- mcp_server.py: WAL file creation uses atomic os.open (TOCTOU fix)

Refs: MemPalace/mempalace#809

* fix: avoid redundant chmod calls on hot paths

- hooks_cli.py: chmod STATE_DIR and hook.log only on first creation,
  not on every _log() call (hooks fire on every Stop event)
- exporter.py: track created wing dirs to skip redundant makedirs +
  chmod on the same directory across batches
- mcp_server.py: remove redundant _WAL_FILE.chmod after os.open
  already set mode=0o600 atomically

Refs: MemPalace/mempalace#809
2026-04-15 00:27:03 -07:00

174 lines
6.1 KiB
Python

"""
exporter.py — Export the palace as a browsable folder of markdown files.
Produces:
output_dir/
index.md — table of contents
wing_name/
room_name.md — one file per room, drawers as sections
Streams drawers in paginated batches so memory usage stays bounded
regardless of palace size.
"""
import os
import re
from collections import defaultdict
from datetime import datetime
from .palace import get_collection
def _safe_path_component(name: str) -> str:
"""Sanitize a string for use as a directory/file name component."""
name = re.sub(r'[/\\:*?"<>|]', "_", name)
name = name.strip(". ")
return name or "unknown"
def export_palace(palace_path: str, output_dir: str, format: str = "markdown") -> dict:
"""Export all palace drawers as markdown files organized by wing/room.
Streams drawers in batches of 1000 and writes each wing/room file
incrementally, keeping memory usage proportional to batch size rather
than total palace size.
Args:
palace_path: Path to the ChromaDB palace directory.
output_dir: Where to write the exported markdown tree.
format: Output format (currently only "markdown").
Returns:
Stats dict: {"wings": N, "rooms": N, "drawers": N}
"""
col = get_collection(palace_path)
total = col.count()
if total == 0:
print(" Palace is empty — nothing to export.")
return {"wings": 0, "rooms": 0, "drawers": 0}
os.makedirs(output_dir, exist_ok=True)
try:
os.chmod(output_dir, 0o700)
except (OSError, NotImplementedError):
pass
# Track which room files have been opened (so we can append vs overwrite)
opened_rooms: set[tuple[str, str]] = set()
# Track which wing directories have been created and chmoded
created_wing_dirs: set[str] = set()
# Track stats per wing: {wing: {room: count}}
wing_stats: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int))
total_drawers = 0
print(f" Streaming {total} drawers...")
offset = 0
while offset < total:
batch = col.get(limit=1000, offset=offset, include=["documents", "metadatas"])
if not batch["ids"]:
break
# Group this batch by wing/room so we do one file write per room per batch
batch_grouped: dict[str, dict[str, list]] = defaultdict(lambda: defaultdict(list))
for doc_id, doc, meta in zip(batch["ids"], batch["documents"], batch["metadatas"]):
wing = meta.get("wing", "unknown")
room = meta.get("room", "general")
batch_grouped[wing][room].append(
{
"id": doc_id,
"content": doc,
"source": meta.get("source_file", ""),
"filed_at": meta.get("filed_at", ""),
"added_by": meta.get("added_by", ""),
}
)
# Write/append each room file
for wing, rooms in batch_grouped.items():
safe_wing = _safe_path_component(wing)
wing_dir = os.path.join(output_dir, safe_wing)
if wing_dir not in created_wing_dirs:
os.makedirs(wing_dir, exist_ok=True)
try:
os.chmod(wing_dir, 0o700)
except (OSError, NotImplementedError):
pass
created_wing_dirs.add(wing_dir)
for room, drawers in rooms.items():
safe_room = _safe_path_component(room)
room_path = os.path.join(wing_dir, f"{safe_room}.md")
key = (wing, room)
is_new = key not in opened_rooms
with open(room_path, "a" if not is_new else "w", encoding="utf-8") as f:
if is_new:
f.write(f"# {wing} / {room}\n\n")
opened_rooms.add(key)
for drawer in drawers:
source = drawer["source"] or "unknown"
filed = drawer["filed_at"] or "unknown"
added_by = drawer["added_by"] or "unknown"
f.write(
f"## {drawer['id']}\n"
f"\n"
f"> {_quote_content(drawer['content'])}\n"
f"\n"
f"| Field | Value |\n"
f"|-------|-------|\n"
f"| Source | {source} |\n"
f"| Filed | {filed} |\n"
f"| Added by | {added_by} |\n"
f"\n"
f"---\n\n"
)
wing_stats[wing][room] += len(drawers)
total_drawers += len(drawers)
offset += len(batch["ids"])
# Build and print stats
index_rows = []
for wing in sorted(wing_stats):
rooms = wing_stats[wing]
wing_drawer_count = sum(rooms.values())
index_rows.append((wing, len(rooms), wing_drawer_count))
print(f" {wing}: {len(rooms)} rooms, {wing_drawer_count} drawers")
# Write index.md
today = datetime.now().strftime("%Y-%m-%d")
index_lines = [
f"# Palace Export — {today}\n",
"",
"| Wing | Rooms | Drawers |",
"|------|-------|---------|",
]
for wing, room_count, drawer_count in index_rows:
index_lines.append(f"| [{wing}]({wing}/) | {room_count} | {drawer_count} |")
index_lines.append("")
index_path = os.path.join(output_dir, "index.md")
with open(index_path, "w", encoding="utf-8") as f:
f.write("\n".join(index_lines))
stats = {
"wings": len(wing_stats),
"rooms": sum(r for _, r, _ in index_rows),
"drawers": total_drawers,
}
print(
f"\n Exported {stats['drawers']} drawers across {stats['wings']} wings, {stats['rooms']} rooms"
)
print(f" Output: {output_dir}")
return stats
def _quote_content(text: str) -> str:
"""Format content for a markdown blockquote, handling multiline."""
lines = text.rstrip("\n").split("\n")
return "\n> ".join(lines)