security: harden inputs, fix shell injection, optimize DB access
- Fix command injection in hook script (pass paths via sys.argv)
- Add sanitize_name/sanitize_content validators in config.py
- Add 10MB file size guard + symlink skip in miners
- Fix SQLite connection leak in knowledge_graph.py (reuse connection)
- Use `with conn:` for proper transaction handling
- Consolidate shared palace operations into palace.py
- Add write-ahead log for audit trail on writes/deletes
- Add metadata cache with 30s TTL for status/taxonomy calls
- Upgrade md5 → sha256 for drawer/triple IDs
- Harden file permissions (0o700/0o600)
- Pin chromadb>=0.5.0,<0.7

Based on PR #252 by @anthonyonazure with lint fixes applied.

Co-Authored-By: anthonyonazure <anthonyonazure@users.noreply.github.com>
This commit is contained in:
+11
-35
@@ -15,9 +15,8 @@ from pathlib import Path
|
||||
from datetime import datetime
|
||||
from collections import defaultdict
|
||||
|
||||
import chromadb
|
||||
|
||||
from .normalize import normalize
|
||||
from .palace import SKIP_DIRS, get_collection, file_already_mined
|
||||
|
||||
|
||||
# File types that might contain conversations
|
||||
@@ -28,22 +27,8 @@ CONVO_EXTENSIONS = {
|
||||
".jsonl",
|
||||
}
|
||||
|
||||
SKIP_DIRS = {
|
||||
".git",
|
||||
"node_modules",
|
||||
"__pycache__",
|
||||
".venv",
|
||||
"venv",
|
||||
"env",
|
||||
"dist",
|
||||
"build",
|
||||
".next",
|
||||
".mempalace",
|
||||
"tool-results",
|
||||
"memory",
|
||||
}
|
||||
|
||||
MIN_CHUNK_SIZE = 30
|
||||
MAX_FILE_SIZE = 10 * 1024 * 1024 # 10 MB — skip files larger than this
|
||||
|
||||
|
||||
# =============================================================================
|
||||
@@ -211,23 +196,6 @@ def detect_convo_room(content: str) -> str:
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def get_collection(palace_path: str):
|
||||
os.makedirs(palace_path, exist_ok=True)
|
||||
client = chromadb.PersistentClient(path=palace_path)
|
||||
try:
|
||||
return client.get_collection("mempalace_drawers")
|
||||
except Exception:
|
||||
return client.create_collection("mempalace_drawers")
|
||||
|
||||
|
||||
def file_already_mined(collection, source_file: str) -> bool:
|
||||
try:
|
||||
results = collection.get(where={"source_file": source_file}, limit=1)
|
||||
return len(results.get("ids", [])) > 0
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# SCAN FOR CONVERSATION FILES
|
||||
# =============================================================================
|
||||
@@ -244,6 +212,14 @@ def scan_convos(convo_dir: str) -> list:
|
||||
continue
|
||||
filepath = Path(root) / filename
|
||||
if filepath.suffix.lower() in CONVO_EXTENSIONS:
|
||||
# Skip symlinks and oversized files
|
||||
if filepath.is_symlink():
|
||||
continue
|
||||
try:
|
||||
if filepath.stat().st_size > MAX_FILE_SIZE:
|
||||
continue
|
||||
except OSError:
|
||||
continue
|
||||
files.append(filepath)
|
||||
return files
|
||||
|
||||
@@ -356,7 +332,7 @@ def mine_convos(
|
||||
chunk_room = chunk.get("memory_type", room) if extract_mode == "general" else room
|
||||
if extract_mode == "general":
|
||||
room_counts[chunk_room] += 1
|
||||
drawer_id = f"drawer_{wing}_{chunk_room}_{hashlib.md5((source_file + str(chunk['chunk_index'])).encode(), usedforsecurity=False).hexdigest()[:16]}"
|
||||
drawer_id = f"drawer_{wing}_{chunk_room}_{hashlib.sha256((source_file + str(chunk['chunk_index'])).encode()).hexdigest()[:24]}"
|
||||
try:
|
||||
collection.add(
|
||||
documents=[chunk["content"]],
|
||||
|
||||
Reference in New Issue
Block a user