diff --git a/mempalace/cli.py b/mempalace/cli.py index 467f618..30add58 100644 --- a/mempalace/cli.py +++ b/mempalace/cli.py @@ -65,6 +65,9 @@ def cmd_init(args): def cmd_mine(args): palace_path = os.path.expanduser(args.palace) if args.palace else MempalaceConfig().palace_path + include_ignored = [] + for raw in args.include_ignored or []: + include_ignored.extend(part.strip() for part in raw.split(",") if part.strip()) if args.mode == "convos": from .convo_miner import mine_convos @@ -88,6 +91,8 @@ def cmd_mine(args): agent=args.agent, limit=args.limit, dry_run=args.dry_run, + respect_gitignore=not args.no_gitignore, + include_ignored=include_ignored, ) @@ -359,6 +364,17 @@ def main(): help="Ingest mode: 'projects' for code/docs (default), 'convos' for chat exports", ) p_mine.add_argument("--wing", default=None, help="Wing name (default: directory name)") + p_mine.add_argument( + "--no-gitignore", + action="store_true", + help="Don't respect .gitignore files when scanning project files", + ) + p_mine.add_argument( + "--include-ignored", + action="append", + default=[], + help="Always scan these project-relative paths even if ignored; repeat or pass comma-separated paths", + ) p_mine.add_argument( "--agent", default="mempalace", diff --git a/mempalace/miner.py b/mempalace/miner.py index ecd313d..4d3ca76 100644 --- a/mempalace/miner.py +++ b/mempalace/miner.py @@ -10,6 +10,7 @@ Stores verbatim chunks as drawers. No summaries. Ever. 
import os
import sys
import hashlib
import fnmatch
from pathlib import Path
from datetime import datetime
from collections import defaultdict
from typing import Optional

# NOTE(review): SKIP_DIRS and READABLE_EXTENSIONS are referenced below but their
# full definitions are outside this view; they are assumed to be module-level
# collections (directory names / lower-case file suffixes) — confirm.

# Exact file names that are never mined unless explicitly force-included.
SKIP_FILENAMES = {
    "mempalace.yaml",
    "mempalace.yml",
    "mempal.yaml",
    "mempal.yml",
    ".gitignore",
    "package-lock.json",
}

CHUNK_SIZE = 800  # chars per drawer
CHUNK_OVERLAP = 100  # overlap between chunks
MIN_CHUNK_SIZE = 50  # skip tiny chunks


# =============================================================================
# IGNORE MATCHING
# =============================================================================


def _relative_posix(path: Path, base: Path) -> Optional[str]:
    """Return *path* relative to *base* as a POSIX string.

    Returns None when *path* is not located under *base* or when it is *base*
    itself (``relative_to`` yields ``"."`` in that case, which must not be fed
    to the pattern/include matching below).
    """
    try:
        relative = path.relative_to(base).as_posix().strip("/")
    except ValueError:
        return None
    if relative in ("", "."):
        return None
    return relative


class GitignoreMatcher:
    """Lightweight matcher for one directory's .gitignore patterns.

    ``rules`` is a list of dicts with the keys ``pattern`` (str, leading and
    trailing slashes removed), ``anchored`` (bool), ``dir_only`` (bool) and
    ``negated`` (bool). Rules are evaluated in file order; the last rule that
    matches a path decides the outcome.
    """

    def __init__(self, base_dir: Path, rules: list):
        self.base_dir = base_dir
        self.rules = rules

    @classmethod
    def from_dir(cls, dir_path: Path):
        """Build a matcher from ``dir_path/.gitignore``.

        Returns None when the file is missing, unreadable, or contains no
        usable pattern.
        """
        gitignore_path = dir_path / ".gitignore"
        try:
            if not gitignore_path.is_file():
                return None
            # utf-8-sig drops a leading byte-order mark; with plain utf-8 the
            # BOM would stay glued to the first pattern and it would never match.
            text = gitignore_path.read_text(encoding="utf-8-sig", errors="replace")
        except OSError:
            return None

        rules = [
            rule
            for rule in (cls._parse_line(raw_line) for raw_line in text.splitlines())
            if rule is not None
        ]
        if not rules:
            return None

        return cls(dir_path, rules)

    @staticmethod
    def _parse_line(raw_line: str):
        """Parse one .gitignore line into a rule dict, or None for blanks/comments."""
        line = raw_line.strip()
        if not line or line.startswith("#"):
            return None

        # Negation must be detected BEFORE unescaping: "\!name" is the literal
        # file name "!name", not a negated "name".
        negated = line.startswith("!")
        if negated:
            line = line[1:]

        # A leading backslash protects a literal "#" or "!" at pattern start.
        if line.startswith(("\\#", "\\!")):
            line = line[1:]

        anchored = line.startswith("/")
        if anchored:
            line = line.lstrip("/")

        dir_only = line.endswith("/")
        if dir_only:
            line = line.rstrip("/")

        if not line:
            return None

        return {
            "pattern": line,
            "anchored": anchored,
            "dir_only": dir_only,
            "negated": negated,
        }

    def matches(self, path: Path, is_dir: Optional[bool] = None):
        """Return the ignore decision of this matcher for *path*.

        Returns True (ignored), False (explicitly re-included by a negated
        rule) or None (no rule applies, or *path* is not strictly inside
        ``base_dir``). When *is_dir* is None the filesystem is consulted.
        """
        relative = _relative_posix(path, self.base_dir)
        if relative is None:
            return None

        if is_dir is None:
            is_dir = path.is_dir()

        ignored = None
        for rule in self.rules:
            if self._rule_matches(rule, relative, is_dir):
                # Later rules override earlier ones.
                ignored = not rule["negated"]
        return ignored

    def _rule_matches(self, rule: dict, relative: str, is_dir: bool) -> bool:
        """Check one rule against a base-relative POSIX path."""
        pattern = rule["pattern"]
        pattern_parts = pattern.split("/")
        parts = relative.split("/")

        if rule["dir_only"]:
            # For a file, only its containing directories can satisfy a
            # directory-only rule.
            parts = parts if is_dir else parts[:-1]
            if not parts:
                return False

        if rule["anchored"] or len(pattern_parts) > 1:
            return self._match_from_root(parts, pattern_parts)

        # A bare name/glob may match any single path component.
        return any(fnmatch.fnmatch(part, pattern) for part in parts)

    def _match_from_root(self, target_parts: list, pattern_parts: list) -> bool:
        """Match pattern components against path components starting at base_dir.

        ``**`` consumes zero or more components. The match succeeds as soon as
        the pattern is exhausted, so a rule that matches a directory also
        covers everything beneath it.
        """

        def matches(path_index: int, pattern_index: int) -> bool:
            if pattern_index == len(pattern_parts):
                return True

            if path_index == len(target_parts):
                # Path exhausted: only trailing "**" components may remain.
                return all(part == "**" for part in pattern_parts[pattern_index:])

            pattern_part = pattern_parts[pattern_index]
            if pattern_part == "**":
                return matches(path_index, pattern_index + 1) or matches(
                    path_index + 1, pattern_index
                )

            if not fnmatch.fnmatch(target_parts[path_index], pattern_part):
                return False

            return matches(path_index + 1, pattern_index + 1)

        return matches(0, 0)


def load_gitignore_matcher(dir_path: Path, cache: dict):
    """Load and cache one directory's .gitignore matcher (None is cached too)."""
    if dir_path not in cache:
        cache[dir_path] = GitignoreMatcher.from_dir(dir_path)
    return cache[dir_path]


def is_gitignored(path: Path, matchers: list, is_dir: bool = False) -> bool:
    """Apply active .gitignore matchers in ancestor order; last match wins."""
    ignored = False
    for matcher in matchers:
        decision = matcher.matches(path, is_dir=is_dir)
        if decision is not None:
            ignored = decision
    return ignored


def should_skip_dir(dirname: str) -> bool:
    """Skip known generated/cache directories before gitignore matching."""
    return dirname in SKIP_DIRS or dirname.endswith(".egg-info")


def normalize_include_paths(include_ignored: list) -> set:
    """Normalize comma-parsed include paths into project-relative POSIX strings.

    Surrounding whitespace and leading/trailing slashes are removed; empty
    entries are dropped. ``None`` is treated as an empty list.
    """
    normalized = set()
    for raw_path in include_ignored or []:
        candidate = str(raw_path).strip().strip("/")
        if candidate:
            normalized.add(Path(candidate).as_posix())
    return normalized


def is_exact_force_include(path: Path, project_path: Path, include_paths: set) -> bool:
    """Return True when a path exactly matches an explicit include override."""
    if not include_paths:
        return False

    relative = _relative_posix(path, project_path)
    return relative is not None and relative in include_paths


def is_force_included(path: Path, project_path: Path, include_paths: set) -> bool:
    """Return True when a path or one of its ancestors/descendants was explicitly included.

    Ancestors of an include path count so that the directory walk can descend
    towards the included entry.
    """
    if not include_paths:
        return False

    relative = _relative_posix(path, project_path)
    if relative is None:
        return False

    for include_path in include_paths:
        if relative == include_path:
            return True
        if relative.startswith(f"{include_path}/"):
            return True
        if include_path.startswith(f"{relative}/"):
            return True

    return False


def scan_project(
    project_dir: str,
    respect_gitignore: bool = True,
    include_ignored: Optional[list] = None,
) -> list:
    """Return list of all readable file paths.

    Args:
        project_dir: Directory to walk; ``~`` is expanded and the path resolved.
        respect_gitignore: When True, honour every ``.gitignore`` found at or
            below the project root.
        include_ignored: Project-relative paths that are scanned even when a
            skip rule or .gitignore would exclude them. An exactly named file
            is also accepted without a readable extension.

    Returns:
        A list of ``Path`` objects in ``os.walk`` order.
    """
    project_path = Path(project_dir).expanduser().resolve()
    files = []
    active_matchers = []
    matcher_cache = {}
    include_paths = normalize_include_paths(include_ignored)

    for root, dirs, filenames in os.walk(project_path):
        root_path = Path(root)

        # We can only be inside a skip directory because an include override
        # led us here. In that case nothing is admitted implicitly: siblings of
        # the included entry must stay skipped.
        in_skipped_tree = any(
            should_skip_dir(part) for part in root_path.relative_to(project_path).parts
        )

        if respect_gitignore:
            # Drop matchers that belong to directories we have walked out of.
            active_matchers = [
                matcher
                for matcher in active_matchers
                if root_path == matcher.base_dir or matcher.base_dir in root_path.parents
            ]
            current_matcher = load_gitignore_matcher(root_path, matcher_cache)
            if current_matcher is not None:
                active_matchers.append(current_matcher)

        gitignore_active = respect_gitignore and bool(active_matchers)

        kept_dirs = []
        for dirname in dirs:
            dir_path = root_path / dirname
            if is_force_included(dir_path, project_path, include_paths):
                kept_dirs.append(dirname)
                continue
            if in_skipped_tree or should_skip_dir(dirname):
                continue
            if gitignore_active and is_gitignored(dir_path, active_matchers, is_dir=True):
                continue
            kept_dirs.append(dirname)
        # In-place assignment prunes the walk.
        dirs[:] = kept_dirs

        for filename in filenames:
            filepath = root_path / filename
            force_include = is_force_included(filepath, project_path, include_paths)
            exact_force_include = is_exact_force_include(filepath, project_path, include_paths)

            if not force_include and (in_skipped_tree or filename in SKIP_FILENAMES):
                continue
            if filepath.suffix.lower() not in READABLE_EXTENSIONS and not exact_force_include:
                continue
            if gitignore_active and not force_include:
                if is_gitignored(filepath, active_matchers, is_dir=False):
                    continue
            files.append(filepath)
    return files
import os
import shutil
import tempfile
from pathlib import Path

import chromadb
import yaml

from mempalace.miner import mine, scan_project


def write_file(path: Path, content: str):
    """Write *content* to *path* as UTF-8, creating missing parent directories."""
    parent = path.parent
    parent.mkdir(parents=True, exist_ok=True)
    path.write_text(content, encoding="utf-8")


def scanned_files(project_root: Path, **kwargs):
    """Run scan_project and return its hits as sorted project-relative POSIX paths."""
    relative_hits = [
        hit.relative_to(project_root).as_posix()
        for hit in scan_project(str(project_root), **kwargs)
    ]
    relative_hits.sort()
    return relative_hits


def _populate(project_root: Path, tree: dict):
    """Create every ``relative path -> content`` entry of *tree* below *project_root*."""
    for relative, content in tree.items():
        write_file(project_root / relative, content)


def test_project_mining():
    """Mining a small configured project stores at least one drawer."""
    tmpdir = tempfile.mkdtemp()
    try:
        root = Path(tmpdir).resolve()
        os.makedirs(root / "backend")
        _populate(root, {"backend/app.py": "def main():\n print('hello world')\n" * 20})

        config = {
            "wing": "test_project",
            "rooms": [
                {"name": "backend", "description": "Backend code"},
                {"name": "general", "description": "General"},
            ],
        }
        with open(root / "mempalace.yaml", "w") as handle:
            yaml.dump(config, handle)

        palace_dir = str(root / "palace")
        mine(str(root), palace_dir)

        collection = chromadb.PersistentClient(path=palace_dir).get_collection(
            "mempalace_drawers"
        )
        assert collection.count() > 0
    finally:
        shutil.rmtree(tmpdir)


def test_scan_project_respects_gitignore():
    """Files and directories listed in the root .gitignore are not scanned."""
    tmpdir = tempfile.mkdtemp()
    try:
        root = Path(tmpdir).resolve()
        _populate(
            root,
            {
                ".gitignore": "ignored.py\ngenerated/\n",
                "src/app.py": "print('hello')\n" * 20,
                "ignored.py": "print('ignore me')\n" * 20,
                "generated/artifact.py": "print('artifact')\n" * 20,
            },
        )

        expected = ["src/app.py"]
        assert scanned_files(root) == expected
    finally:
        shutil.rmtree(tmpdir)


def test_scan_project_respects_nested_gitignore():
    """Root and nested .gitignore files are both applied inside a sub-directory."""
    tmpdir = tempfile.mkdtemp()
    try:
        root = Path(tmpdir).resolve()
        _populate(
            root,
            {
                ".gitignore": "*.log\n",
                "subrepo/.gitignore": "tasks/\n",
                "subrepo/src/main.py": "print('main')\n" * 20,
                "subrepo/tasks/task.py": "print('task')\n" * 20,
                "subrepo/debug.log": "debug\n" * 20,
            },
        )

        expected = ["subrepo/src/main.py"]
        assert scanned_files(root) == expected
    finally:
        shutil.rmtree(tmpdir)


def test_scan_project_allows_nested_gitignore_override():
    """A negation in a nested .gitignore re-includes a file ignored by the root."""
    tmpdir = tempfile.mkdtemp()
    try:
        root = Path(tmpdir).resolve()
        _populate(
            root,
            {
                ".gitignore": "*.csv\n",
                "subrepo/.gitignore": "!keep.csv\n",
                "drop.csv": "a,b,c\n" * 20,
                "subrepo/keep.csv": "a,b,c\n" * 20,
            },
        )

        expected = ["subrepo/keep.csv"]
        assert scanned_files(root) == expected
    finally:
        shutil.rmtree(tmpdir)


def test_scan_project_allows_gitignore_negation_when_parent_dir_is_visible():
    """``dir/*`` keeps the directory visible, so a later negation can re-include a file."""
    tmpdir = tempfile.mkdtemp()
    try:
        root = Path(tmpdir).resolve()
        _populate(
            root,
            {
                ".gitignore": "generated/*\n!generated/keep.py\n",
                "generated/drop.py": "print('drop')\n" * 20,
                "generated/keep.py": "print('keep')\n" * 20,
            },
        )

        expected = ["generated/keep.py"]
        assert scanned_files(root) == expected
    finally:
        shutil.rmtree(tmpdir)


def test_scan_project_does_not_reinclude_file_from_ignored_directory():
    """Once a directory itself is ignored, negating a file inside it has no effect."""
    tmpdir = tempfile.mkdtemp()
    try:
        root = Path(tmpdir).resolve()
        _populate(
            root,
            {
                ".gitignore": "generated/\n!generated/keep.py\n",
                "generated/drop.py": "print('drop')\n" * 20,
                "generated/keep.py": "print('keep')\n" * 20,
            },
        )

        expected = []
        assert scanned_files(root) == expected
    finally:
        shutil.rmtree(tmpdir)


def test_scan_project_can_disable_gitignore():
    """respect_gitignore=False scans paths that .gitignore would exclude."""
    tmpdir = tempfile.mkdtemp()
    try:
        root = Path(tmpdir).resolve()
        _populate(
            root,
            {
                ".gitignore": "data/\n",
                "data/stuff.csv": "a,b,c\n" * 20,
            },
        )

        expected = ["data/stuff.csv"]
        assert scanned_files(root, respect_gitignore=False) == expected
    finally:
        shutil.rmtree(tmpdir)


def test_scan_project_can_include_ignored_directory():
    """include_ignored with a directory scans that directory despite .gitignore."""
    tmpdir = tempfile.mkdtemp()
    try:
        root = Path(tmpdir).resolve()
        _populate(
            root,
            {
                ".gitignore": "docs/\n",
                "docs/guide.md": "# Guide\n" * 20,
            },
        )

        expected = ["docs/guide.md"]
        assert scanned_files(root, include_ignored=["docs"]) == expected
    finally:
        shutil.rmtree(tmpdir)


def test_scan_project_can_include_specific_ignored_file():
    """Including one file of an ignored directory does not pull in its siblings."""
    tmpdir = tempfile.mkdtemp()
    try:
        root = Path(tmpdir).resolve()
        _populate(
            root,
            {
                ".gitignore": "generated/\n",
                "generated/drop.py": "print('drop')\n" * 20,
                "generated/keep.py": "print('keep')\n" * 20,
            },
        )

        expected = ["generated/keep.py"]
        assert scanned_files(root, include_ignored=["generated/keep.py"]) == expected
    finally:
        shutil.rmtree(tmpdir)


def test_scan_project_can_include_exact_file_without_known_extension():
    """An exactly named include is scanned even without a readable extension."""
    tmpdir = tempfile.mkdtemp()
    try:
        root = Path(tmpdir).resolve()
        _populate(
            root,
            {
                ".gitignore": "README\n",
                "README": "hello\n" * 20,
            },
        )

        expected = ["README"]
        assert scanned_files(root, include_ignored=["README"]) == expected
    finally:
        shutil.rmtree(tmpdir)


def test_scan_project_include_override_beats_skip_dirs():
    """include_ignored also overrides the built-in skip-directory list."""
    tmpdir = tempfile.mkdtemp()
    try:
        root = Path(tmpdir).resolve()
        _populate(root, {".pytest_cache/cache.py": "print('cache')\n" * 20})

        expected = [".pytest_cache/cache.py"]
        actual = scanned_files(
            root,
            respect_gitignore=False,
            include_ignored=[".pytest_cache"],
        )
        assert actual == expected
    finally:
        shutil.rmtree(tmpdir)


def test_scan_project_skip_dirs_still_apply_without_override():
    """Without an override, built-in skip directories are excluded even with gitignore off."""
    tmpdir = tempfile.mkdtemp()
    try:
        root = Path(tmpdir).resolve()
        _populate(
            root,
            {
                ".pytest_cache/cache.py": "print('cache')\n" * 20,
                "main.py": "print('main')\n" * 20,
            },
        )

        expected = ["main.py"]
        assert scanned_files(root, respect_gitignore=False) == expected
    finally:
        shutil.rmtree(tmpdir)