fix: support nested .gitignore rules during mining

This commit is contained in:
ac-opensource
2026-04-08 00:02:21 +08:00
parent 9b9daa9b4b
commit c8c220d789
3 changed files with 355 additions and 86 deletions
+16
View File
@@ -65,6 +65,9 @@ def cmd_init(args):
def cmd_mine(args):
palace_path = os.path.expanduser(args.palace) if args.palace else MempalaceConfig().palace_path
include_ignored = []
for raw in args.include_ignored or []:
include_ignored.extend(part.strip() for part in raw.split(",") if part.strip())
if args.mode == "convos":
from .convo_miner import mine_convos
@@ -88,6 +91,8 @@ def cmd_mine(args):
agent=args.agent,
limit=args.limit,
dry_run=args.dry_run,
respect_gitignore=not args.no_gitignore,
include_ignored=include_ignored,
)
@@ -288,6 +293,17 @@ def main():
help="Ingest mode: 'projects' for code/docs (default), 'convos' for chat exports",
)
p_mine.add_argument("--wing", default=None, help="Wing name (default: directory name)")
p_mine.add_argument(
"--no-gitignore",
action="store_true",
help="Don't respect .gitignore files when scanning project files",
)
p_mine.add_argument(
"--include-ignored",
action="append",
default=[],
help="Always scan these project-relative paths even if ignored; repeat or pass comma-separated paths",
)
p_mine.add_argument(
"--agent",
default="mempalace",
+162 -35
View File
@@ -52,6 +52,27 @@ SKIP_DIRS = {
".next",
"coverage",
".mempalace",
".ruff_cache",
".mypy_cache",
".pytest_cache",
".cache",
".tox",
".nox",
".idea",
".vscode",
".ipynb_checkpoints",
".eggs",
"htmlcov",
"target",
}
SKIP_FILENAMES = {
"mempalace.yaml",
"mempalace.yml",
"mempal.yaml",
"mempal.yml",
".gitignore",
"package-lock.json",
}
CHUNK_SIZE = 800 # chars per drawer
@@ -65,27 +86,32 @@ MIN_CHUNK_SIZE = 50 # skip tiny chunks
class GitignoreMatcher:
"""Lightweight matcher for a project's root .gitignore patterns."""
"""Lightweight matcher for one directory's .gitignore patterns."""
def __init__(self, rules: list):
def __init__(self, base_dir: Path, rules: list):
self.base_dir = base_dir
self.rules = rules
self.has_negations = any(rule["negated"] for rule in rules)
@classmethod
def from_project(cls, project_path: Path):
gitignore_path = project_path / ".gitignore"
if not gitignore_path.exists():
return cls([])
def from_dir(cls, dir_path: Path):
gitignore_path = dir_path / ".gitignore"
if not gitignore_path.is_file():
return None
try:
lines = gitignore_path.read_text(encoding="utf-8", errors="replace").splitlines()
except Exception:
return cls([])
return None
rules = []
for raw_line in lines:
line = raw_line.strip()
if not line or line.startswith("#"):
if not line:
continue
if line.startswith("\\#") or line.startswith("\\!"):
line = line[1:]
elif line.startswith("#"):
continue
negated = line.startswith("!")
@@ -112,24 +138,24 @@ class GitignoreMatcher:
}
)
return cls(rules)
if not rules:
return None
def matches(self, path: Path, project_path: Path, is_dir: bool = None) -> bool:
if not self.rules:
return False
return cls(dir_path, rules)
def matches(self, path: Path, is_dir: bool = None):
try:
relative = path.relative_to(project_path).as_posix().strip("/")
relative = path.relative_to(self.base_dir).as_posix().strip("/")
except ValueError:
return False
return None
if not relative:
return False
return None
if is_dir is None:
is_dir = path.is_dir()
ignored = False
ignored = None
for rule in self.rules:
if self._rule_matches(rule, relative, is_dir):
ignored = not rule["negated"]
@@ -175,6 +201,75 @@ class GitignoreMatcher:
return matches(0, 0)
def load_gitignore_matcher(dir_path: Path, cache: dict):
    """Return the .gitignore matcher for ``dir_path``, building it at most once.

    Args:
        dir_path: Directory whose own ``.gitignore`` should be loaded.
        cache: Mapping from directory to previously loaded matcher; updated
            in place on a miss.

    Returns:
        Whatever ``GitignoreMatcher.from_dir`` produced for this directory.
        That result (which may be ``None``) is cached as-is, so the directory
        is not re-read on later calls.
    """
    if dir_path in cache:
        return cache[dir_path]

    matcher = GitignoreMatcher.from_dir(dir_path)
    cache[dir_path] = matcher
    return matcher
def is_gitignored(path: Path, matchers: list, is_dir: bool = False) -> bool:
    """Decide whether ``path`` is ignored according to the active matchers.

    Every matcher is consulted in the order given. A matcher answers ``True``
    (ignored), ``False`` (explicitly not ignored) or ``None`` (no opinion).
    The last matcher that has an opinion wins.

    Args:
        path: Path being tested.
        matchers: Objects exposing ``matches(path, is_dir=...)``.
        is_dir: Whether ``path`` is a directory; forwarded to each matcher.

    Returns:
        The last non-``None`` decision, or ``False`` when no matcher decided.
    """
    verdicts = [matcher.matches(path, is_dir=is_dir) for matcher in matchers]
    opinions = [verdict for verdict in verdicts if verdict is not None]
    return opinions[-1] if opinions else False
def should_skip_dir(dirname: str) -> bool:
    """Tell whether a directory name is a known generated/cache directory.

    A name is skipped when it is listed in ``SKIP_DIRS`` or ends with
    ``.egg-info``. This check is meant to run before gitignore matching.
    """
    if dirname in SKIP_DIRS:
        return True
    return dirname.endswith(".egg-info")
def normalize_include_paths(include_ignored: list) -> set:
    """Normalize include overrides into project-relative POSIX strings.

    Args:
        include_ignored: Iterable of path-like entries (already split on
            commas by the caller), a single ``str``/``Path``, or ``None``.

    Returns:
        A set of normalized paths. Each entry has surrounding whitespace and
        leading/trailing slashes removed; entries that end up empty are
        dropped.
    """
    if not include_ignored:
        return set()

    # A bare string/Path is ONE path, not a collection of paths: iterating a
    # str would yield single characters, and a Path is not iterable at all.
    if isinstance(include_ignored, (str, Path)):
        include_ignored = [include_ignored]

    normalized = set()
    for raw_path in include_ignored:
        # Trim so that " /docs/ " and "docs" refer to the same entry.
        candidate = str(raw_path).strip().strip("/")
        if candidate:
            # Path(...).as_posix() gives forward slashes and drops redundant
            # separators / "." segments.
            normalized.add(Path(candidate).as_posix())
    return normalized
def is_exact_force_include(path: Path, project_path: Path, include_paths: set) -> bool:
    """Tell whether ``path`` is itself listed as an explicit include override.

    Args:
        path: Candidate path.
        project_path: Project root that ``include_paths`` are relative to.
        include_paths: Normalized project-relative POSIX paths.

    Returns:
        ``True`` only when the project-relative form of ``path`` is a member
        of ``include_paths``; ``False`` when there are no overrides or the
        path is not located under ``project_path``.
    """
    if include_paths:
        try:
            inside_project = path.relative_to(project_path)
        except ValueError:
            # Not under the project root, so it cannot match a relative entry.
            return False
        return inside_project.as_posix().strip("/") in include_paths
    return False
def is_force_included(path: Path, project_path: Path, include_paths: set) -> bool:
    """Tell whether ``path`` is related to any explicit include override.

    A path is related when it equals an override, lies underneath one, or is
    a parent directory of one.

    Args:
        path: Candidate file or directory.
        project_path: Project root that ``include_paths`` are relative to.
        include_paths: Normalized project-relative POSIX paths.

    Returns:
        ``True`` when any override relates to ``path``; ``False`` when there
        are no overrides or ``path`` is not under ``project_path``.
    """
    if not include_paths:
        return False

    try:
        relative = path.relative_to(project_path).as_posix().strip("/")
    except ValueError:
        return False
    if not relative:
        return False

    def related(include_path):
        # The "/" suffix keeps "docsx" from matching an override named "docs".
        return (
            relative == include_path
            or relative.startswith(f"{include_path}/")
            or include_path.startswith(f"{relative}/")
        )

    return any(related(include_path) for include_path in include_paths)
# =============================================================================
# CONFIG
# =============================================================================
@@ -401,36 +496,58 @@ def process_file(
# =============================================================================
def scan_project(project_dir: str) -> list:
def scan_project(
project_dir: str,
respect_gitignore: bool = True,
include_ignored: list = None,
) -> list:
"""Return list of all readable file paths."""
project_path = Path(project_dir).expanduser().resolve()
gitignore_matcher = GitignoreMatcher.from_project(project_path)
files = []
active_matchers = []
matcher_cache = {}
include_paths = normalize_include_paths(include_ignored)
for root, dirs, filenames in os.walk(project_path):
root_path = Path(root)
dirs[:] = [d for d in dirs if d not in SKIP_DIRS]
if not gitignore_matcher.has_negations:
if respect_gitignore:
active_matchers = [
matcher
for matcher in active_matchers
if root_path == matcher.base_dir or matcher.base_dir in root_path.parents
]
current_matcher = load_gitignore_matcher(root_path, matcher_cache)
if current_matcher is not None:
active_matchers.append(current_matcher)
dirs[:] = [
d
for d in dirs
if is_force_included(root_path / d, project_path, include_paths)
or not should_skip_dir(d)
]
if respect_gitignore and active_matchers:
dirs[:] = [
d
for d in dirs
if not gitignore_matcher.matches(root_path / d, project_path, is_dir=True)
if is_force_included(root_path / d, project_path, include_paths)
or not is_gitignored(root_path / d, active_matchers, is_dir=True)
]
for filename in filenames:
filepath = root_path / filename
if gitignore_matcher.matches(filepath, project_path, is_dir=False):
force_include = is_force_included(filepath, project_path, include_paths)
exact_force_include = is_exact_force_include(filepath, project_path, include_paths)
if not force_include and filename in SKIP_FILENAMES:
continue
if filepath.suffix.lower() in READABLE_EXTENSIONS:
# Skip config files
if filename in (
"mempalace.yaml",
"mempalace.yml",
"mempal.yaml",
"mempal.yml",
".gitignore",
"package-lock.json",
):
if filepath.suffix.lower() not in READABLE_EXTENSIONS and not exact_force_include:
continue
if respect_gitignore and active_matchers and not force_include:
if is_gitignored(filepath, active_matchers, is_dir=False):
continue
files.append(filepath)
files.append(filepath)
return files
@@ -446,6 +563,8 @@ def mine(
agent: str = "mempalace",
limit: int = 0,
dry_run: bool = False,
respect_gitignore: bool = True,
include_ignored: list = None,
):
"""Mine a project directory into the palace."""
@@ -455,7 +574,11 @@ def mine(
wing = wing_override or config["wing"]
rooms = config.get("rooms", [{"name": "general", "description": "All project files"}])
files = scan_project(project_dir)
files = scan_project(
project_dir,
respect_gitignore=respect_gitignore,
include_ignored=include_ignored,
)
if limit > 0:
files = files[:limit]
@@ -468,6 +591,10 @@ def mine(
print(f" Palace: {palace_path}")
if dry_run:
print(" DRY RUN — nothing will be filed")
if not respect_gitignore:
print(" .gitignore: DISABLED")
if include_ignored:
print(f" Include: {', '.join(sorted(normalize_include_paths(include_ignored)))}")
print(f"{'' * 55}\n")
if not dry_run:
+177 -51
View File
@@ -1,82 +1,208 @@
import os
import tempfile
import shutil
import yaml
import chromadb
import tempfile
from pathlib import Path
import chromadb
import yaml
from mempalace.miner import mine, scan_project
def write_file(path: Path, content: str):
    """Write ``content`` to ``path`` as UTF-8, creating missing parent directories."""
    parent_dir = path.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    path.write_text(content, encoding="utf-8")
def scanned_files(project_root: Path, **kwargs):
    """Run ``scan_project`` and return its hits as sorted project-relative POSIX paths.

    Extra keyword arguments are forwarded to ``scan_project`` unchanged.
    """
    relative_hits = [
        hit.relative_to(project_root).as_posix()
        for hit in scan_project(str(project_root), **kwargs)
    ]
    relative_hits.sort()
    return relative_hits
def test_project_mining():
tmpdir = tempfile.mkdtemp()
# Create a mini project
os.makedirs(os.path.join(tmpdir, "backend"))
with open(os.path.join(tmpdir, "backend", "app.py"), "w") as f:
f.write("def main():\n print('hello world')\n" * 20)
# Create config
with open(os.path.join(tmpdir, "mempalace.yaml"), "w") as f:
yaml.dump(
{
"wing": "test_project",
"rooms": [
{"name": "backend", "description": "Backend code"},
{"name": "general", "description": "General"},
],
},
f,
try:
project_root = Path(tmpdir).resolve()
os.makedirs(project_root / "backend")
write_file(
project_root / "backend" / "app.py", "def main():\n print('hello world')\n" * 20
)
with open(project_root / "mempalace.yaml", "w") as f:
yaml.dump(
{
"wing": "test_project",
"rooms": [
{"name": "backend", "description": "Backend code"},
{"name": "general", "description": "General"},
],
},
f,
)
palace_path = os.path.join(tmpdir, "palace")
mine(tmpdir, palace_path)
palace_path = project_root / "palace"
mine(str(project_root), str(palace_path))
# Verify
client = chromadb.PersistentClient(path=palace_path)
col = client.get_collection("mempalace_drawers")
assert col.count() > 0
shutil.rmtree(tmpdir)
client = chromadb.PersistentClient(path=str(palace_path))
col = client.get_collection("mempalace_drawers")
assert col.count() > 0
finally:
shutil.rmtree(tmpdir)
def test_scan_project_respects_gitignore():
tmpdir = tempfile.mkdtemp()
try:
project_root = Path(tmpdir).resolve()
os.makedirs(project_root / "src")
os.makedirs(project_root / "generated")
(project_root / ".gitignore").write_text("ignored.py\ngenerated/\n", encoding="utf-8")
(project_root / "src" / "app.py").write_text("print('hello')\n" * 20, encoding="utf-8")
(project_root / "ignored.py").write_text("print('ignore me')\n" * 20, encoding="utf-8")
(project_root / "generated" / "artifact.py").write_text(
"print('ignore this dir')\n" * 20,
encoding="utf-8",
)
write_file(project_root / ".gitignore", "ignored.py\ngenerated/\n")
write_file(project_root / "src" / "app.py", "print('hello')\n" * 20)
write_file(project_root / "ignored.py", "print('ignore me')\n" * 20)
write_file(project_root / "generated" / "artifact.py", "print('artifact')\n" * 20)
files = scan_project(str(project_root))
relative_files = sorted(path.relative_to(project_root).as_posix() for path in files)
assert relative_files == ["src/app.py"]
assert scanned_files(project_root) == ["src/app.py"]
finally:
shutil.rmtree(tmpdir)
def test_scan_project_handles_gitignore_negation():
def test_scan_project_respects_nested_gitignore():
tmpdir = tempfile.mkdtemp()
try:
project_root = Path(tmpdir).resolve()
os.makedirs(project_root / "generated")
(project_root / ".gitignore").write_text(
"generated/\n!generated/keep.py\n",
encoding="utf-8",
)
(project_root / "generated" / "drop.py").write_text("print('drop')\n" * 20, encoding="utf-8")
(project_root / "generated" / "keep.py").write_text("print('keep')\n" * 20, encoding="utf-8")
write_file(project_root / ".gitignore", "*.log\n")
write_file(project_root / "subrepo" / ".gitignore", "tasks/\n")
write_file(project_root / "subrepo" / "src" / "main.py", "print('main')\n" * 20)
write_file(project_root / "subrepo" / "tasks" / "task.py", "print('task')\n" * 20)
write_file(project_root / "subrepo" / "debug.log", "debug\n" * 20)
files = scan_project(str(project_root))
relative_files = sorted(path.relative_to(project_root).as_posix() for path in files)
assert relative_files == ["generated/keep.py"]
assert scanned_files(project_root) == ["subrepo/src/main.py"]
finally:
shutil.rmtree(tmpdir)
def test_scan_project_allows_nested_gitignore_override():
    """A nested ``!keep.csv`` rule is expected to re-include a file the root ``*.csv`` rule ignores."""
    workdir = tempfile.mkdtemp()
    try:
        root = Path(workdir).resolve()
        rows = "a,b,c\n" * 20

        write_file(root / ".gitignore", "*.csv\n")
        write_file(root / "subrepo" / ".gitignore", "!keep.csv\n")
        write_file(root / "drop.csv", rows)
        write_file(root / "subrepo" / "keep.csv", rows)

        found = scanned_files(root)
        assert found == ["subrepo/keep.csv"]
    finally:
        shutil.rmtree(workdir)
def test_scan_project_allows_gitignore_negation_when_parent_dir_is_visible():
    """With ``generated/*`` (contents ignored, directory not), ``!generated/keep.py`` should win."""
    workdir = tempfile.mkdtemp()
    try:
        root = Path(workdir).resolve()
        generated = root / "generated"

        write_file(root / ".gitignore", "generated/*\n!generated/keep.py\n")
        write_file(generated / "drop.py", "print('drop')\n" * 20)
        write_file(generated / "keep.py", "print('keep')\n" * 20)

        found = scanned_files(root)
        assert found == ["generated/keep.py"]
    finally:
        shutil.rmtree(workdir)
def test_scan_project_does_not_reinclude_file_from_ignored_directory():
    """When the directory itself is ignored (``generated/``), a negation inside it has no effect."""
    workdir = tempfile.mkdtemp()
    try:
        root = Path(workdir).resolve()
        generated = root / "generated"

        write_file(root / ".gitignore", "generated/\n!generated/keep.py\n")
        write_file(generated / "drop.py", "print('drop')\n" * 20)
        write_file(generated / "keep.py", "print('keep')\n" * 20)

        found = scanned_files(root)
        assert found == []
    finally:
        shutil.rmtree(workdir)
def test_scan_project_can_disable_gitignore():
    """``respect_gitignore=False`` should make the scan return files that .gitignore excludes."""
    workdir = tempfile.mkdtemp()
    try:
        root = Path(workdir).resolve()

        write_file(root / ".gitignore", "data/\n")
        write_file(root / "data" / "stuff.csv", "a,b,c\n" * 20)

        found = scanned_files(root, respect_gitignore=False)
        assert found == ["data/stuff.csv"]
    finally:
        shutil.rmtree(workdir)
def test_scan_project_can_include_ignored_directory():
    """Listing an ignored directory in ``include_ignored`` should bring its files back."""
    workdir = tempfile.mkdtemp()
    try:
        root = Path(workdir).resolve()

        write_file(root / ".gitignore", "docs/\n")
        write_file(root / "docs" / "guide.md", "# Guide\n" * 20)

        found = scanned_files(root, include_ignored=["docs"])
        assert found == ["docs/guide.md"]
    finally:
        shutil.rmtree(workdir)
def test_scan_project_can_include_specific_ignored_file():
    """Including one file inside an ignored directory should return only that file."""
    workdir = tempfile.mkdtemp()
    try:
        root = Path(workdir).resolve()
        generated = root / "generated"

        write_file(root / ".gitignore", "generated/\n")
        write_file(generated / "drop.py", "print('drop')\n" * 20)
        write_file(generated / "keep.py", "print('keep')\n" * 20)

        found = scanned_files(root, include_ignored=["generated/keep.py"])
        expected = ["generated/keep.py"]
        assert found == expected
    finally:
        shutil.rmtree(workdir)
def test_scan_project_can_include_exact_file_without_known_extension():
    """An exactly named include (``README``, no suffix) should be scanned despite being ignored."""
    workdir = tempfile.mkdtemp()
    try:
        root = Path(workdir).resolve()

        write_file(root / ".gitignore", "README\n")
        write_file(root / "README", "hello\n" * 20)

        found = scanned_files(root, include_ignored=["README"])
        assert found == ["README"]
    finally:
        shutil.rmtree(workdir)
def test_scan_project_include_override_beats_skip_dirs():
    """An explicit include should win over the built-in skip list (here ``.pytest_cache``)."""
    workdir = tempfile.mkdtemp()
    try:
        root = Path(workdir).resolve()

        write_file(root / ".pytest_cache" / "cache.py", "print('cache')\n" * 20)

        found = scanned_files(
            root,
            respect_gitignore=False,
            include_ignored=[".pytest_cache"],
        )
        expected = [".pytest_cache/cache.py"]
        assert found == expected
    finally:
        shutil.rmtree(workdir)
def test_scan_project_skip_dirs_still_apply_without_override():
    """Without an include override, ``.pytest_cache`` stays skipped even with gitignore disabled."""
    workdir = tempfile.mkdtemp()
    try:
        root = Path(workdir).resolve()

        write_file(root / ".pytest_cache" / "cache.py", "print('cache')\n" * 20)
        write_file(root / "main.py", "print('main')\n" * 20)

        found = scanned_files(root, respect_gitignore=False)
        assert found == ["main.py"]
    finally:
        shutil.rmtree(workdir)