"""Tests for mempalace.project_scanner.""" import json import subprocess from pathlib import Path from mempalace.project_scanner import ( PersonInfo, ProjectInfo, _dedupe_people, _is_bot, _looks_like_real_name, _merge_detected, _parse_cargo, _parse_gomod, _parse_package_json, _parse_pyproject, _UnionFind, discover_entities, find_git_repos, scan, to_detected_dict, ) # ── manifest parsers ──────────────────────────────────────────────────── def test_parse_package_json(tmp_path): f = tmp_path / "package.json" f.write_text(json.dumps({"name": "my-package", "version": "1.0.0"})) assert _parse_package_json(f) == "my-package" def test_parse_package_json_missing_name(tmp_path): f = tmp_path / "package.json" f.write_text(json.dumps({"version": "1.0.0"})) assert _parse_package_json(f) is None def test_parse_package_json_malformed(tmp_path): f = tmp_path / "package.json" f.write_text("{ not valid json") assert _parse_package_json(f) is None def test_parse_pyproject_pep621(tmp_path): f = tmp_path / "pyproject.toml" f.write_text('[project]\nname = "my-py-package"\n') assert _parse_pyproject(f) == "my-py-package" def test_parse_pyproject_poetry(tmp_path): f = tmp_path / "pyproject.toml" f.write_text('[tool.poetry]\nname = "poetry-pkg"\n') assert _parse_pyproject(f) == "poetry-pkg" def test_parse_cargo(tmp_path): f = tmp_path / "Cargo.toml" f.write_text('[package]\nname = "rust-crate"\nversion = "0.1.0"\n') assert _parse_cargo(f) == "rust-crate" def test_parse_gomod(tmp_path): f = tmp_path / "go.mod" f.write_text("module github.com/user/my-go-mod\n\ngo 1.21\n") assert _parse_gomod(f) == "my-go-mod" # ── bot filtering ─────────────────────────────────────────────────────── def test_is_bot_catches_github_actions(): assert _is_bot("github-actions[bot]", "41898282+github-actions[bot]@users.noreply.github.com") def test_is_bot_catches_dependabot(): assert _is_bot("dependabot[bot]", "dependabot@github.com") def test_is_bot_catches_pr_bot(): assert _is_bot("Comfy Org PR Bot", "prbot@example.com") def test_is_bot_does_not_flag_github_privacy_email(): # Real humans use ...@users.noreply.github.com when privacy is enabled. # Must NOT be filtered. assert not _is_bot("Igor Lins e Silva", "123456+igorls@users.noreply.github.com") def test_is_bot_does_not_flag_robot_person_name(): # "Robot" as a surname should not trigger the \bbot$ pattern # since \b requires a boundary before 'bot'. assert not _is_bot("Sarah Robot", "sarah@example.com") def test_looks_like_real_name_accepts_human(): assert _looks_like_real_name("Igor Lins e Silva") assert _looks_like_real_name("Jane Doe") def test_looks_like_real_name_rejects_handles(): assert not _looks_like_real_name("666ghj") assert not _looks_like_real_name("comfyanonymous") assert not _looks_like_real_name("bensig") assert not _looks_like_real_name("") assert not _looks_like_real_name("no_spaces_handle") # ── union-find dedup ──────────────────────────────────────────────────── def test_unionfind_merges_shared_email(): commits = [ ("Milla J", "shared@example.com", "repo1"), ("MSL", "shared@example.com", "repo1"), ("Milla J", "other@example.com", "repo1"), ] people = _dedupe_people(commits) # All three commits collapse into one "Milla J" person (MSL is filtered # as display name because it lacks a space but its commits still count). assert "Milla J" in people assert people["Milla J"].total_commits == 3 assert "MSL" not in people def test_unionfind_keeps_distinct_people_separate(): commits = [ ("Alice Example", "alice@example.com", "r"), ("Bob Sample", "bob@sample.org", "r"), ] people = _dedupe_people(commits) assert "Alice Example" in people assert "Bob Sample" in people def test_unionfind_merges_shared_name(): """Same display name, two different emails, same person.""" commits = [ ("Jane Doe", "jane@work.com", "r"), ("Jane Doe", "jane@personal.com", "r"), ] people = _dedupe_people(commits) assert people["Jane Doe"].total_commits == 2 assert len(people["Jane Doe"].emails) == 2 # ── project_info / person_info ───────────────────────────────────────── def test_project_info_confidence_is_mine(): p = ProjectInfo(name="x", repo_root=Path("."), is_mine=True) assert p.confidence == 0.99 def test_project_info_confidence_no_git(): p = ProjectInfo(name="x", repo_root=Path("."), has_git=False, manifest="package.json") assert p.confidence > 0.8 def test_person_info_signal_pluralization(): p = PersonInfo(name="x", total_commits=1, repos={"a"}) assert "1 commit across 1 repo" == p.to_signal() p2 = PersonInfo(name="y", total_commits=5, repos={"a", "b"}) assert "5 commits across 2 repos" == p2.to_signal() # ── find_git_repos ────────────────────────────────────────────────────── def test_find_git_repos_detects_root_repo(tmp_path): (tmp_path / ".git").mkdir() repos = find_git_repos(tmp_path) assert tmp_path in repos def test_find_git_repos_detects_nested(tmp_path): sub = tmp_path / "subproject" sub.mkdir() (sub / ".git").mkdir() repos = find_git_repos(tmp_path) assert sub in repos def test_find_git_repos_skips_nested_inside_repo(tmp_path): """If root is a repo and there's another repo inside it, the inner repo is NOT walked into (we stop at the first repo boundary when descending).""" (tmp_path / ".git").mkdir() deep = tmp_path / "a" / "b" / "nested-repo" deep.mkdir(parents=True) (deep / ".git").mkdir() repos = find_git_repos(tmp_path) # Root IS found; nested still discovered on its own branch (not inside root's .git) assert tmp_path in repos def test_find_git_repos_empty_dir(tmp_path): assert find_git_repos(tmp_path) == [] # ── scan ──────────────────────────────────────────────────────────────── def _init_git_repo(path: Path, name: str = "Jane Doe", email: str = "jane@example.com"): """Helper: init a git repo with one commit.""" subprocess.run(["git", "init", "-q"], cwd=path, check=True) subprocess.run(["git", "config", "user.name", name], cwd=path, check=True) subprocess.run(["git", "config", "user.email", email], cwd=path, check=True) subprocess.run(["git", "config", "commit.gpgsign", "false"], cwd=path, check=True) (path / "README.md").write_text("hello") subprocess.run(["git", "add", "README.md"], cwd=path, check=True) subprocess.run( ["git", "commit", "-q", "-m", "initial"], cwd=path, check=True, env={"GIT_COMMITTER_NAME": name, "GIT_COMMITTER_EMAIL": email, "PATH": "/usr/bin:/bin"}, ) def test_scan_project_from_package_json(tmp_path): (tmp_path / "package.json").write_text(json.dumps({"name": "my-app"})) _init_git_repo(tmp_path) projects, people = scan(tmp_path) assert len(projects) == 1 assert projects[0].name == "my-app" assert projects[0].is_mine is True def test_scan_project_from_pyproject(tmp_path): (tmp_path / "pyproject.toml").write_text('[project]\nname = "pyproj"\n') _init_git_repo(tmp_path) projects, _ = scan(tmp_path) assert any(p.name == "pyproj" for p in projects) def test_scan_fallback_to_dir_name_when_no_manifest(tmp_path): repo = tmp_path / "my-repo-name" repo.mkdir() _init_git_repo(repo) projects, _ = scan(tmp_path) assert any(p.name == "my-repo-name" for p in projects) def test_scan_manifest_only_no_git(tmp_path): """A dir with a manifest but no git still produces a project.""" (tmp_path / "package.json").write_text(json.dumps({"name": "manifest-only"})) projects, people = scan(tmp_path) assert len(projects) == 1 assert projects[0].name == "manifest-only" assert projects[0].has_git is False assert people == [] def test_scan_empty_dir(tmp_path): projects, people = scan(tmp_path) assert projects == [] assert people == [] def test_scan_returns_empty_for_nonexistent(tmp_path): missing = tmp_path / "does-not-exist" projects, people = scan(missing) assert projects == [] assert people == [] # ── to_detected_dict ──────────────────────────────────────────────────── def test_to_detected_dict_shape(): projects = [ProjectInfo(name="p", repo_root=Path("."), is_mine=True, manifest="package.json")] people = [PersonInfo(name="Jane Doe", total_commits=5, repos={"r"})] d = to_detected_dict(projects, people) assert set(d.keys()) == {"people", "projects", "uncertain"} assert d["projects"][0]["name"] == "p" assert d["projects"][0]["type"] == "project" assert d["people"][0]["name"] == "Jane Doe" assert d["people"][0]["type"] == "person" assert d["uncertain"] == [] # ── merge ─────────────────────────────────────────────────────────────── def test_merge_primary_wins_case_insensitive(): primary = { "people": [], "projects": [ { "name": "mempalace", "type": "project", "confidence": 0.99, "frequency": 10, "signals": ["pyproject.toml"], } ], "uncertain": [], } secondary = { "people": [], "projects": [], "uncertain": [ { "name": "MemPalace", "type": "uncertain", "confidence": 0.4, "frequency": 6, "signals": ["regex"], } ], } merged = _merge_detected(primary, secondary) # `MemPalace` (uncertain) is deduped against `mempalace` (project) case-insensitively assert len(merged["projects"]) == 1 assert len(merged["uncertain"]) == 0 def test_merge_drops_secondary_uncertain_when_requested(): primary = {"people": [], "projects": [], "uncertain": []} secondary = { "people": [], "projects": [], "uncertain": [ {"name": "Foo", "type": "uncertain", "confidence": 0.4, "frequency": 3, "signals": []} ], } merged = _merge_detected(primary, secondary, drop_secondary_uncertain=True) assert merged["uncertain"] == [] def test_merge_keeps_distinct_names(): primary = { "people": [ { "name": "Alice Smith", "type": "person", "confidence": 0.9, "frequency": 10, "signals": [], } ], "projects": [], "uncertain": [], } secondary = { "people": [ { "name": "Bob Jones", "type": "person", "confidence": 0.7, "frequency": 3, "signals": [], } ], "projects": [], "uncertain": [], } merged = _merge_detected(primary, secondary) assert len(merged["people"]) == 2 # ── discover_entities ────────────────────────────────────────────────── def test_discover_entities_falls_back_to_prose_when_no_git(tmp_path): """If no manifests or git, regex detector on prose is the only source.""" notes = tmp_path / "notes.md" notes.write_text( "Riley said hello. Riley asked about it. Riley laughed. " "Hey Riley, thanks for the help. Riley pushed the change. " "Riley decided to go." ) d = discover_entities(str(tmp_path)) # Prose-only fallback kicks in — Riley appears with person signals all_names = [e["name"] for cat in d.values() for e in cat] assert "Riley" in all_names def test_discover_entities_prefers_real_signal_over_prose(tmp_path): """When manifest exists, its name wins even if prose has noisy candidates.""" (tmp_path / "package.json").write_text(json.dumps({"name": "realproj"})) _init_git_repo(tmp_path) (tmp_path / "doc.md").write_text( "Something. Another. Whatever. Context. Context. Context. Context. " "realproj. realproj. realproj. realproj." ) d = discover_entities(str(tmp_path)) proj_names = [e["name"] for e in d["projects"]] assert "realproj" in proj_names # ── _UnionFind basics ────────────────────────────────────────────────── def test_unionfind_find_creates_singleton(): uf = _UnionFind() assert uf.find("x") == "x" def test_unionfind_union_merges(): uf = _UnionFind() uf.union("a", "b") assert uf.find("a") == uf.find("b") def test_unionfind_transitive(): uf = _UnionFind() uf.union("a", "b") uf.union("b", "c") assert uf.find("a") == uf.find("c")