fix(hooks): consolidate transcript ingest, harden shell parsers (#1231 review)

Address Copilot review on #1231:

1. Stop double-mining the transcript on the Python side. ``_get_mine_targets``
   now returns only the ``MEMPAL_DIR`` projects target — the convos target
   for the transcript dir is dropped because ``_ingest_transcript`` already
   handles it on every hook fire. The duplicate spawn was using
   ``sys.executable`` (vs ``_mempalace_python()``) and a different ``--wing``,
   so each Stop/PreCompact event was writing the same transcript into two
   wings under asymmetric interpreters and overwriting the single
   ``_MINE_PID_FILE`` lock.

2. ``_maybe_auto_ingest`` and ``_mine_sync`` now spawn via
   ``_mempalace_python()`` so the resolved interpreter matches the venv
   that owns mempalace (matters under GUI-launched harnesses where
   ``sys.executable`` may resolve to a system Python without chromadb).

3. Replace ``eval $(...)`` in both shell hooks with a ``mapfile``-based
   reader. Sanitized values are still emitted by the same Python parser,
   but the shell now does plain variable assignment instead of executing
   the parser's stdout — smaller blast radius if the sanitizer is ever
   bypassed.

4. Mirror ``_validate_transcript_path`` in the shell hooks via an
   ``is_valid_transcript_path`` helper — extension + traversal-segment
   rejection, parity with the Python validator. The convos mine in each
   shell hook is now gated on the validator instead of bare ``-f``.

5. Tighten the ``..`` traversal test that previously exercised the
   suffix gate by mistake (``../../etc/passwd`` lacks ``.json[l]``).
   Use ``.jsonl`` paths with traversal segments to actually hit the
   ``..`` rejection branch.

6. README: add a one-liner pointing at ``mempalace sweep`` for users
   who want per-message recall on top of the file-level chunks the
   hooks produce. The sweeper was undiscoverable previously.

Tests: 1418 passed, 1 skipped (full suite minus benchmarks).
This commit is contained in:
Igor Lins e Silva
2026-04-27 02:26:53 -03:00
parent eb4de04339
commit fe56797762
6 changed files with 261 additions and 132 deletions
+62
View File
@@ -61,3 +61,65 @@ class TestSaveHookAutoMines:
'MEMPAL_DIR defaults to "" which silently disables mining. '
"Either set a default path or add transcript-based mining."
)
class TestShellHookTranscriptValidation:
"""Both shell hooks must validate transcript paths before mining them.
Mirrors mempalace.hooks_cli._validate_transcript_path so unsafe paths
(no extension, traversal segments) are rejected at the shell layer
too — added in #1231 review (Copilot #7, #8).
"""
@staticmethod
def _hook_src(name: str) -> str:
path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "hooks", name)
return open(path).read()
@staticmethod
def _strip_comments(src: str) -> str:
return "\n".join(line for line in src.splitlines() if not line.lstrip().startswith("#"))
def test_save_hook_defines_and_uses_validator(self):
src = self._strip_comments(self._hook_src("mempal_save_hook.sh"))
assert "is_valid_transcript_path() {" in src, "validator function must be defined"
assert (
'is_valid_transcript_path "$TRANSCRIPT_PATH"' in src
), "validator must be invoked against TRANSCRIPT_PATH before mining"
def test_precompact_hook_defines_and_uses_validator(self):
src = self._strip_comments(self._hook_src("mempal_precompact_hook.sh"))
assert "is_valid_transcript_path() {" in src, "validator function must be defined"
assert (
'is_valid_transcript_path "$TRANSCRIPT_PATH"' in src
), "validator must be invoked against TRANSCRIPT_PATH before mining"
def test_validators_run_via_bash(self, tmp_path):
"""Source the validator out of each hook and exercise it directly."""
import subprocess
for name in ("mempal_save_hook.sh", "mempal_precompact_hook.sh"):
src = self._hook_src(name)
# Extract just the function definition (first occurrence).
start = src.index("is_valid_transcript_path() {")
end = src.index("\n}\n", start) + 2
func_src = src[start:end]
script = tmp_path / "v.sh"
script.write_text(
f"{func_src}\n" 'is_valid_transcript_path "$1" && echo OK || echo NO\n'
)
def run(arg: str) -> str:
return subprocess.run(
["bash", str(script), arg],
capture_output=True,
text=True,
check=False,
).stdout.strip()
assert run("/tmp/sessions/abc.jsonl") == "OK"
assert run("/tmp/sessions/abc.json") == "OK"
assert run("") == "NO"
assert run("/tmp/notes.txt") == "NO"
assert run("../etc/passwd.jsonl") == "NO"
assert run("/tmp/../etc/t.jsonl") == "NO"