Skip to content

Commit 40495a3

Browse files
authored
Extract Vault sweep move planning (#28)
1 parent 7d3b9a6 commit 40495a3

4 files changed

Lines changed: 194 additions & 59 deletions

File tree

CONTEXT.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,14 @@ lives at `modules/pathfinder/app/pf_archive_import_execution.py`. The compatibil
153153
`modules/pathfinder/app/pf_archive_import.py` builds the plan and delegates execution.
154154
_Avoid_: live importer helper, Cartosia writer, report wrapper.
155155

156+
**Vault Sweep Plan**:
157+
The Sentinel Core behavior that describes Vault sweep move intent before dry-run reporting or live
158+
mutation: noise trash targets, misplaced-topic relocation targets, duplicate trash targets, and
159+
their reportable reasons. The code module lives at `sentinel-core/app/services/vault_sweep_plan.py`.
160+
`sentinel-core/app/services/vault_sweeper.py` owns orchestration, locks, safety probes, embedding,
161+
and live Vault I/O; it should use sweep plans instead of rebuilding proposed-move dictionaries inline.
162+
_Avoid_: sweep proposal dict helper, dry-run move formatter, vault move decision.
163+
156164
**Session**:
157165
One user message + one Sentinel response. Bounded by a single `POST /message` request.
158166

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
"""Side-effect-free move planning for Vault sweeps."""
2+
3+
from __future__ import annotations
4+
5+
from dataclasses import dataclass
6+
from typing import Literal
7+
8+
9+
@dataclass(frozen=True)
class SweepMovePlan:
    """Immutable record of one intended Vault move.

    A plan only describes the move; building one performs no I/O. Callers
    turn it into a report entry with :meth:`asdict`.
    """

    # "trash" for moves into the ``_trash/`` area, "topic" for relocations
    # into a topic directory.
    kind: Literal["trash", "topic"]
    # Path the note currently lives at.
    src: str
    # Path the note would be moved to.
    dst: str
    # Human-readable explanation of why the move is proposed.
    reason: str

    def asdict(self) -> dict:
        """Return the plan as a plain dict keyed ``kind``/``src``/``dst``/``reason``."""
        return dict(
            kind=self.kind,
            src=self.src,
            dst=self.dst,
            reason=self.reason,
        )
25+
26+
27+
def is_in_topic_dir(path: str, topic_dir: str) -> bool:
    """Report whether ``path`` already sits in the same family as ``topic_dir``.

    Only the first path segment of ``topic_dir`` is compared, so
    ``journal/2026-04-27/foo.md`` counts as in-dir for every ``journal/...``
    topic_dir rather than just the matching day. Journal entries are not
    relocated between days; only a wrong-topic placement is flagged.

    An empty ``topic_dir`` never matches.
    """
    if topic_dir:
        # Everything before the first "/" names the topic family.
        top_level, _, _ = topic_dir.partition("/")
        return path.startswith(f"{top_level}/")
    return False
39+
40+
41+
def propose_topic_move(
    src_path: str, topic: str, *, today: str | None = None
) -> str | None:
    """Return the destination path a topic move would use.

    Returns ``None`` when ``topic_dir_for`` yields no directory for ``topic``
    or when ``src_path`` is already inside that directory's family (see
    ``is_in_topic_dir``). Otherwise the result is the topic directory joined
    with the final path segment of ``src_path``.
    """
    # Imported at call time, as in the original module layout.
    from app.services.note_classifier import topic_dir_for

    target_dir = topic_dir_for(topic, today=today)
    if not target_dir or is_in_topic_dir(src_path, target_dir):
        return None
    _, _, basename = src_path.rpartition("/")
    return f"{target_dir}/{basename}"
54+
55+
56+
def plan_noise_trash(src_path: str, *, today: str) -> SweepMovePlan:
    """Plan moving a noise note into the dated trash folder.

    The destination is ``_trash/{today}/`` plus the final path segment of
    ``src_path``; the reason is always ``cheap-filter:noise``.
    """
    _, _, basename = src_path.rpartition("/")
    trash_path = f"_trash/{today}/{basename}"
    return SweepMovePlan(
        kind="trash",
        src=src_path,
        dst=trash_path,
        reason="cheap-filter:noise",
    )
64+
65+
66+
def plan_topic_move(
    src_path: str,
    topic: str,
    *,
    confidence: float,
    today: str | None = None,
) -> SweepMovePlan | None:
    """Plan relocating a note into the directory for ``topic``.

    Returns ``None`` when ``propose_topic_move`` finds no destination.
    Otherwise the plan's reason records the topic and the confidence
    formatted to two decimals.
    """
    destination = propose_topic_move(src_path, topic, today=today)
    if destination is not None:
        why = f"topic={topic} (confidence={confidence:.2f})"
        return SweepMovePlan(kind="topic", src=src_path, dst=destination, reason=why)
    return None
82+
83+
84+
def plan_duplicate_trash(
    src_path: str,
    keeper_path: str,
    *,
    confidence: float,
    today: str,
    threshold: float = 0.92,
) -> SweepMovePlan:
    """Plan moving a duplicate note into the dated trash folder.

    Args:
        src_path: Path of the duplicate note to be trashed.
        keeper_path: Path of the note that is kept; named in the reason.
        confidence: Confidence value shown in the reason, one decimal place.
        today: Date segment used for the ``_trash/{today}/`` destination.
        threshold: Cosine-similarity cut-off quoted in the reason. Defaults
            to ``0.92``, which reproduces the previously hard-coded text.

    Returns:
        A ``trash`` plan whose destination is ``_trash/{today}/`` plus the
        final path segment of ``src_path``.
    """
    filename = src_path.rsplit("/", 1)[-1]
    # ``:g`` prints the default as "0.92" and does not round other values
    # to two decimals, so the reason stays truthful if the cut-off changes.
    reason = (
        f"duplicate of {keeper_path} "
        f"(cosine≥{threshold:g}, conf={confidence:.1f})"
    )
    return SweepMovePlan(
        kind="trash",
        src=src_path,
        dst=f"_trash/{today}/{filename}",
        reason=reason,
    )

sentinel-core/app/services/vault_sweeper.py

Lines changed: 32 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,13 @@
2828
decode_index_body as _decode_index_body,
2929
encode_index_body as _encode_index_body,
3030
)
31+
from app.services.vault_sweep_plan import (
32+
is_in_topic_dir,
33+
plan_duplicate_trash,
34+
plan_noise_trash,
35+
plan_topic_move,
36+
propose_topic_move,
37+
)
3138
from app.time_utils import _iso_utc, _today_str
3239
from sentinel_shared.embedding_codec import decode_embedding, encode_embedding
3340
from sentinel_shared.similarity import cosine_similarity, find_dup_clusters
@@ -50,8 +57,10 @@
5057
"_encode_index_body",
5158
"cosine_similarity",
5259
"find_dup_clusters",
60+
"is_in_topic_dir",
5361
"split_frontmatter",
5462
"join_frontmatter",
63+
"propose_topic_move",
5564
]
5665

5766
logger = logging.getLogger(__name__)
@@ -178,44 +187,8 @@ async def walk_vault(client, root: str = "") -> AsyncIterator[str]:
178187
# stays here; I/O goes through the injected ``vault``.
179188

180189

181-
# --- Topic-folder move (misplaced note → correct topic dir) ---
182-
183-
184-
def is_in_topic_dir(path: str, topic_dir: str) -> bool:
185-
"""True when ``path`` is already within ``topic_dir``.
186-
187-
Handles the journal nested-date case: ``journal/2026-04-27/foo.md`` is
188-
considered in-dir for any ``journal/...`` topic_dir, not just exact
189-
same-day match. The sweeper does not relocate journal entries between
190-
days — only flags a wrong-topic placement.
191-
"""
192-
if not topic_dir:
193-
return False
194-
# Same dir or any subdirectory of topic_dir's root family.
195-
# For journal/2026-04-27, the family root is "journal/"; so journal/.../
196-
# is considered "in topic_dir family".
197-
family_root = topic_dir.split("/", 1)[0] + "/"
198-
return path.startswith(family_root)
199-
200-
201-
def propose_topic_move(
202-
src_path: str, topic: str, *, today: str | None = None
203-
) -> str | None:
204-
"""Return the destination path a topic-move WOULD use, or None if no
205-
move is needed (already in topic family) or topic has no canonical dir.
206-
207-
Used by ``run_sweep(dry_run=True)`` to populate ``proposed_moves``
208-
without touching the vault.
209-
"""
210-
from app.services.note_classifier import topic_dir_for
211-
212-
topic_dir = topic_dir_for(topic, today=today)
213-
if not topic_dir:
214-
return None
215-
if is_in_topic_dir(src_path, topic_dir):
216-
return None
217-
filename = src_path.rsplit("/", 1)[-1]
218-
return f"{topic_dir}/{filename}"
190+
# Topic-folder move planning lives in app.services.vault_sweep_plan and is
191+
# imported/re-exported here for backwards compatibility with existing callers.
219192

220193

221194
# --- Lockfile (migrated to ObsidianVault.acquire_sweep_lock / release_sweep_lock) ---
@@ -481,12 +454,9 @@ async def _is_safe() -> bool:
481454
if getattr(result, "topic", None) == "noise":
482455
if dry_run:
483456
today = _today_str()
484-
report.proposed_moves.append({
485-
"kind": "trash",
486-
"src": path,
487-
"dst": f"_trash/{today}/{path.rsplit('/', 1)[-1]}",
488-
"reason": "cheap-filter:noise",
489-
})
457+
report.proposed_moves.append(
458+
plan_noise_trash(path, today=today).asdict()
459+
)
490460
report.noise_moved += 1
491461
else:
492462
# MANDATORY per-move safety check (re-evaluated here, not once per run)
@@ -509,17 +479,19 @@ async def _is_safe() -> bool:
509479
# directory and the current path isn't already in that family,
510480
# move (or propose to move) the note to {topic_dir}/{filename}.
511481
topic = getattr(result, "topic", None)
512-
proposed_dst = (
513-
propose_topic_move(path, topic) if topic else None
482+
topic_plan = (
483+
plan_topic_move(
484+
path,
485+
topic,
486+
confidence=float(result.confidence),
487+
)
488+
if topic
489+
else None
514490
)
491+
proposed_dst = topic_plan.dst if topic_plan is not None else None
515492
if proposed_dst is not None:
516493
if dry_run:
517-
report.proposed_moves.append({
518-
"kind": "topic",
519-
"src": path,
520-
"dst": proposed_dst,
521-
"reason": f"topic={topic} (confidence={result.confidence:.2f})",
522-
})
494+
report.proposed_moves.append(topic_plan.asdict())
523495
# 260427-cza: parity with the live `else` branch below
524496
# which increments topic_moves. Without this, dry-run
525497
# reports `topic_moves: 0` while listing N proposals.
@@ -664,13 +636,14 @@ async def _is_safe() -> bool:
664636
keeper_path = survivors[keeper_idx][0]
665637
if dry_run:
666638
today = _today_str()
667-
proposed = f"_trash/{today}/{src.rsplit('/', 1)[-1]}"
668-
report.proposed_moves.append({
669-
"kind": "trash",
670-
"src": src,
671-
"dst": proposed,
672-
"reason": f"duplicate of {keeper_path} (cosine≥0.92, conf={conf:.1f})",
673-
})
639+
report.proposed_moves.append(
640+
plan_duplicate_trash(
641+
src,
642+
keeper_path,
643+
confidence=conf,
644+
today=today,
645+
).asdict()
646+
)
674647
report.duplicates_moved += 1
675648
continue
676649
# MANDATORY per-move safety check (re-evaluated before each dedup-trash)
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
from __future__ import annotations
2+
3+
from app.services.vault_sweep_plan import (
4+
is_in_topic_dir,
5+
plan_duplicate_trash,
6+
plan_noise_trash,
7+
plan_topic_move,
8+
propose_topic_move,
9+
)
10+
11+
12+
def test_plan_noise_trash_matches_dry_run_report_shape():
    """A noise plan serializes to the dict shape used for dry-run report entries."""
    expected = {
        "kind": "trash",
        "src": "stale/hello.md",
        "dst": "_trash/2026-06-16/hello.md",
        "reason": "cheap-filter:noise",
    }

    actual = plan_noise_trash("stale/hello.md", today="2026-06-16").asdict()

    assert actual == expected
21+
22+
23+
def test_plan_topic_move_skips_existing_topic_family():
    """Notes already inside the topic's family yield no move at any layer."""
    # A journal entry from another day is still inside the journal family.
    assert is_in_topic_dir("journal/2026-06-16/a.md", "journal/2026-06-17")
    assert propose_topic_move("accomplishments/a.md", "accomplishment") is None
    # The planner named in this test must skip too, not only its helpers.
    assert (
        plan_topic_move(
            "accomplishments/a.md",
            "accomplishment",
            confidence=0.95,
        )
        is None
    )
26+
27+
28+
def test_plan_topic_move_describes_destination_and_reason():
    """A misplaced note gets a topic plan with destination and a two-decimal confidence."""
    expected = {
        "kind": "topic",
        "src": "random/a.md",
        "dst": "accomplishments/a.md",
        "reason": "topic=accomplishment (confidence=0.95)",
    }

    plan = plan_topic_move("random/a.md", "accomplishment", confidence=0.954)

    assert plan is not None
    assert plan.asdict() == expected
42+
43+
44+
def test_plan_duplicate_trash_matches_dry_run_report_shape():
    """A duplicate plan serializes to the dict shape used for dry-run report entries."""
    expected = {
        "kind": "trash",
        "src": "references/short.md",
        "dst": "_trash/2026-06-16/short.md",
        "reason": "duplicate of references/long.md (cosine≥0.92, conf=0.9)",
    }

    actual = plan_duplicate_trash(
        "references/short.md",
        "references/long.md",
        confidence=0.87,
        today="2026-06-16",
    ).asdict()

    assert actual == expected

0 commit comments

Comments
 (0)