feat(cli): durable session→mesh binding + cross-mesh send (1.36.0)
Some checks failed
CI / Lint (push) Has been cancelled
CI / Typecheck (push) Has been cancelled
CI / Broker tests (Postgres) (push) Has been cancelled
CI / Docker build (linux/amd64) (push) Has been cancelled

Fixes the 'live peer looks disconnected' class of bugs. Two layers:

ROOT CAUSE — involuntary mesh context loss:
The session→mesh binding lived only in the daemon's in-memory registry,
so a daemon restart (e.g. `daemon down && up`) wiped it. Every live
session then lost its mesh, and CLI commands fell back to an arbitrary
default mesh — a peer that never moved looked offline.

Fix: persist session bindings to ~/.claudemesh/daemon/sessions.json
(secret-free — keypairs reload from the per-session keypair store). On
boot the daemon rehydrates each binding whose pid is still alive (with a
start-time PID-reuse guard), reloads its keypair, re-signs a parent
attestation, and re-registers it — which reconnects its SessionBroker
WS. Restarts are now transparent; sessions keep their mesh.

DEFENSIVE LAYER — cross-mesh send resolution:
`send` without --mesh returned mesh_required when several meshes were joined;
a prefix given with --mesh X was resolved against the default mesh's roster,
not X's (only the full 64-char pubkey worked). Now a name/prefix is resolved
across all joined meshes (or scoped to --mesh): unique match auto-selects
its mesh, multi-mesh match asks for --mesh, none gives a clear error.
Kills mesh_required for peers on a non-default mesh and fixes P3.

Maps to field-report P1/P2/P3. P4 (shared member) left as-is (by design).
New: 5 persistence unit tests. Full suite 119/119. Daemon boot verified.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-06-02 20:38:37 +01:00
parent 71401b1d50
commit c747040e0d
6 changed files with 330 additions and 51 deletions

View File

@@ -4,7 +4,7 @@ import { DAEMON_PATHS } from "./paths.js";
import { acquireSingletonLock, releaseSingletonLock } from "./lock.js";
import { ensureLocalToken } from "./local-token.js";
import { startIpcServer } from "./ipc/server.js";
import { setRegistryHooks, startReaper, type SessionInfo } from "./session-registry.js";
import { setRegistryHooks, startReaper, registerSession, readPersistedSessions, setRegistryPersistence, type SessionInfo } from "./session-registry.js";
import { openSqlite, type SqliteDb } from "./db/sqlite.js";
import { migrateOutbox } from "./db/outbox.js";
import { migrateInbox } from "./db/inbox.js";
@@ -308,6 +308,81 @@ export async function runDaemon(opts: RunDaemonOptions = {}): Promise<number> {
startReaper();
// Rehydrate persisted session bindings (1.36.0). A daemon restart used
// to wipe the in-memory registry, so every live session lost its mesh
// context and CLI commands fell back to an arbitrary default mesh — a
// live peer then looked "disconnected" though nothing had moved. We now
// reload each persisted binding, validate the pid is still alive (with
// a start-time PID-reuse guard), reload its keypair from the per-session
// store, re-sign a fresh parent attestation, and re-register it — which
// fires onRegister and reconnects its SessionBrokerClient on the broker.
//
// The whole step is best-effort: any failure is logged to stderr as a
// JSON line and the daemon keeps booting.
try {
const persisted = readPersistedSessions(DAEMON_PATHS.SESSIONS_FILE);
if (persisted.length > 0) {
// The helpers are imported dynamically, so they are only loaded when
// there is at least one persisted session to process.
const { loadOrCreateSessionKeypair } = await import("~/services/session/keypair-store.js");
const { signParentAttestation } = await import("~/services/broker/session-hello-sig.js");
const { isPidAlive, getProcessStartTimes } = await import("./process-info.js");
// One batched start-time lookup for all persisted pids. If it rejects we
// fall back to an empty map, which makes the PID-reuse check below a
// no-op for every session (isPidAlive alone then gates revival).
const liveStartTimes = await getProcessStartTimes(persisted.map((p) => p.pid)).catch(() => new Map<number, string>());
// Number of sessions successfully re-registered; reported in the summary log.
let revived = 0;
for (const s of persisted) {
// Owning process is gone — nothing to revive.
if (!isPidAlive(s.pid)) continue;
// PID-reuse guard: skip only when BOTH a persisted and a live start time
// are known and they differ. A missing value on either side does not
// block revival.
if (s.startTime !== undefined) {
const live = liveStartTimes.get(s.pid);
if (live !== undefined && live !== s.startTime) continue; // PID reused
}
// meshConfigs is defined earlier in runDaemon, outside this excerpt.
// NOTE(review): presumably keyed by the same identifier stored in
// s.mesh — confirm against where the map is built.
const meshConfig = meshConfigs.get(s.mesh);
if (!meshConfig) continue; // mesh no longer joined
try {
// NOTE(review): the helper's name suggests it may create a new keypair
// when none is stored, which would give the session a different
// public key than before the restart — confirm this is acceptable.
const kp = await loadOrCreateSessionKeypair(meshConfig.slug, s.sessionId);
// Sign a new attestation with the mesh member's key for the session pubkey.
const att = await signParentAttestation({
parentMemberPubkey: meshConfig.pubkey,
parentSecretKey: meshConfig.secretKey,
sessionPubkey: kp.publicKey,
});
registerSession({
token: s.token,
sessionId: s.sessionId,
mesh: s.mesh,
displayName: s.displayName,
pid: s.pid,
// Optional fields are spread in only when truthy, so absent (or empty)
// values are omitted rather than passed as explicit undefined.
...(s.cwd ? { cwd: s.cwd } : {}),
...(s.role ? { role: s.role } : {}),
...(s.groups ? { groups: s.groups } : {}),
...(s.startTime ? { startTime: s.startTime } : {}),
presence: {
sessionPubkey: kp.publicKey,
sessionSecretKey: kp.secretKey,
// Copy exactly these four attestation fields rather than passing
// the signer's return value through wholesale.
parentAttestation: {
sessionPubkey: att.sessionPubkey,
parentMemberPubkey: att.parentMemberPubkey,
expiresAt: att.expiresAt,
signature: att.signature,
},
},
});
revived++;
} catch (err) {
// A failure for one session is logged and the loop moves on to the
// next. Only the first 8 characters of the token are written to the log.
process.stderr.write(JSON.stringify({
level: "warn", msg: "session_rehydrate_failed",
token: s.token.slice(0, 8), mesh: s.mesh, err: String(err),
ts: new Date().toISOString(),
}) + "\n");
}
}
// Summary line; emitted only when at least one binding was persisted.
process.stderr.write(JSON.stringify({
level: "info", msg: "sessions_rehydrated",
revived, persisted: persisted.length, ts: new Date().toISOString(),
}) + "\n");
}
} catch (err) {
// Catches anything thrown outside the per-session try (reading the
// persisted file, the dynamic imports, the liveness checks). Logged as
// a warning; boot continues without rehydrated sessions.
process.stderr.write(JSON.stringify({
level: "warn", msg: "session_rehydrate_scan_failed", err: String(err),
ts: new Date().toISOString(),
}) + "\n");
}
// Enable ongoing persistence now that rehydration has read the old file.
// This runs whether or not rehydration succeeded, since it sits after the
// try/catch above.
setRegistryPersistence(DAEMON_PATHS.SESSIONS_FILE);
const ipc = startIpcServer({
localToken,
tcpEnabled,