feat(cli): durable session→mesh binding + cross-mesh send (1.36.0)
Fixes the 'live peer looks disconnected' class of bugs, in two layers. ROOT CAUSE — involuntary mesh context loss: the session→mesh binding lived only in the daemon's in-memory registry, so a daemon restart (e.g. `daemon down && up`) wiped it. Every live session then lost its mesh, and CLI commands fell back to an arbitrary default mesh — a peer that never moved looked offline. Fix: persist session bindings to ~/.claudemesh/daemon/sessions.json (secret-free — keypairs reload from the per-session keypair store). On boot the daemon rehydrates each binding whose pid is still alive (with a start-time PID-reuse guard) and whose mesh is still joined: it reloads the keypair, re-signs a parent attestation, and re-registers the session — which reconnects its SessionBroker WS. Restarts are now transparent; sessions keep their mesh. DEFENSIVE LAYER — cross-mesh send resolution: `send` without --mesh returned mesh_required whenever several meshes were joined, and a name prefix given with --mesh X was resolved against the default mesh's roster rather than X's (only the full 64-char pubkey worked). Now a name/prefix is resolved across all joined meshes (or scoped to --mesh): a unique match auto-selects its mesh, a match in multiple meshes asks for --mesh, and no match gives a clear error. Kills mesh_required for peers on a non-default mesh and fixes P3. Maps to field-report P1/P2/P3. P4 (shared member) left as-is (by design). New: 5 persistence unit tests. Full suite 119/119. Daemon boot verified. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -4,7 +4,7 @@ import { DAEMON_PATHS } from "./paths.js";
|
||||
import { acquireSingletonLock, releaseSingletonLock } from "./lock.js";
|
||||
import { ensureLocalToken } from "./local-token.js";
|
||||
import { startIpcServer } from "./ipc/server.js";
|
||||
import { setRegistryHooks, startReaper, type SessionInfo } from "./session-registry.js";
|
||||
import { setRegistryHooks, startReaper, registerSession, readPersistedSessions, setRegistryPersistence, type SessionInfo } from "./session-registry.js";
|
||||
import { openSqlite, type SqliteDb } from "./db/sqlite.js";
|
||||
import { migrateOutbox } from "./db/outbox.js";
|
||||
import { migrateInbox } from "./db/inbox.js";
|
||||
@@ -308,6 +308,81 @@ export async function runDaemon(opts: RunDaemonOptions = {}): Promise<number> {
|
||||
|
||||
startReaper();
|
||||
|
||||
// Rehydrate persisted session bindings (1.36.0). A daemon restart used
|
||||
// to wipe the in-memory registry, so every live session lost its mesh
|
||||
// context and CLI commands fell back to an arbitrary default mesh — a
|
||||
// live peer then looked "disconnected" though nothing had moved. We now
|
||||
// reload each persisted binding, validate the pid is still alive (with
|
||||
// a start-time PID-reuse guard), reload its keypair from the per-session
|
||||
// store, re-sign a fresh parent attestation, and re-register it — which
|
||||
// fires onRegister and reconnects its SessionBrokerClient on the broker.
|
||||
try {
|
||||
const persisted = readPersistedSessions(DAEMON_PATHS.SESSIONS_FILE);
|
||||
if (persisted.length > 0) {
|
||||
const { loadOrCreateSessionKeypair } = await import("~/services/session/keypair-store.js");
|
||||
const { signParentAttestation } = await import("~/services/broker/session-hello-sig.js");
|
||||
const { isPidAlive, getProcessStartTimes } = await import("./process-info.js");
|
||||
const liveStartTimes = await getProcessStartTimes(persisted.map((p) => p.pid)).catch(() => new Map<number, string>());
|
||||
let revived = 0;
|
||||
for (const s of persisted) {
|
||||
if (!isPidAlive(s.pid)) continue;
|
||||
if (s.startTime !== undefined) {
|
||||
const live = liveStartTimes.get(s.pid);
|
||||
if (live !== undefined && live !== s.startTime) continue; // PID reused
|
||||
}
|
||||
const meshConfig = meshConfigs.get(s.mesh);
|
||||
if (!meshConfig) continue; // mesh no longer joined
|
||||
try {
|
||||
const kp = await loadOrCreateSessionKeypair(meshConfig.slug, s.sessionId);
|
||||
const att = await signParentAttestation({
|
||||
parentMemberPubkey: meshConfig.pubkey,
|
||||
parentSecretKey: meshConfig.secretKey,
|
||||
sessionPubkey: kp.publicKey,
|
||||
});
|
||||
registerSession({
|
||||
token: s.token,
|
||||
sessionId: s.sessionId,
|
||||
mesh: s.mesh,
|
||||
displayName: s.displayName,
|
||||
pid: s.pid,
|
||||
...(s.cwd ? { cwd: s.cwd } : {}),
|
||||
...(s.role ? { role: s.role } : {}),
|
||||
...(s.groups ? { groups: s.groups } : {}),
|
||||
...(s.startTime ? { startTime: s.startTime } : {}),
|
||||
presence: {
|
||||
sessionPubkey: kp.publicKey,
|
||||
sessionSecretKey: kp.secretKey,
|
||||
parentAttestation: {
|
||||
sessionPubkey: att.sessionPubkey,
|
||||
parentMemberPubkey: att.parentMemberPubkey,
|
||||
expiresAt: att.expiresAt,
|
||||
signature: att.signature,
|
||||
},
|
||||
},
|
||||
});
|
||||
revived++;
|
||||
} catch (err) {
|
||||
process.stderr.write(JSON.stringify({
|
||||
level: "warn", msg: "session_rehydrate_failed",
|
||||
token: s.token.slice(0, 8), mesh: s.mesh, err: String(err),
|
||||
ts: new Date().toISOString(),
|
||||
}) + "\n");
|
||||
}
|
||||
}
|
||||
process.stderr.write(JSON.stringify({
|
||||
level: "info", msg: "sessions_rehydrated",
|
||||
revived, persisted: persisted.length, ts: new Date().toISOString(),
|
||||
}) + "\n");
|
||||
}
|
||||
} catch (err) {
|
||||
process.stderr.write(JSON.stringify({
|
||||
level: "warn", msg: "session_rehydrate_scan_failed", err: String(err),
|
||||
ts: new Date().toISOString(),
|
||||
}) + "\n");
|
||||
}
|
||||
// Enable ongoing persistence now that rehydration has read the old file.
|
||||
setRegistryPersistence(DAEMON_PATHS.SESSIONS_FILE);
|
||||
|
||||
const ipc = startIpcServer({
|
||||
localToken,
|
||||
tcpEnabled,
|
||||
|
||||
Reference in New Issue
Block a user