diff --git a/apps/cli/package.json b/apps/cli/package.json index b05ea02..2e2d0f6 100644 --- a/apps/cli/package.json +++ b/apps/cli/package.json @@ -1,6 +1,6 @@ { "name": "claudemesh-cli", - "version": "1.35.1", + "version": "1.36.0", "description": "Peer mesh for Claude Code sessions — CLI + MCP server.", "keywords": [ "claude-code", diff --git a/apps/cli/src/commands/send.ts b/apps/cli/src/commands/send.ts index 9e51492..e2aa262 100644 --- a/apps/cli/src/commands/send.ts +++ b/apps/cli/src/commands/send.ts @@ -40,63 +40,93 @@ export async function runSend(flags: SendFlags, to: string, message: string): Pr : flags.priority === "low" ? "low" : "next"; - // Resolve which mesh to use. With --mesh, target it directly. - // Without, use first joined mesh — same default as withMesh. + // Resolve which mesh to use. With --mesh, target it directly. Without, + // use the only joined mesh, else leave null and let target resolution + // below discover the right mesh from where the peer actually lives. const config = readConfig(); - const meshSlug = + let meshSlug = flags.mesh ?? (config.meshes.length === 1 ? config.meshes[0]!.slug : null); - // 1.31.6: hex-prefix resolution. If `to` looks like hex but isn't a - // full 64-char pubkey, resolve it against the peer list and replace - // it with the matching full pubkey. The broker stores `targetSpec` - // verbatim and the drain query at apps/broker/src/broker.ts:2408 - // matches only on full pubkeys, so a 16-hex prefix would queue - // successfully but never fetch — sender saw "sent", recipient saw - // nothing. Resolving here makes the CLI's prefix UX work end-to-end - // and surfaces ambiguous / unmatched prefixes with a clear error - // instead of a silent drop. - if ( - !to.startsWith("@") && - !to.startsWith("#") && - to !== "*" && - /^[0-9a-f]{4,63}$/i.test(to) - ) { - try { - const { tryListPeersViaDaemon } = await import("~/services/bridge/daemon-route.js"); - const peers = (await tryListPeersViaDaemon()) ?? []; - const lower = to.toLowerCase(); - const matches = peers.filter((p) => { - const pk = (p as { pubkey?: string }).pubkey ?? ""; - const mpk = (p as { memberPubkey?: string }).memberPubkey ?? ""; - return pk.toLowerCase().startsWith(lower) || mpk.toLowerCase().startsWith(lower); - }); - if (matches.length === 0) { - render.err(`No peer matches hex prefix "${to}".`); - const names = peers - .map((p) => (p as { displayName?: string }).displayName) - .filter(Boolean) - .join(", "); - if (names) render.hint(`online: ${names}`); - process.exit(1); + // Cross-mesh target resolution (1.36.0). A direct send to a hex prefix + // or display name is resolved against the peer rosters so the CLI: + // - expands a prefix/name to the full session pubkey (the broker's + // drain matches only full pubkeys — a bare prefix would queue but + // never fetch: sender saw "sent", recipient saw nothing); + // - DISCOVERS which joined mesh the target is on when no --mesh was + // given and several meshes are joined. Previously this returned + // `mesh_required` and a live peer on a non-default mesh looked + // "disconnected". We now scan every joined mesh's roster and, if + // the target resolves in exactly one, auto-select that mesh. + // With --mesh (or a single joined mesh) the scan is scoped to that one + // mesh, so `send --mesh X ` resolves against X's roster — not + // the default mesh (the bug where only the full 64-char pubkey worked). + const isDirect = !to.startsWith("@") && !to.startsWith("#") && to !== "*"; + const isFullPubkey = /^[0-9a-f]{64}$/i.test(to); + const isPrefix = /^[0-9a-f]{4,63}$/i.test(to); + const isName = isDirect && !isFullPubkey && !isPrefix; + + if (isDirect && (isPrefix || isName || (isFullPubkey && !meshSlug))) { + const { tryListPeersViaDaemon } = await import("~/services/bridge/daemon-route.js"); + const searchSlugs = meshSlug ? [meshSlug] : config.meshes.map((m) => m.slug); + const lower = to.toLowerCase(); + let daemonReachable = false; + type Hit = { slug: string; pubkey: string; displayName: string }; + const matches: Hit[] = []; + for (const slug of searchSlugs) { + const peers = await tryListPeersViaDaemon(slug); + if (peers === null) continue; // daemon unreachable for this query + daemonReachable = true; + for (const p of peers) { + const pk = ((p as { pubkey?: string }).pubkey ?? "").toLowerCase(); + const mpk = ((p as { memberPubkey?: string }).memberPubkey ?? "").toLowerCase(); + const dn = (p as { displayName?: string }).displayName ?? "?"; + const hit = isName + ? dn.toLowerCase() === lower + : pk.startsWith(lower) || mpk.startsWith(lower); + if (hit) matches.push({ slug, pubkey: (p as { pubkey?: string }).pubkey ?? "", displayName: dn }); } - if (matches.length > 1) { - const candidates = matches - .map((p) => { - const pk = (p as { pubkey?: string }).pubkey ?? ""; - const dn = (p as { displayName?: string }).displayName ?? "?"; - return `${dn} ${pk.slice(0, 16)}…`; - }) + } + + // Only act on a reachable daemon. If it was down for every query, fall + // through to the cold path, which opens its own WS and resolves names. + if (daemonReachable) { + const byPubkey = new Map(); + for (const m of matches) if (!byPubkey.has(m.pubkey)) byPubkey.set(m.pubkey, m); + const uniq = [...byPubkey.values()]; + const meshesHit = [...new Set(uniq.map((m) => m.slug))]; + + if (uniq.length === 0) { + // For a full pubkey we couldn't locate, keep going — the user gave + // a complete key and the daemon send will surface a clear error. + if (!isFullPubkey) { + render.err(`No peer matches "${to}"${flags.mesh ? ` on mesh "${flags.mesh}"` : " on any joined mesh"}.`); + render.hint("Check `claudemesh peer list` (add --mesh to scope)."); + process.exit(1); + } + } else if (uniq.length > 1) { + if (meshesHit.length > 1 && !meshSlug) { + // Target lives on several meshes — disambiguate by mesh, not prefix. + const where = uniq + .map((m) => `${m.displayName} ${m.pubkey.slice(0, 12)}… @${m.slug}`) + .join(", "); + render.err(`"${to}" matches peers on ${meshesHit.length} meshes — pick one with --mesh .`); + render.hint(`candidates: ${where}`); + process.exit(1); + } + const candidates = uniq + .map((m) => `${m.displayName} ${m.pubkey.slice(0, 16)}…`) .join(", "); - render.err(`Ambiguous hex prefix "${to}" — matches ${matches.length} peers.`); + render.err(`Ambiguous ${isName ? "name" : "prefix"} "${to}" — matches ${uniq.length} peers.`); render.hint(`candidates: ${candidates}`); render.hint("Use a longer prefix or paste the full 64-char pubkey."); process.exit(1); + } else { + // Exactly one match — adopt its mesh (P1: kills mesh_required for + // peers on a non-default mesh) and its full pubkey (prefix/name). + meshSlug = uniq[0]!.slug; + if (!isFullPubkey) to = uniq[0]!.pubkey; } - to = (matches[0] as { pubkey?: string }).pubkey ?? to; - } catch { - // Daemon unreachable — fall through; cold path will try a name - // lookup and surface its own error if that also fails. } } @@ -206,8 +236,10 @@ export async function runSend(flags: SendFlags, to: string, message: string): Pr // was removed in 1.28.0. } - // Cold path — open our own WS, encrypt locally, fire envelope. - await withMesh({ meshSlug: flags.mesh ?? null }, async (client) => { + // Cold path — open our own WS, encrypt locally, fire envelope. Use the + // resolved meshSlug (may have been discovered above) so a name/prefix + // that lives on a non-default mesh still targets the right one. + await withMesh({ meshSlug: meshSlug ?? flags.mesh ?? null }, async (client) => { let targetSpec = to; if (to.startsWith("#") && !/^#[0-9a-z_-]{20,}$/i.test(to)) { // Topic by name → resolve to "#" via topicList. The broker diff --git a/apps/cli/src/daemon/paths.ts b/apps/cli/src/daemon/paths.ts index f40d918..2a56a61 100644 --- a/apps/cli/src/daemon/paths.ts +++ b/apps/cli/src/daemon/paths.ts @@ -31,6 +31,11 @@ export const DAEMON_PATHS = { get OUTBOX_DB() { return join(this.DAEMON_DIR, "outbox.db"); }, get INBOX_DB() { return join(this.DAEMON_DIR, "inbox.db"); }, get LOG_FILE() { return join(this.DAEMON_DIR, "daemon.log"); }, + /** Persisted session→mesh bindings. Rehydrated on daemon restart so a + * restart never orphans a live session's mesh context (the bug where + * a peer looked "disconnected" after the daemon bounced). Holds no + * secrets — keypairs are reloaded from the per-session keypair store. */ + get SESSIONS_FILE() { return join(this.DAEMON_DIR, "sessions.json"); }, } as const; export const DAEMON_TCP_HOST = "127.0.0.1"; diff --git a/apps/cli/src/daemon/run.ts b/apps/cli/src/daemon/run.ts index 47cb8cc..c4aa776 100644 --- a/apps/cli/src/daemon/run.ts +++ b/apps/cli/src/daemon/run.ts @@ -4,7 +4,7 @@ import { DAEMON_PATHS } from "./paths.js"; import { acquireSingletonLock, releaseSingletonLock } from "./lock.js"; import { ensureLocalToken } from "./local-token.js"; import { startIpcServer } from "./ipc/server.js"; -import { setRegistryHooks, startReaper, type SessionInfo } from "./session-registry.js"; +import { setRegistryHooks, startReaper, registerSession, readPersistedSessions, setRegistryPersistence, type SessionInfo } from "./session-registry.js"; import { openSqlite, type SqliteDb } from "./db/sqlite.js"; import { migrateOutbox } from "./db/outbox.js"; import { migrateInbox } from "./db/inbox.js"; @@ -308,6 +308,81 @@ export async function runDaemon(opts: RunDaemonOptions = {}): Promise { startReaper(); + // Rehydrate persisted session bindings (1.36.0). A daemon restart used + // to wipe the in-memory registry, so every live session lost its mesh + // context and CLI commands fell back to an arbitrary default mesh — a + // live peer then looked "disconnected" though nothing had moved. We now + // reload each persisted binding, validate the pid is still alive (with + // a start-time PID-reuse guard), reload its keypair from the per-session + // store, re-sign a fresh parent attestation, and re-register it — which + // fires onRegister and reconnects its SessionBrokerClient on the broker. + try { + const persisted = readPersistedSessions(DAEMON_PATHS.SESSIONS_FILE); + if (persisted.length > 0) { + const { loadOrCreateSessionKeypair } = await import("~/services/session/keypair-store.js"); + const { signParentAttestation } = await import("~/services/broker/session-hello-sig.js"); + const { isPidAlive, getProcessStartTimes } = await import("./process-info.js"); + const liveStartTimes = await getProcessStartTimes(persisted.map((p) => p.pid)).catch(() => new Map()); + let revived = 0; + for (const s of persisted) { + if (!isPidAlive(s.pid)) continue; + if (s.startTime !== undefined) { + const live = liveStartTimes.get(s.pid); + if (live !== undefined && live !== s.startTime) continue; // PID reused + } + const meshConfig = meshConfigs.get(s.mesh); + if (!meshConfig) continue; // mesh no longer joined + try { + const kp = await loadOrCreateSessionKeypair(meshConfig.slug, s.sessionId); + const att = await signParentAttestation({ + parentMemberPubkey: meshConfig.pubkey, + parentSecretKey: meshConfig.secretKey, + sessionPubkey: kp.publicKey, + }); + registerSession({ + token: s.token, + sessionId: s.sessionId, + mesh: s.mesh, + displayName: s.displayName, + pid: s.pid, + ...(s.cwd ? { cwd: s.cwd } : {}), + ...(s.role ? { role: s.role } : {}), + ...(s.groups ? { groups: s.groups } : {}), + ...(s.startTime ? { startTime: s.startTime } : {}), + presence: { + sessionPubkey: kp.publicKey, + sessionSecretKey: kp.secretKey, + parentAttestation: { + sessionPubkey: att.sessionPubkey, + parentMemberPubkey: att.parentMemberPubkey, + expiresAt: att.expiresAt, + signature: att.signature, + }, + }, + }); + revived++; + } catch (err) { + process.stderr.write(JSON.stringify({ + level: "warn", msg: "session_rehydrate_failed", + token: s.token.slice(0, 8), mesh: s.mesh, err: String(err), + ts: new Date().toISOString(), + }) + "\n"); + } + } + process.stderr.write(JSON.stringify({ + level: "info", msg: "sessions_rehydrated", + revived, persisted: persisted.length, ts: new Date().toISOString(), + }) + "\n"); + } + } catch (err) { + process.stderr.write(JSON.stringify({ + level: "warn", msg: "session_rehydrate_scan_failed", err: String(err), + ts: new Date().toISOString(), + }) + "\n"); + } + // Enable ongoing persistence now that rehydration has read the old file. + setRegistryPersistence(DAEMON_PATHS.SESSIONS_FILE); + const ipc = startIpcServer({ localToken, tcpEnabled, diff --git a/apps/cli/src/daemon/session-registry.ts b/apps/cli/src/daemon/session-registry.ts index d79c7a8..39a1d68 100644 --- a/apps/cli/src/daemon/session-registry.ts +++ b/apps/cli/src/daemon/session-registry.ts @@ -22,6 +22,10 @@ * session have no token to begin with. */ +import { existsSync, mkdirSync, readFileSync, renameSync, writeFileSync } from "node:fs"; +import { dirname } from "node:path"; +import { randomBytes } from "node:crypto"; + import { getProcessStartTime, getProcessStartTimes, isPidAlive } from "./process-info.js"; /** @@ -83,6 +87,65 @@ const hooks: RegistryHooks = {}; let reaperHandle: NodeJS.Timeout | null = null; +/** When set, registry mutations are mirrored to this file so a daemon + * restart can rehydrate live sessions. Holds NO secret material — the + * session keypair is reloaded from the per-session keypair store on + * rehydrate. null (default) disables persistence, which keeps unit + * tests from touching disk unless they opt in. */ +let persistPath: string | null = null; + +/** Slim, secret-free projection persisted to disk. */ +export interface PersistedSession { + token: string; + sessionId: string; + mesh: string; + displayName: string; + pid: number; + cwd?: string; + role?: string; + groups?: string[]; + startTime?: string; + registeredAt: number; +} + +function toPersisted(info: SessionInfo): PersistedSession { + // Drop `presence` (carries the session secret key) — never to disk here. + const { presence: _presence, ...rest } = info; + return rest; +} + +/** Enable on-disk persistence of session bindings (called at daemon boot + * with DAEMON_PATHS.SESSIONS_FILE). Pass null to disable. */ +export function setRegistryPersistence(path: string | null): void { + persistPath = path; +} + +function persist(): void { + if (!persistPath) return; + try { + mkdirSync(dirname(persistPath), { recursive: true, mode: 0o700 }); + const rows = [...byToken.values()].map(toPersisted); + const tmp = `${persistPath}.${randomBytes(6).toString("hex")}.tmp`; + writeFileSync(tmp, JSON.stringify({ version: 1, sessions: rows }), { mode: 0o600 }); + renameSync(tmp, persistPath); + } catch { + // Best-effort: a persistence failure must never throttle the registry. + } +} + +/** Read persisted session bindings from disk (pure — no registration, no + * liveness check). Returns [] when the file is absent or unreadable. + * The daemon's boot rehydration validates liveness and re-registers. */ +export function readPersistedSessions(path: string): PersistedSession[] { + try { + if (!existsSync(path)) return []; + const parsed = JSON.parse(readFileSync(path, "utf8")) as { sessions?: PersistedSession[] }; + return Array.isArray(parsed.sessions) ? parsed.sessions : []; + } catch { + return []; + } +} + export function startReaper(): void { if (reaperHandle) return; // The sweep is async (batched ps) — wrap in `void` so setInterval @@ -125,6 +188,7 @@ export function registerSession(info: Omit): Sessio const stored: SessionInfo = { ...info, registeredAt: Date.now() }; byToken.set(info.token, stored); bySessionId.set(info.sessionId, info.token); + persist(); try { hooks.onRegister?.(stored); } catch { /* see above */ } if (stored.startTime === undefined) { void captureStartTimeAsync(info.token, info.pid); @@ -138,6 +202,7 @@ async function captureStartTimeAsync(token: string, pid: number): Promise const entry = byToken.get(token); if (!entry || entry.pid !== pid) return; // entry was replaced; skip entry.startTime = lstart; + persist(); // capture start-time on disk so restart can PID-reuse-guard } export function deregisterByToken(token: string): boolean { @@ -145,6 +210,7 @@ export function deregisterByToken(token: string): boolean { if (!entry) return false; byToken.delete(token); if (bySessionId.get(entry.sessionId) === token) bySessionId.delete(entry.sessionId); + persist(); try { hooks.onDeregister?.(entry); } catch { /* see above */ } return true; } @@ -217,4 +283,5 @@ export function _resetRegistry(): void { bySessionId.clear(); hooks.onRegister = undefined; hooks.onDeregister = undefined; + persistPath = null; } diff --git a/apps/cli/tests/unit/session-registry-persist.test.ts b/apps/cli/tests/unit/session-registry-persist.test.ts new file mode 100644 index 0000000..5bc59b1 --- /dev/null +++ b/apps/cli/tests/unit/session-registry-persist.test.ts @@ -0,0 +1,100 @@ +/** + * Session-registry persistence (1.36.0) — durable session→mesh bindings. + * + * A daemon restart used to wipe the in-memory registry, orphaning every + * live session's mesh context. Persistence lets the daemon rehydrate on + * boot. Verifies: + * - register writes a slim record to disk; readPersistedSessions reads it; + * - the session SECRET KEY is never written to disk; + * - deregister removes the record; + * - persistence is off by default (no disk writes until enabled). + */ + +import { existsSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { afterEach, beforeEach, describe, expect, test } from "vitest"; + +import { + _resetRegistry, + deregisterByToken, + readPersistedSessions, + registerSession, + setRegistryPersistence, +} from "../../src/daemon/session-registry.js"; + +const SECRET = "b".repeat(128); +const PRESENCE = { + sessionPubkey: "a".repeat(64), + sessionSecretKey: SECRET, + parentAttestation: { + sessionPubkey: "a".repeat(64), + parentMemberPubkey: "c".repeat(64), + expiresAt: 9_999_999_999, + signature: "d".repeat(128), + }, +}; + +let dir: string; +let file: string; + +beforeEach(() => { + _resetRegistry(); + dir = mkdtempSync(join(tmpdir(), "cm-reg-")); + file = join(dir, "sessions.json"); +}); + +afterEach(() => { + _resetRegistry(); + rmSync(dir, { recursive: true, force: true }); +}); + +describe("registry persistence", () => { + test("off by default — no disk writes until enabled", () => { + registerSession({ token: "t1", sessionId: "s1", mesh: "flexicar", displayName: "a", pid: process.pid, startTime: "x" }); + expect(existsSync(file)).toBe(false); + }); + + test("register persists a slim record; readPersistedSessions round-trips", () => { + setRegistryPersistence(file); + registerSession({ + token: "t1", sessionId: "11111111-2222-3333-4444-555555555555", + mesh: "flexicar", displayName: "intra-back", pid: process.pid, + cwd: "/tmp/x", role: "dev", startTime: "x", presence: PRESENCE, + }); + const rows = readPersistedSessions(file); + expect(rows).toHaveLength(1); + expect(rows[0]).toMatchObject({ + token: "t1", mesh: "flexicar", displayName: "intra-back", cwd: "/tmp/x", role: "dev", + }); + }); + + test("session secret key is NEVER written to disk", () => { + setRegistryPersistence(file); + registerSession({ token: "t1", sessionId: "s1", mesh: "flexicar", displayName: "a", pid: process.pid, startTime: "x", presence: PRESENCE }); + const raw = readFileSync(file, "utf8"); + expect(raw).not.toContain(SECRET); + expect(raw).not.toContain("sessionSecretKey"); + expect(raw).not.toContain("parentAttestation"); + // And the parsed record carries no presence material. + expect(readPersistedSessions(file)[0]).not.toHaveProperty("presence"); + }); + + test("deregister removes the record from disk", () => { + setRegistryPersistence(file); + registerSession({ token: "t1", sessionId: "s1", mesh: "flexicar", displayName: "a", pid: process.pid, startTime: "x" }); + registerSession({ token: "t2", sessionId: "s2", mesh: "nedas", displayName: "b", pid: process.pid, startTime: "x" }); + expect(readPersistedSessions(file)).toHaveLength(2); + deregisterByToken("t1"); + const rows = readPersistedSessions(file); + expect(rows).toHaveLength(1); + expect(rows[0]!.token).toBe("t2"); + }); + + test("readPersistedSessions tolerates a missing/corrupt file", () => { + expect(readPersistedSessions(join(dir, "nope.json"))).toEqual([]); + const bad = join(dir, "bad.json"); + writeFileSync(bad, "{not json"); + expect(readPersistedSessions(bad)).toEqual([]); + }); +});