feat(cli): durable session→mesh binding + cross-mesh send (1.36.0)
Some checks failed
CI / Lint (push) Has been cancelled
CI / Typecheck (push) Has been cancelled
CI / Broker tests (Postgres) (push) Has been cancelled
CI / Docker build (linux/amd64) (push) Has been cancelled

Fixes the 'live peer looks disconnected' class of bugs. Two layers:

ROOT CAUSE — involuntary mesh context loss:
The session→mesh binding lived only in the daemon's in-memory registry,
so a daemon restart (e.g. `daemon down && up`) wiped it. Every live
session then lost its mesh, and CLI commands fell back to an arbitrary
default mesh — a peer that never moved looked offline.

Fix: persist session bindings to ~/.claudemesh/daemon/sessions.json
(secret-free — keypairs reload from the per-session keypair store). On
boot the daemon rehydrates each binding whose pid is still alive (with a
start-time PID-reuse guard), reloads its keypair, re-signs a parent
attestation, and re-registers it — which reconnects its SessionBroker
WS. Restarts are now transparent; sessions keep their mesh.

DEFENSIVE LAYER — cross-mesh send resolution:
`send` without --mesh and several joined meshes returned mesh_required;
a prefix under --mesh X resolved against the default mesh's roster, not
X's (only the full 64-char pubkey worked). Now a name/prefix is resolved
across all joined meshes (or scoped to --mesh): unique match auto-selects
its mesh, multi-mesh match asks for --mesh, none gives a clear error.
Kills mesh_required for peers on a non-default mesh and fixes P3.

Maps to field-report P1/P2/P3. P4 (shared member) left as-is (by design).
New: 5 persistence unit tests. Full suite 119/119. Daemon boot verified.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-06-02 20:38:37 +01:00
parent 71401b1d50
commit c747040e0d
6 changed files with 330 additions and 51 deletions

View File

@@ -1,6 +1,6 @@
{
"name": "claudemesh-cli",
"version": "1.35.1",
"version": "1.36.0",
"description": "Peer mesh for Claude Code sessions — CLI + MCP server.",
"keywords": [
"claude-code",

View File

@@ -40,63 +40,93 @@ export async function runSend(flags: SendFlags, to: string, message: string): Pr
: flags.priority === "low" ? "low"
: "next";
// Resolve which mesh to use. With --mesh, target it directly.
// Without, use first joined mesh — same default as withMesh.
// Resolve which mesh to use. With --mesh, target it directly. Without,
// use the only joined mesh, else leave null and let target resolution
// below discover the right mesh from where the peer actually lives.
const config = readConfig();
const meshSlug =
let meshSlug =
flags.mesh ??
(config.meshes.length === 1 ? config.meshes[0]!.slug : null);
// 1.31.6: hex-prefix resolution. If `to` looks like hex but isn't a
// full 64-char pubkey, resolve it against the peer list and replace
// it with the matching full pubkey. The broker stores `targetSpec`
// verbatim and the drain query at apps/broker/src/broker.ts:2408
// matches only on full pubkeys, so a 16-hex prefix would queue
// successfully but never fetch — sender saw "sent", recipient saw
// nothing. Resolving here makes the CLI's prefix UX work end-to-end
// and surfaces ambiguous / unmatched prefixes with a clear error
// instead of a silent drop.
if (
!to.startsWith("@") &&
!to.startsWith("#") &&
to !== "*" &&
/^[0-9a-f]{4,63}$/i.test(to)
) {
try {
const { tryListPeersViaDaemon } = await import("~/services/bridge/daemon-route.js");
const peers = (await tryListPeersViaDaemon()) ?? [];
const lower = to.toLowerCase();
const matches = peers.filter((p) => {
const pk = (p as { pubkey?: string }).pubkey ?? "";
const mpk = (p as { memberPubkey?: string }).memberPubkey ?? "";
return pk.toLowerCase().startsWith(lower) || mpk.toLowerCase().startsWith(lower);
});
if (matches.length === 0) {
render.err(`No peer matches hex prefix "${to}".`);
const names = peers
.map((p) => (p as { displayName?: string }).displayName)
.filter(Boolean)
.join(", ");
if (names) render.hint(`online: ${names}`);
process.exit(1);
// Cross-mesh target resolution (1.36.0). A direct send to a hex prefix
// or display name is resolved against the peer rosters so the CLI:
// - expands a prefix/name to the full session pubkey (the broker's
// drain matches only full pubkeys — a bare prefix would queue but
// never fetch: sender saw "sent", recipient saw nothing);
// - DISCOVERS which joined mesh the target is on when no --mesh was
// given and several meshes are joined. Previously this returned
// `mesh_required` and a live peer on a non-default mesh looked
// "disconnected". We now scan every joined mesh's roster and, if
// the target resolves in exactly one, auto-select that mesh.
// With --mesh (or a single joined mesh) the scan is scoped to that one
// mesh, so `send --mesh X <prefix>` resolves against X's roster — not
// the default mesh (the bug where only the full 64-char pubkey worked).
const isDirect = !to.startsWith("@") && !to.startsWith("#") && to !== "*";
const isFullPubkey = /^[0-9a-f]{64}$/i.test(to);
const isPrefix = /^[0-9a-f]{4,63}$/i.test(to);
const isName = isDirect && !isFullPubkey && !isPrefix;
if (isDirect && (isPrefix || isName || (isFullPubkey && !meshSlug))) {
const { tryListPeersViaDaemon } = await import("~/services/bridge/daemon-route.js");
const searchSlugs = meshSlug ? [meshSlug] : config.meshes.map((m) => m.slug);
const lower = to.toLowerCase();
let daemonReachable = false;
type Hit = { slug: string; pubkey: string; displayName: string };
const matches: Hit[] = [];
for (const slug of searchSlugs) {
const peers = await tryListPeersViaDaemon(slug);
if (peers === null) continue; // daemon unreachable for this query
daemonReachable = true;
for (const p of peers) {
const pk = ((p as { pubkey?: string }).pubkey ?? "").toLowerCase();
const mpk = ((p as { memberPubkey?: string }).memberPubkey ?? "").toLowerCase();
const dn = (p as { displayName?: string }).displayName ?? "?";
const hit = isName
? dn.toLowerCase() === lower
: pk.startsWith(lower) || mpk.startsWith(lower);
if (hit) matches.push({ slug, pubkey: (p as { pubkey?: string }).pubkey ?? "", displayName: dn });
}
if (matches.length > 1) {
const candidates = matches
.map((p) => {
const pk = (p as { pubkey?: string }).pubkey ?? "";
const dn = (p as { displayName?: string }).displayName ?? "?";
return `${dn} ${pk.slice(0, 16)}`;
})
}
// Only act on a reachable daemon. If it was down for every query, fall
// through to the cold path, which opens its own WS and resolves names.
if (daemonReachable) {
const byPubkey = new Map<string, Hit>();
for (const m of matches) if (!byPubkey.has(m.pubkey)) byPubkey.set(m.pubkey, m);
const uniq = [...byPubkey.values()];
const meshesHit = [...new Set(uniq.map((m) => m.slug))];
if (uniq.length === 0) {
// For a full pubkey we couldn't locate, keep going — the user gave
// a complete key and the daemon send will surface a clear error.
if (!isFullPubkey) {
render.err(`No peer matches "${to}"${flags.mesh ? ` on mesh "${flags.mesh}"` : " on any joined mesh"}.`);
render.hint("Check `claudemesh peer list` (add --mesh <slug> to scope).");
process.exit(1);
}
} else if (uniq.length > 1) {
if (meshesHit.length > 1 && !meshSlug) {
// Target lives on several meshes — disambiguate by mesh, not prefix.
const where = uniq
.map((m) => `${m.displayName} ${m.pubkey.slice(0, 12)}… @${m.slug}`)
.join(", ");
render.err(`"${to}" matches peers on ${meshesHit.length} meshes — pick one with --mesh <slug>.`);
render.hint(`candidates: ${where}`);
process.exit(1);
}
const candidates = uniq
.map((m) => `${m.displayName} ${m.pubkey.slice(0, 16)}`)
.join(", ");
render.err(`Ambiguous hex prefix "${to}" — matches ${matches.length} peers.`);
render.err(`Ambiguous ${isName ? "name" : "prefix"} "${to}" — matches ${uniq.length} peers.`);
render.hint(`candidates: ${candidates}`);
render.hint("Use a longer prefix or paste the full 64-char pubkey.");
process.exit(1);
} else {
// Exactly one match — adopt its mesh (P1: kills mesh_required for
// peers on a non-default mesh) and its full pubkey (prefix/name).
meshSlug = uniq[0]!.slug;
if (!isFullPubkey) to = uniq[0]!.pubkey;
}
to = (matches[0] as { pubkey?: string }).pubkey ?? to;
} catch {
// Daemon unreachable — fall through; cold path will try a name
// lookup and surface its own error if that also fails.
}
}
@@ -206,8 +236,10 @@ export async function runSend(flags: SendFlags, to: string, message: string): Pr
// was removed in 1.28.0.
}
// Cold path — open our own WS, encrypt locally, fire envelope.
await withMesh({ meshSlug: flags.mesh ?? null }, async (client) => {
// Cold path — open our own WS, encrypt locally, fire envelope. Use the
// resolved meshSlug (may have been discovered above) so a name/prefix
// that lives on a non-default mesh still targets the right one.
await withMesh({ meshSlug: meshSlug ?? flags.mesh ?? null }, async (client) => {
let targetSpec = to;
if (to.startsWith("#") && !/^#[0-9a-z_-]{20,}$/i.test(to)) {
// Topic by name → resolve to "#<topicId>" via topicList. The broker

View File

@@ -31,6 +31,11 @@ export const DAEMON_PATHS = {
get OUTBOX_DB() { return join(this.DAEMON_DIR, "outbox.db"); },
get INBOX_DB() { return join(this.DAEMON_DIR, "inbox.db"); },
get LOG_FILE() { return join(this.DAEMON_DIR, "daemon.log"); },
/** Persisted session→mesh bindings. Rehydrated on daemon restart so a
* restart never orphans a live session's mesh context (the bug where
* a peer looked "disconnected" after the daemon bounced). Holds no
* secrets — keypairs are reloaded from the per-session keypair store. */
get SESSIONS_FILE() { return join(this.DAEMON_DIR, "sessions.json"); },
} as const;
export const DAEMON_TCP_HOST = "127.0.0.1";

View File

@@ -4,7 +4,7 @@ import { DAEMON_PATHS } from "./paths.js";
import { acquireSingletonLock, releaseSingletonLock } from "./lock.js";
import { ensureLocalToken } from "./local-token.js";
import { startIpcServer } from "./ipc/server.js";
import { setRegistryHooks, startReaper, type SessionInfo } from "./session-registry.js";
import { setRegistryHooks, startReaper, registerSession, readPersistedSessions, setRegistryPersistence, type SessionInfo } from "./session-registry.js";
import { openSqlite, type SqliteDb } from "./db/sqlite.js";
import { migrateOutbox } from "./db/outbox.js";
import { migrateInbox } from "./db/inbox.js";
@@ -308,6 +308,81 @@ export async function runDaemon(opts: RunDaemonOptions = {}): Promise<number> {
startReaper();
// Rehydrate persisted session bindings (1.36.0). A daemon restart used
// to wipe the in-memory registry, so every live session lost its mesh
// context and CLI commands fell back to an arbitrary default mesh — a
// live peer then looked "disconnected" though nothing had moved. We now
// reload each persisted binding, validate the pid is still alive (with
// a start-time PID-reuse guard), reload its keypair from the per-session
// store, re-sign a fresh parent attestation, and re-register it — which
// fires onRegister and reconnects its SessionBrokerClient on the broker.
try {
const persisted = readPersistedSessions(DAEMON_PATHS.SESSIONS_FILE);
if (persisted.length > 0) {
const { loadOrCreateSessionKeypair } = await import("~/services/session/keypair-store.js");
const { signParentAttestation } = await import("~/services/broker/session-hello-sig.js");
const { isPidAlive, getProcessStartTimes } = await import("./process-info.js");
const liveStartTimes = await getProcessStartTimes(persisted.map((p) => p.pid)).catch(() => new Map<number, string>());
let revived = 0;
for (const s of persisted) {
if (!isPidAlive(s.pid)) continue;
if (s.startTime !== undefined) {
const live = liveStartTimes.get(s.pid);
if (live !== undefined && live !== s.startTime) continue; // PID reused
}
const meshConfig = meshConfigs.get(s.mesh);
if (!meshConfig) continue; // mesh no longer joined
try {
const kp = await loadOrCreateSessionKeypair(meshConfig.slug, s.sessionId);
const att = await signParentAttestation({
parentMemberPubkey: meshConfig.pubkey,
parentSecretKey: meshConfig.secretKey,
sessionPubkey: kp.publicKey,
});
registerSession({
token: s.token,
sessionId: s.sessionId,
mesh: s.mesh,
displayName: s.displayName,
pid: s.pid,
...(s.cwd ? { cwd: s.cwd } : {}),
...(s.role ? { role: s.role } : {}),
...(s.groups ? { groups: s.groups } : {}),
...(s.startTime ? { startTime: s.startTime } : {}),
presence: {
sessionPubkey: kp.publicKey,
sessionSecretKey: kp.secretKey,
parentAttestation: {
sessionPubkey: att.sessionPubkey,
parentMemberPubkey: att.parentMemberPubkey,
expiresAt: att.expiresAt,
signature: att.signature,
},
},
});
revived++;
} catch (err) {
process.stderr.write(JSON.stringify({
level: "warn", msg: "session_rehydrate_failed",
token: s.token.slice(0, 8), mesh: s.mesh, err: String(err),
ts: new Date().toISOString(),
}) + "\n");
}
}
process.stderr.write(JSON.stringify({
level: "info", msg: "sessions_rehydrated",
revived, persisted: persisted.length, ts: new Date().toISOString(),
}) + "\n");
}
} catch (err) {
process.stderr.write(JSON.stringify({
level: "warn", msg: "session_rehydrate_scan_failed", err: String(err),
ts: new Date().toISOString(),
}) + "\n");
}
// Enable ongoing persistence now that rehydration has read the old file.
setRegistryPersistence(DAEMON_PATHS.SESSIONS_FILE);
const ipc = startIpcServer({
localToken,
tcpEnabled,

View File

@@ -22,6 +22,10 @@
* session have no token to begin with.
*/
import { existsSync, mkdirSync, readFileSync, renameSync, writeFileSync } from "node:fs";
import { dirname } from "node:path";
import { randomBytes } from "node:crypto";
import { getProcessStartTime, getProcessStartTimes, isPidAlive } from "./process-info.js";
/**
@@ -83,6 +87,65 @@ const hooks: RegistryHooks = {};
let reaperHandle: NodeJS.Timeout | null = null;
/** When set, registry mutations are mirrored to this file so a daemon
* restart can rehydrate live sessions. Holds NO secret material — the
* session keypair is reloaded from the per-session keypair store on
* rehydrate. null (default) disables persistence, which keeps unit
* tests from touching disk unless they opt in. */
let persistPath: string | null = null;
/** Slim, secret-free projection persisted to disk. */
export interface PersistedSession {
token: string;
sessionId: string;
mesh: string;
displayName: string;
pid: number;
cwd?: string;
role?: string;
groups?: string[];
startTime?: string;
registeredAt: number;
}
function toPersisted(info: SessionInfo): PersistedSession {
// Drop `presence` (carries the session secret key) — never to disk here.
const { presence: _presence, ...rest } = info;
return rest;
}
/** Enable on-disk persistence of session bindings (called at daemon boot
* with DAEMON_PATHS.SESSIONS_FILE). Pass null to disable. */
export function setRegistryPersistence(path: string | null): void {
persistPath = path;
}
function persist(): void {
if (!persistPath) return;
try {
mkdirSync(dirname(persistPath), { recursive: true, mode: 0o700 });
const rows = [...byToken.values()].map(toPersisted);
const tmp = `${persistPath}.${randomBytes(6).toString("hex")}.tmp`;
writeFileSync(tmp, JSON.stringify({ version: 1, sessions: rows }), { mode: 0o600 });
renameSync(tmp, persistPath);
} catch {
// Best-effort: a persistence failure must never throttle the registry.
}
}
/** Read persisted session bindings from disk (pure — no registration, no
* liveness check). Returns [] when the file is absent or unreadable.
* The daemon's boot rehydration validates liveness and re-registers. */
export function readPersistedSessions(path: string): PersistedSession[] {
try {
if (!existsSync(path)) return [];
const parsed = JSON.parse(readFileSync(path, "utf8")) as { sessions?: PersistedSession[] };
return Array.isArray(parsed.sessions) ? parsed.sessions : [];
} catch {
return [];
}
}
export function startReaper(): void {
if (reaperHandle) return;
// The sweep is async (batched ps) — wrap in `void` so setInterval
@@ -125,6 +188,7 @@ export function registerSession(info: Omit<SessionInfo, "registeredAt">): Sessio
const stored: SessionInfo = { ...info, registeredAt: Date.now() };
byToken.set(info.token, stored);
bySessionId.set(info.sessionId, info.token);
persist();
try { hooks.onRegister?.(stored); } catch { /* see above */ }
if (stored.startTime === undefined) {
void captureStartTimeAsync(info.token, info.pid);
@@ -138,6 +202,7 @@ async function captureStartTimeAsync(token: string, pid: number): Promise<void>
const entry = byToken.get(token);
if (!entry || entry.pid !== pid) return; // entry was replaced; skip
entry.startTime = lstart;
persist(); // capture start-time on disk so restart can PID-reuse-guard
}
export function deregisterByToken(token: string): boolean {
@@ -145,6 +210,7 @@ export function deregisterByToken(token: string): boolean {
if (!entry) return false;
byToken.delete(token);
if (bySessionId.get(entry.sessionId) === token) bySessionId.delete(entry.sessionId);
persist();
try { hooks.onDeregister?.(entry); } catch { /* see above */ }
return true;
}
@@ -217,4 +283,5 @@ export function _resetRegistry(): void {
bySessionId.clear();
hooks.onRegister = undefined;
hooks.onDeregister = undefined;
persistPath = null;
}

View File

@@ -0,0 +1,100 @@
/**
* Session-registry persistence (1.36.0) — durable session→mesh bindings.
*
* A daemon restart used to wipe the in-memory registry, orphaning every
* live session's mesh context. Persistence lets the daemon rehydrate on
* boot. Verifies:
* - register writes a slim record to disk; readPersistedSessions reads it;
* - the session SECRET KEY is never written to disk;
* - deregister removes the record;
* - persistence is off by default (no disk writes until enabled).
*/
import { existsSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from "node:fs";
import { tmpdir } from "node:os";
import { join } from "node:path";
import { afterEach, beforeEach, describe, expect, test } from "vitest";
import {
_resetRegistry,
deregisterByToken,
readPersistedSessions,
registerSession,
setRegistryPersistence,
} from "../../src/daemon/session-registry.js";
const SECRET = "b".repeat(128);
const PRESENCE = {
sessionPubkey: "a".repeat(64),
sessionSecretKey: SECRET,
parentAttestation: {
sessionPubkey: "a".repeat(64),
parentMemberPubkey: "c".repeat(64),
expiresAt: 9_999_999_999,
signature: "d".repeat(128),
},
};
let dir: string;
let file: string;
beforeEach(() => {
_resetRegistry();
dir = mkdtempSync(join(tmpdir(), "cm-reg-"));
file = join(dir, "sessions.json");
});
afterEach(() => {
_resetRegistry();
rmSync(dir, { recursive: true, force: true });
});
describe("registry persistence", () => {
test("off by default — no disk writes until enabled", () => {
registerSession({ token: "t1", sessionId: "s1", mesh: "flexicar", displayName: "a", pid: process.pid, startTime: "x" });
expect(existsSync(file)).toBe(false);
});
test("register persists a slim record; readPersistedSessions round-trips", () => {
setRegistryPersistence(file);
registerSession({
token: "t1", sessionId: "11111111-2222-3333-4444-555555555555",
mesh: "flexicar", displayName: "intra-back", pid: process.pid,
cwd: "/tmp/x", role: "dev", startTime: "x", presence: PRESENCE,
});
const rows = readPersistedSessions(file);
expect(rows).toHaveLength(1);
expect(rows[0]).toMatchObject({
token: "t1", mesh: "flexicar", displayName: "intra-back", cwd: "/tmp/x", role: "dev",
});
});
test("session secret key is NEVER written to disk", () => {
setRegistryPersistence(file);
registerSession({ token: "t1", sessionId: "s1", mesh: "flexicar", displayName: "a", pid: process.pid, startTime: "x", presence: PRESENCE });
const raw = readFileSync(file, "utf8");
expect(raw).not.toContain(SECRET);
expect(raw).not.toContain("sessionSecretKey");
expect(raw).not.toContain("parentAttestation");
// And the parsed record carries no presence material.
expect(readPersistedSessions(file)[0]).not.toHaveProperty("presence");
});
test("deregister removes the record from disk", () => {
setRegistryPersistence(file);
registerSession({ token: "t1", sessionId: "s1", mesh: "flexicar", displayName: "a", pid: process.pid, startTime: "x" });
registerSession({ token: "t2", sessionId: "s2", mesh: "nedas", displayName: "b", pid: process.pid, startTime: "x" });
expect(readPersistedSessions(file)).toHaveLength(2);
deregisterByToken("t1");
const rows = readPersistedSessions(file);
expect(rows).toHaveLength(1);
expect(rows[0]!.token).toBe("t2");
});
test("readPersistedSessions tolerates a missing/corrupt file", () => {
expect(readPersistedSessions(join(dir, "nope.json"))).toEqual([]);
const bad = join(dir, "bad.json");
writeFileSync(bad, "{not json");
expect(readPersistedSessions(bad)).toEqual([]);
});
});