Files
claudemesh/apps/cli/src/daemon/run.ts
Alejandro Gutiérrez c747040e0d
Some checks failed
CI / Lint (push) Has been cancelled
CI / Typecheck (push) Has been cancelled
CI / Broker tests (Postgres) (push) Has been cancelled
CI / Docker build (linux/amd64) (push) Has been cancelled
feat(cli): durable session→mesh binding + cross-mesh send (1.36.0)
Fixes the 'live peer looks disconnected' class of bugs. Two layers:

ROOT CAUSE — involuntary mesh context loss:
The session→mesh binding lived only in the daemon's in-memory registry,
so a daemon restart (e.g. `daemon down && up`) wiped it. Every live
session then lost its mesh, and CLI commands fell back to an arbitrary
default mesh — a peer that never moved looked offline.

Fix: persist session bindings to ~/.claudemesh/daemon/sessions.json
(secret-free — keypairs reload from the per-session keypair store). On
boot the daemon rehydrates each binding whose pid is still alive (with a
start-time PID-reuse guard), reloads its keypair, re-signs a parent
attestation, and re-registers it — which reconnects its SessionBroker
WS. Restarts are now transparent; sessions keep their mesh.

DEFENSIVE LAYER — cross-mesh send resolution:
`send` without --mesh and several joined meshes returned mesh_required;
a prefix under --mesh X resolved against the default mesh's roster, not
X's (only the full 64-char pubkey worked). Now a name/prefix is resolved
across all joined meshes (or scoped to --mesh): unique match auto-selects
its mesh, multi-mesh match asks for --mesh, none gives a clear error.
Kills mesh_required for peers on a non-default mesh and fixes P3.

Maps to field-report P1/P2/P3. P4 (shared member) left as-is (by design).
New: 5 persistence unit tests. Full suite 119/119. Daemon boot verified.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-02 20:38:37 +01:00

446 lines
19 KiB
TypeScript

import { existsSync, mkdirSync, readFileSync } from "node:fs";
import { DAEMON_PATHS } from "./paths.js";
import { acquireSingletonLock, releaseSingletonLock } from "./lock.js";
import { ensureLocalToken } from "./local-token.js";
import { startIpcServer } from "./ipc/server.js";
import { setRegistryHooks, startReaper, registerSession, readPersistedSessions, setRegistryPersistence, type SessionInfo } from "./session-registry.js";
import { openSqlite, type SqliteDb } from "./db/sqlite.js";
import { migrateOutbox } from "./db/outbox.js";
import { migrateInbox } from "./db/inbox.js";
import { DaemonBrokerClient } from "./broker.js";
import { SessionBrokerClient } from "./session-broker.js";
import { startDrainWorker, type DrainHandle } from "./drain.js";
import { startInboxPruner, type InboxPrunerHandle } from "./inbox-pruner.js";
import { handleBrokerPush } from "./inbound.js";
import { EventBus } from "./events.js";
import { checkFingerprint, type ClonePolicy } from "./identity.js";
import { readConfig } from "~/services/config/facade.js";
import { VERSION } from "~/constants/urls.js";
export interface RunDaemonOptions {
/** Disable TCP loopback (UDS-only). Defaults true in container envs. */
tcpEnabled?: boolean;
publicHealthCheck?: boolean;
/** Behavior on host_fingerprint mismatch. Defaults 'refuse'. */
clonePolicy?: ClonePolicy;
}
/** Detect a few common container environments to pick UDS-only by default. */
function detectContainer(): boolean {
if (process.env.KUBERNETES_SERVICE_HOST) return true;
if (process.env.CONTAINER === "1") return true;
try {
if (existsSync("/.dockerenv")) return true;
const cg = readFileSync("/proc/1/cgroup", "utf8");
if (/(docker|kubepods|containerd)/.test(cg)) return true;
} catch { /* not linux or no /proc */ }
return false;
}
export async function runDaemon(opts: RunDaemonOptions = {}): Promise<number> {
mkdirSync(DAEMON_PATHS.DAEMON_DIR, { recursive: true, mode: 0o700 });
const lock = acquireSingletonLock();
if (lock.result === "already-running") {
process.stderr.write(`daemon already running (pid ${lock.pid})\n`);
return 1;
}
if (lock.result === "stale") {
process.stderr.write(`recovered stale pid file; starting fresh\n`);
}
// Accidental-clone detection (spec §2.2). Default policy: refuse.
const fpCheck = checkFingerprint();
const policy: ClonePolicy = opts.clonePolicy ?? "refuse";
if (fpCheck.result === "mismatch") {
const msg = `host_fingerprint mismatch: this daemon dir was started on a different host.`;
if (policy === "refuse") {
process.stderr.write(`${msg}\n`);
process.stderr.write(` stored host_id: ${fpCheck.stored?.host_id}\n`);
process.stderr.write(` current host_id: ${fpCheck.current.host_id}\n`);
process.stderr.write(`Run \`claudemesh daemon accept-host\` to write a fresh fingerprint, or\n`);
process.stderr.write(`run \`claudemesh daemon remint\` to mint a new keypair (Sprint 7+).\n`);
releaseSingletonLock();
return 4;
}
if (policy === "warn") {
process.stderr.write(`WARN: ${msg} (continuing per [clone] policy=warn)\n`);
}
// 'allow' is silent.
}
if (fpCheck.result === "first_run") {
process.stdout.write(JSON.stringify({
msg: "host_fingerprint_written", fingerprint_prefix: fpCheck.current.fingerprint.slice(0, 16), ts: new Date().toISOString(),
}) + "\n");
}
const localToken = ensureLocalToken();
const tcpEnabled = opts.tcpEnabled ?? !detectContainer();
let outboxDb: SqliteDb;
let inboxDb: SqliteDb;
try {
outboxDb = await openSqlite(DAEMON_PATHS.OUTBOX_DB);
migrateOutbox(outboxDb);
inboxDb = await openSqlite(DAEMON_PATHS.INBOX_DB);
migrateInbox(inboxDb);
} catch (err) {
process.stderr.write(`db open failed: ${String(err)}\n`);
releaseSingletonLock();
return 1;
}
const bus = new EventBus();
// 1.34.10: the daemon is universal — attaches to every mesh listed
// in config.json. Single-mesh isolation is handled by simply joining
// only one mesh in that environment (containers, etc.). No --mesh
// flag, no per-mesh service unit; one daemon, every mesh.
const cfg = readConfig();
if (cfg.meshes.length === 0) {
process.stderr.write(`no mesh joined; run \`claudemesh join <invite-url>\` first\n`);
releaseSingletonLock();
try { outboxDb.close(); } catch { /* ignore */ }
return 2;
}
const meshes = cfg.meshes;
// 1.34.9 — declared upfront so the daemon-WS onPush closure can
// reach into the per-session map for the isOwnPubkey filter (drops
// peer_joined / peer_left events for our own session pubkeys before
// they surface as `[system] Peer "<self>" joined`). Populated below
// by setRegistryHooks; empty until the first session registers, but
// that's fine — the closure walks it lazily.
const sessionBrokers = new Map<string, SessionBrokerClient>();
const sessionBrokersByPubkey = new Map<string, SessionBrokerClient>();
// Spin up one broker per mesh. Connection failures are non-fatal:
// the outbox keeps queuing per-mesh and reconnect logic in
// DaemonBrokerClient handles reattach.
const brokers = new Map<string, DaemonBrokerClient>();
const meshConfigs = new Map<string, typeof cfg.meshes[number]>();
for (const mesh of meshes) {
meshConfigs.set(mesh.slug, mesh);
// 1.34.10: no global displayName override anymore. Each mesh's
// hello uses its own per-mesh display name from config.json (set
// at `claudemesh join` time). Sessions advertise their own name
// via `claudemesh launch --name`.
const broker: DaemonBrokerClient = new DaemonBrokerClient(mesh, {
onStatusChange: (s) => {
process.stdout.write(JSON.stringify({
msg: "broker_status", status: s, mesh: mesh.slug, ts: new Date().toISOString(),
}) + "\n");
bus.publish("broker_status", { mesh: mesh.slug, status: s });
},
onPush: (m) => {
// Daemon-WS is member-keyed, not session-keyed. Session-targeted
// DMs land on the per-session WS (SessionBrokerClient) since
// 1.32.1 and decrypt with the session secret there. Anything that
// arrives here can only be member-keyed (broadcasts, member DMs,
// system events) — pass member secret only.
// 1.34.9: drop self-echoes — broker fan-out paths mirror an
// outbound back to the SAME daemon's member-WS even when the
// send originated on a session-WS (because both connections
// belong to the same member from the broker's view). Filter on
// senderMemberPubkey alone: anything attributed to OUR member is
// either our own send echoing back or, theoretically, a peer
// send from a different connection that happens to share our
// pubkey — but two-different-clients-same-pubkey is impossible
// by construction (member pubkeys are stable + unique per
// identity). Sibling-session DMs don't fan to our member-WS;
// they fan session-to-session. So this is safe.
const senderMemberPk = String((m as Record<string, unknown>).senderMemberPubkey ?? "").toLowerCase();
const ownMember = mesh.pubkey.toLowerCase();
if (senderMemberPk && senderMemberPk === ownMember) {
return;
}
void handleBrokerPush(m, {
db: inboxDb,
bus,
meshSlug: mesh.slug,
recipientSecretKeyHex: mesh.secretKey,
// v2 agentic-comms (M1): client_ack closes the at-least-once
// loop. Broker holds the row claimed (not delivered) until ack.
ackClientMessage: (cmid, bmid) => broker.sendClientAck(cmid, bmid),
// 1.34.9: drop self-join system events. Member pubkey + every
// live session pubkey on this daemon all count as "us".
isOwnPubkey: (pubkey) => {
const lower = pubkey.toLowerCase();
if (lower === ownMember) return true;
return sessionBrokersByPubkey.has(lower);
},
// 1.34.10: tag the bus event with our member pubkey so the
// SSE demux only fans this row to MCPs whose subscriber
// matches (member-keyed broadcasts / DMs).
recipientPubkey: mesh.pubkey,
recipientKind: "member",
});
},
});
broker.connect().catch((err) => process.stderr.write(`broker connect failed for ${mesh.slug}: ${String(err)}\n`));
brokers.set(mesh.slug, broker);
}
// 1.30.0 — per-session broker presence. Always on. Older CLIs that
// don't include `presence` material in the register body just won't
// get a session WS; the daemon's own member-keyed broker still
// covers them.
//
// The two index maps (sessionBrokers by token, sessionBrokersByPubkey
// by session pubkey) are declared earlier in this function so the
// daemon-WS onPush closure can reference them for the isOwnPubkey
// self-join filter.
// Start the drain worker. With multi-mesh, drain dispatches each
// outbox row to its mesh's broker via the `mesh` column.
// 1.34.0: drain also accepts a session-pubkey lookup so rows
// written by authenticated sessions route via the matching session-WS
// (broker fan-out then attributes the push to the session pubkey).
let drain: DrainHandle | null = null;
drain = startDrainWorker({
db: outboxDb,
brokers,
getSessionBrokerByPubkey: (pubkey) => sessionBrokersByPubkey.get(pubkey),
});
// 1.34.8 — TTL prune for inbox.db. Runs hourly with a 30-day default
// retention. Without this the inbox grows unbounded; even on a moderate
// mesh that's tens of thousands of rows over a few weeks. Prune is a
// single DELETE; failures are non-fatal and the next interval retries.
const inboxPruner: InboxPrunerHandle = startInboxPruner({ db: inboxDb });
setRegistryHooks({
onRegister: (info) => {
if (!info.presence) return;
const meshConfig = meshConfigs.get(info.mesh);
if (!meshConfig) {
process.stderr.write(JSON.stringify({
level: "warn", msg: "session_broker_no_mesh_config", mesh: info.mesh,
ts: new Date().toISOString(),
}) + "\n");
return;
}
// Drop any pre-existing session WS under this token (re-register).
const prior = sessionBrokers.get(info.token);
if (prior) {
sessionBrokers.delete(info.token);
// 1.34.0: keep both indices in sync.
if (sessionBrokersByPubkey.get(prior.sessionPubkey) === prior) {
sessionBrokersByPubkey.delete(prior.sessionPubkey);
}
prior.close().catch(() => { /* ignore */ });
}
// Also drop any stale WS holding this session pubkey under a
// DIFFERENT token. With UUID-anchored persistent keypairs a relaunch
// reuses the pubkey, so without this the old SessionBrokerClient
// would linger connected (the broker then sees two presences for one
// pubkey — the same-name ghost that stole queued DMs). Dedup by
// pubkey closes it before the new WS opens.
const priorByPubkey = sessionBrokersByPubkey.get(info.presence.sessionPubkey);
if (priorByPubkey && priorByPubkey !== prior) {
for (const [tok, c] of sessionBrokers) {
if (c === priorByPubkey) { sessionBrokers.delete(tok); break; }
}
sessionBrokersByPubkey.delete(info.presence.sessionPubkey);
priorByPubkey.close().catch(() => { /* ignore */ });
}
// 1.32.1 — wire push delivery. Messages targeted at the launched
// session's pubkey land on THIS WS, not on the member-keyed one,
// so without this forward they'd silently disappear (the bug that
// kept inbox.db at zero rows since 1.30.0). Decrypt prefers the
// session secret key; member key remains the fallback for legacy
// member-targeted traffic that happens to fan out here.
const sessionSecretKeyHex = info.presence.sessionSecretKey;
// Capture the pubkey for the onPush closure below — TS can't
// narrow `info.presence` inside the async arrow even though we
// guard `if (!info.presence) return` earlier.
const sessionPubkeyHex = info.presence.sessionPubkey;
const client: SessionBrokerClient = new SessionBrokerClient({
mesh: meshConfig,
sessionPubkey: info.presence.sessionPubkey,
sessionSecretKey: info.presence.sessionSecretKey,
parentAttestation: info.presence.parentAttestation,
sessionId: info.sessionId,
displayName: info.displayName,
...(info.role ? { role: info.role } : {}),
...(info.cwd ? { cwd: info.cwd } : {}),
pid: info.pid,
onPush: (m) => {
void handleBrokerPush(m, {
db: inboxDb,
bus,
meshSlug: meshConfig.slug,
recipientSecretKeyHex: meshConfig.secretKey,
sessionSecretKeyHex,
// v2 agentic-comms (M1): close the at-least-once loop.
ackClientMessage: (cmid, bmid) => client.sendClientAck(cmid, bmid),
// 1.34.10: tag the bus event with this session's pubkey so
// the SSE demux only delivers to the MCP serving THIS
// session — not its siblings on the same daemon. Without
// this, A's MCP also rendered DMs intended for B because
// the bus was a single shared stream.
recipientPubkey: sessionPubkeyHex,
recipientKind: "session",
});
},
});
sessionBrokers.set(info.token, client);
sessionBrokersByPubkey.set(info.presence.sessionPubkey, client);
client.connect().catch((err) =>
process.stderr.write(JSON.stringify({
level: "warn", msg: "session_broker_connect_failed",
mesh: info.mesh, err: String(err), ts: new Date().toISOString(),
}) + "\n"),
);
},
onDeregister: (info: SessionInfo) => {
const client = sessionBrokers.get(info.token);
if (!client) return;
sessionBrokers.delete(info.token);
// 1.34.0: drop the pubkey index iff this client still owns it
// (a re-register may have already swapped the entry).
if (sessionBrokersByPubkey.get(client.sessionPubkey) === client) {
sessionBrokersByPubkey.delete(client.sessionPubkey);
}
client.close().catch(() => { /* ignore */ });
},
});
startReaper();
// Rehydrate persisted session bindings (1.36.0). A daemon restart used
// to wipe the in-memory registry, so every live session lost its mesh
// context and CLI commands fell back to an arbitrary default mesh — a
// live peer then looked "disconnected" though nothing had moved. We now
// reload each persisted binding, validate the pid is still alive (with
// a start-time PID-reuse guard), reload its keypair from the per-session
// store, re-sign a fresh parent attestation, and re-register it — which
// fires onRegister and reconnects its SessionBrokerClient on the broker.
try {
const persisted = readPersistedSessions(DAEMON_PATHS.SESSIONS_FILE);
if (persisted.length > 0) {
const { loadOrCreateSessionKeypair } = await import("~/services/session/keypair-store.js");
const { signParentAttestation } = await import("~/services/broker/session-hello-sig.js");
const { isPidAlive, getProcessStartTimes } = await import("./process-info.js");
const liveStartTimes = await getProcessStartTimes(persisted.map((p) => p.pid)).catch(() => new Map<number, string>());
let revived = 0;
for (const s of persisted) {
if (!isPidAlive(s.pid)) continue;
if (s.startTime !== undefined) {
const live = liveStartTimes.get(s.pid);
if (live !== undefined && live !== s.startTime) continue; // PID reused
}
const meshConfig = meshConfigs.get(s.mesh);
if (!meshConfig) continue; // mesh no longer joined
try {
const kp = await loadOrCreateSessionKeypair(meshConfig.slug, s.sessionId);
const att = await signParentAttestation({
parentMemberPubkey: meshConfig.pubkey,
parentSecretKey: meshConfig.secretKey,
sessionPubkey: kp.publicKey,
});
registerSession({
token: s.token,
sessionId: s.sessionId,
mesh: s.mesh,
displayName: s.displayName,
pid: s.pid,
...(s.cwd ? { cwd: s.cwd } : {}),
...(s.role ? { role: s.role } : {}),
...(s.groups ? { groups: s.groups } : {}),
...(s.startTime ? { startTime: s.startTime } : {}),
presence: {
sessionPubkey: kp.publicKey,
sessionSecretKey: kp.secretKey,
parentAttestation: {
sessionPubkey: att.sessionPubkey,
parentMemberPubkey: att.parentMemberPubkey,
expiresAt: att.expiresAt,
signature: att.signature,
},
},
});
revived++;
} catch (err) {
process.stderr.write(JSON.stringify({
level: "warn", msg: "session_rehydrate_failed",
token: s.token.slice(0, 8), mesh: s.mesh, err: String(err),
ts: new Date().toISOString(),
}) + "\n");
}
}
process.stderr.write(JSON.stringify({
level: "info", msg: "sessions_rehydrated",
revived, persisted: persisted.length, ts: new Date().toISOString(),
}) + "\n");
}
} catch (err) {
process.stderr.write(JSON.stringify({
level: "warn", msg: "session_rehydrate_scan_failed", err: String(err),
ts: new Date().toISOString(),
}) + "\n");
}
// Enable ongoing persistence now that rehydration has read the old file.
setRegistryPersistence(DAEMON_PATHS.SESSIONS_FILE);
const ipc = startIpcServer({
localToken,
tcpEnabled,
publicHealthCheck: opts.publicHealthCheck,
outboxDb,
inboxDb,
bus,
brokers,
meshConfigs,
onPendingInserted: () => drain?.wake(),
});
try {
await ipc.ready;
} catch (err) {
process.stderr.write(`ipc listen failed: ${String(err)}\n`);
releaseSingletonLock();
return 1;
}
process.stdout.write(JSON.stringify({
msg: "daemon_started",
// 1.34.10: stamp the version so users can tell whether the
// running daemon picked up a recent CLI ship. Read off the same
// VERSION constant the IPC `/v1/version` endpoint serves.
version: VERSION,
pid: process.pid,
sock: DAEMON_PATHS.SOCK_FILE,
tcp: tcpEnabled ? `127.0.0.1:47823` : null,
meshes: meshes.map((m) => m.slug),
ts: new Date().toISOString(),
}) + "\n");
let shuttingDown = false;
const shutdown = async (sig: string) => {
if (shuttingDown) return;
shuttingDown = true;
process.stdout.write(JSON.stringify({ msg: "daemon_shutdown", signal: sig, ts: new Date().toISOString() }) + "\n");
inboxPruner.stop();
if (drain) await drain.close();
for (const b of brokers.values()) {
try { await b.close(); } catch { /* ignore */ }
}
for (const b of sessionBrokers.values()) {
try { await b.close(); } catch { /* ignore */ }
}
sessionBrokers.clear();
await ipc.close();
try { outboxDb.close(); } catch { /* ignore */ }
try { inboxDb.close(); } catch { /* ignore */ }
releaseSingletonLock();
process.exit(0);
};
process.on("SIGINT", () => shutdown("SIGINT"));
process.on("SIGTERM", () => shutdown("SIGTERM"));
// Hold the event loop open until a signal arrives.
return new Promise<number>(() => { /* never resolves; signals call process.exit */ });
}