feat(cli): 1.31.0 — session autoclean + broker verification + service path

Three operability fixes for users running the daemon under launchd or systemd.

PID-watcher autoclean
=====================

The session reaper already dropped registry entries with dead pids on a 30s loop, but had two real-world gaps:

- 30s sweep let stale presence linger on the broker for half a minute
- bare process.kill(pid, 0) trusts a recycled pid; a registry entry could survive its real owner's death whenever the OS rolled the pid number forward to a new program

Process-exit IPC from claude-code is best-effort and skipped on SIGKILL / OOM / segfault / panic, so it cannot replace the sweep.

Fix:

- New process-info.ts captures opaque per-process start-times via ps -o lstart= (works on macOS and Linux, ~1 ms per call)
- registerSession stores the start-time alongside the pid
- reapDead drops entries when pid is dead OR start-time changed since register
- Sweep cadence 30s -> 5s
- Best-effort fallback to bare liveness when start-time capture fails at register time

Registry hooks already close the per-session broker WS on deregister, so peer list rebuilds within one sweep of any session exit.

Service-managed daemon: no more "spawn failed" false alarms
===========================================================

After claudemesh install (which writes a launchd plist or systemd unit with KeepAlive=true), users routinely saw

    [claudemesh] warn daemon spawn failed: socket did not appear within 3000ms

even when the daemon was running fine. Two contributing causes:

1. Probe timeout was 800ms — the first IPC after a launchd-driven restart can take longer (SQLite migration + broker WS opens) and tripped it. Bumped to 2500ms.
2. On a failed probe the CLI tried its own detached spawn, which collided with launchd's KeepAlive restart cycle (singleton lock fails, child exits) and we'd then time out polling for a socket that was actually about to come up.

Now: when the launchd plist or systemd unit exists, the CLI does not attempt a spawn. It waits up to 8s for the OS-managed unit to bring the socket up. New service-not-ready state distinguishes "OS hasn't restarted it yet" from "we tried to spawn and it failed".

Install verifies broker connectivity, not just process start
============================================================

Previously install ended once launchctl reported the unit loaded — a daemon that boots but cannot reach the broker (blocked :443, expired TLS, DNS, broker outage) only surfaced on the user's first peer list or send.

/v1/health now includes per-mesh broker WS state. install polls it for up to 15s after service boot and prints either "broker connected (mesh=...)" or a warning naming the meshes still in connecting state, with a hint at common causes. The verification is best-effort and does not fail the install — it just surfaces the issue early.

Tests
=====

4 new vitest cases cover the reaper paths: dead pid, live pid plus matching start-time, live pid plus mismatched start-time (PID reuse), and the no-start-time fallback. 83 of 83 pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-04 14:05:44 +01:00
parent 71f7f81880
commit 1a14cef1e0
10 changed files with 436 additions and 15 deletions
--- a/apps/cli/src/commands/install.ts
+++ b/apps/cli/src/commands/install.ts
@@ -434,7 +434,7 @@ function installStatusLine(): { installed: boolean } {
  return { installed: true };
 }

-export function runInstall(args: string[] = []): void {
+export async function runInstall(args: string[] = []): Promise<void> {
  const skipHooks = args.includes("--no-hooks");
  const skipSkill = args.includes("--no-skill");
  const skipService = args.includes("--no-service");
@@ -559,7 +559,7 @@ export function runInstall(args: string[] = []): void {
  // install-service --mesh <slug>` explicitly.
  if (!skipService && hasMeshes) {
    try {
-      installDaemonService(entry);
+      await installDaemonService(entry);
    } catch (e) {
      render.warn(
        `daemon service install failed: ${e instanceof Error ? e.message : String(e)}`,
@@ -603,7 +603,7 @@ export function runInstall(args: string[] = []): void {
 * the user knows there's a problem before it shows up as "no messages
 * arriving."
 */
-function installDaemonService(binaryEntry: string): void {
+async function installDaemonService(binaryEntry: string): Promise<void> {
  const {
    installService,
    detectPlatform,
@@ -652,7 +652,52 @@ function installDaemonService(binaryEntry: string): void {
      `daemon service installed but failed to start: ${e instanceof Error ? e.message : String(e)}`,
      `Run manually: ${r.bootCommand}`,
    );
+    return;
  }
+
+  // 1.31.0 — post-flight: verify the daemon actually establishes a
+  // broker WebSocket. Boots that fail silently here (DNS, expired TLS,
+  // outbound :443 blocked, broker outage) used to surface only when
+  // the user's first `peer list` or `send` failed half an hour later.
+  // Polling /v1/health gives a clear, install-time signal.
+  await verifyBrokerConnectivity();
+}
+
+/**
+ * Post-install check: polls the daemon's `/v1/health` IPC endpoint until
+ * at least one mesh reports broker state `open`, or the budget runs out.
+ *
+ * Prints one `render.ok` line on success, or one `render.warn` line on
+ * timeout describing the last state seen. Errors thrown by individual
+ * polls are swallowed (the daemon may still be starting), so a failing
+ * poll never aborts the wait.
+ *
+ * The timeout message distinguishes three situations:
+ *   - no poll was ever answered,
+ *   - the daemon answered but listed no brokers,
+ *   - the daemon listed brokers but none reached `open`.
+ */
+async function verifyBrokerConnectivity(): Promise<void> {
+  const VERIFY_BUDGET_MS = 15_000;
+  const POLL_INTERVAL_MS = 500;
+  const { ipc } = await import("~/daemon/ipc/client.js");
+  const start = Date.now();
+  let lastBrokers: Record<string, string> = {};
+  // An empty `lastBrokers` is ambiguous on its own: it is both the
+  // initial value and what a response without broker entries produces.
+  // Track answered polls separately so the warning does not claim the
+  // daemon was silent when it actually responded.
+  let sawResponse = false;
+
+  while (Date.now() - start < VERIFY_BUDGET_MS) {
+    try {
+      const res = await ipc<{ ok: boolean; brokers?: Record<string, string> }>({
+        path: "/v1/health",
+        timeoutMs: 2_000,
+      });
+      sawResponse = true;
+      lastBrokers = res.body?.brokers ?? {};
+      const openMesh = Object.entries(lastBrokers).find(([, s]) => s === "open");
+      if (openMesh) {
+        const others = Object.entries(lastBrokers).filter(([slug]) => slug !== openMesh[0]);
+        const tail = others.length > 0 ? `, ${others.length} other mesh${others.length === 1 ? "" : "es"} attaching` : "";
+        render.ok(`broker connected (mesh=${openMesh[0]}${tail})`);
+        return;
+      }
+    } catch { /* daemon may still be starting up; keep polling */ }
+    await new Promise((r) => setTimeout(r, POLL_INTERVAL_MS));
+  }
+
+  // Timed out without a single broker reaching `open`. Surface what we
+  // saw last so the user can act — this is exactly the bug class we
+  // want to catch at install time, not at first send.
+  let states: string;
+  if (!sawResponse) {
+    states = "no health response from daemon";
+  } else if (Object.keys(lastBrokers).length === 0) {
+    states = "daemon responded but reported no brokers";
+  } else {
+    states = Object.entries(lastBrokers).map(([m, s]) => `${m}=${s}`).join(", ");
+  }
+  render.warn(
+    `broker did not reach open within ${Math.round(VERIFY_BUDGET_MS / 1000)}s (${states})`,
+    "Check ~/.claudemesh/daemon/daemon.log for connect errors. Common causes: outbound :443 blocked, expired TLS, DNS resolution.",
+  );
 }

 export function runUninstall(): void {
--- a/apps/cli/src/daemon/ipc/server.ts
+++ b/apps/cli/src/daemon/ipc/server.ts
@@ -204,7 +204,17 @@ function makeHandler(opts: {
    }

    if (req.method === "GET" && url.pathname === "/v1/health") {
-      respond(res, 200, { ok: true, pid: process.pid });
+      // 1.31.0: include per-mesh broker WS state so callers can verify
+      // functional connectivity, not just that the daemon process is
+      // running. Used by `claudemesh install` post-flight to wait for
+      // at least one broker to be `open` before declaring success —
+      // catches dead WS / DNS / TLS / outbound-blocked-port issues at
+      // install time instead of when the user's first message fails.
+      const brokers: Record<string, string> = {};
+      if (opts.brokers) {
+        for (const [slug, client] of opts.brokers) brokers[slug] = client.status;
+      }
+      respond(res, 200, { ok: true, pid: process.pid, brokers });
      return;
    }

--- a/apps/cli/src/daemon/process-info.ts
+++ b/apps/cli/src/daemon/process-info.ts
@@ -0,0 +1,46 @@
+/**
+ * Process-info helpers used by the session reaper to detect dead-pid AND
+ * pid-reuse safely.
+ *
+ * `process.kill(pid, 0)` alone is insufficient: a recently-recycled pid
+ * passes the liveness check even though the process registered under it
+ * is long gone. To avoid mistakenly trusting a recycled pid, we capture
+ * a stable per-process start-time at register, and compare it on each
+ * sweep — if it changed, treat the original process as dead.
+ *
+ * macOS + Linux both expose `ps -o lstart=` returning a fixed-format
+ * timestamp ("Sun May  4 09:14:00 2026"). Equality is the only operation
+ * the reaper needs, so we keep the value as an opaque string.
+ */
+
+import { execFileSync } from "node:child_process";
+
+/**
+ * Returns a process-start identifier for `pid`, or null when none could
+ * be obtained.
+ *
+ * Runs `ps -o lstart= -p <pid>` synchronously and returns its trimmed
+ * stdout as an opaque string. Callers are expected to compare the value
+ * only for equality against an earlier capture for the same pid.
+ *
+ * The call blocks until `ps` exits or the 1 s timeout fires. The
+ * original note describes it as cheap (~1 ms per call) and safe to use
+ * inside the 5-second reaper sweep; that figure is not verified here.
+ *
+ * @param pid - Process id to look up. Non-finite or non-positive values
+ *   return null without spawning `ps`.
+ * @returns The trimmed `ps` output, or null when the output is empty or
+ *   `execFileSync` throws (presumably: `ps` missing, non-zero exit for
+ *   an unknown pid, or timeout — confirm against the Node docs).
+ */
+export function getProcessStartTime(pid: number): string | null {
+  if (!Number.isFinite(pid) || pid <= 0) return null; // reject NaN / Infinity / non-positive pids up front
+  try {
+    const out = execFileSync("ps", ["-o", "lstart=", "-p", String(pid)], {
+      encoding: "utf8", // return stdout as a string rather than a Buffer
+      timeout: 1_000, // bound the synchronous wait to 1 s
+      stdio: ["ignore", "pipe", "ignore"], // capture stdout only; stdin and stderr are ignored
+    }).trim();
+    return out.length > 0 ? out : null; // empty output is reported as "no start-time"
+  } catch {
+    return null; // any failure is collapsed into null; callers cannot tell the causes apart
+  }
+}
+
+/**
+ * Liveness-only probe (signal 0). Use together with start-time guard.
+ *
+ * Sends signal 0 via `process.kill`, which performs the existence check
+ * without delivering a signal. Returns true when the call does not
+ * throw. Any thrown error is reported as "not alive" — the catch does
+ * not inspect the error, so a permission failure is treated the same as
+ * a missing process.
+ *
+ * @param pid - Process id to probe. Non-finite or non-positive values
+ *   return false without calling `process.kill`.
+ * @returns true when the probe succeeded, false otherwise.
+ */
+export function isPidAlive(pid: number): boolean {
+  if (!Number.isFinite(pid) || pid <= 0) return false; // reject NaN / Infinity / non-positive pids up front
+  try {
+    process.kill(pid, 0);
+    return true;
+  } catch {
+    return false; // error code is deliberately not examined
+  }
+}
--- a/apps/cli/src/daemon/session-registry.ts
+++ b/apps/cli/src/daemon/session-registry.ts
@@ -10,7 +10,9 @@
 * Lifecycle:
 *   - register replaces any prior entry under the same `sessionId`
 *     (handles re-launch and `--resume` flows cleanly).
- *   - reaper polls every 30 s and drops entries whose pid is dead.
+ *   - reaper polls every 5 s. An entry is dropped when its pid is dead
+ *     OR when its captured start-time no longer matches the running
+ *     process (PID reuse — original is gone, OS recycled the number).
 *   - hard ttl ceiling of 24 h is a leak guard for forgotten sessions.
 *
 * Persistence: in-memory only for v1. A daemon restart clears the
@@ -20,6 +22,8 @@
 * session have no token to begin with.
 */

+import { getProcessStartTime, isPidAlive } from "./process-info.js";
+
 /**
 * Optional per-launch presence material. Carried opaquely through the
 * registry; the daemon's session-broker subsystem (1.30.0+) reads it to
@@ -51,6 +55,16 @@ export interface SessionInfo {
  groups?: string[];
  /** 1.30.0+: per-launch presence material. */
  presence?: SessionPresence;
+  /**
+   * 1.31.0+: opaque per-process start-time captured at register. The
+   * reaper compares the live value against this on every sweep — a
+   * mismatch means the original process exited and the pid was reused
+   * by an unrelated program, so the registry entry must be dropped.
+   * `undefined` when capture failed (process already dead at register
+   * time, ps unavailable, etc.) — the reaper falls back to bare
+   * liveness in that case.
+   */
+  startTime?: string;
  registeredAt: number;
 }

@@ -61,7 +75,7 @@ export interface RegistryHooks {
 }

 const TTL_MS = 24 * 60 * 60 * 1000;
-const REAPER_INTERVAL_MS = 30 * 1000;
+const REAPER_INTERVAL_MS = 5 * 1000;

 const byToken = new Map<string, SessionInfo>();
 const bySessionId = new Map<string, string>();
@@ -98,7 +112,12 @@ export function registerSession(info: Omit<SessionInfo, "registeredAt">): Sessio
    }
  }

-  const stored: SessionInfo = { ...info, registeredAt: Date.now() };
+  // Capture start-time at register so the reaper can detect PID reuse.
+  // Caller may pre-fill info.startTime (tests do this); only probe ps
+  // when the field is absent so we don't fork shell subprocesses in
+  // unit tests for fake pids.
+  const startTime = info.startTime ?? getProcessStartTime(info.pid) ?? undefined;
+  const stored: SessionInfo = { ...info, startTime, registeredAt: Date.now() };
  byToken.set(info.token, stored);
  bySessionId.set(info.sessionId, info.token);
  try { hooks.onRegister?.(stored); } catch { /* see above */ }
@@ -132,11 +151,24 @@ function reapDead(): void {
  const dead: string[] = [];
  for (const [token, info] of byToken.entries()) {
    if (Date.now() - info.registeredAt > TTL_MS) { dead.push(token); continue; }
-    try { process.kill(info.pid, 0); } catch { dead.push(token); }
+    if (!isPidAlive(info.pid)) { dead.push(token); continue; }
+    // PID reuse guard: process is alive, but if its start-time changed
+    // since register the original is gone and the OS recycled the pid
+    // for an unrelated program. Skip when we never captured a start-
+    // time (best-effort fallback to bare liveness above).
+    if (info.startTime !== undefined) {
+      const live = getProcessStartTime(info.pid);
+      if (live !== null && live !== info.startTime) { dead.push(token); continue; }
+    }
  }
  for (const t of dead) deregisterByToken(t);
 }

+/**
+ * Test helper: run a single reaper pass synchronously.
+ *
+ * Calls `reapDead()` directly so tests can trigger a sweep on demand
+ * rather than waiting for the next scheduled one.
+ */
+export function _runReaperOnce(): void {
+  reapDead();
+}
+
 /** Test helper. */
 export function _resetRegistry(): void {
  byToken.clear();
--- a/apps/cli/src/entrypoints/cli.ts
+++ b/apps/cli/src/entrypoints/cli.ts
@@ -470,7 +470,7 @@ async function main(): Promise<void> {
    }

    // Setup
-    case "install": { const { runInstall } = await import("~/commands/install.js"); runInstall(positionals); break; }
+    case "install": { const { runInstall } = await import("~/commands/install.js"); await runInstall(positionals); break; }
    case "uninstall": { const { uninstall } = await import("~/commands/uninstall.js"); process.exit(await uninstall()); break; }
    case "doctor": { const { runDoctor } = await import("~/commands/doctor.js"); await runDoctor(); break; }
    case "status": {
--- a/apps/cli/src/services/daemon/lifecycle.ts
+++ b/apps/cli/src/services/daemon/lifecycle.ts
@@ -30,6 +30,7 @@
 */

 import { existsSync, readFileSync, statSync, unlinkSync, writeFileSync } from "node:fs";
+import { homedir } from "node:os";
 import { join } from "node:path";

 import { ipc, IpcError } from "~/daemon/ipc/client.js";
@@ -40,7 +41,11 @@ export type DaemonReadyState =
  | "started"
  | "down"
  | "spawn-failed"
-  | "spawn-suppressed";
+  | "spawn-suppressed"
+  /** 1.31.0+: launchd / systemd manages the daemon and it didn't respond
+   *  within the service budget. Distinct from spawn-failed: the CLI did
+   *  not attempt to spawn (the OS owns the lifecycle). */
+  | "service-not-ready";

 export interface EnsureDaemonResult {
  state: DaemonReadyState;
@@ -62,7 +67,16 @@ export interface EnsureDaemonOpts {
 const SPAWN_LOCK_FILE  = () => join(DAEMON_PATHS.DAEMON_DIR, ".spawn.lock");
 const SPAWN_FAIL_FILE  = () => join(DAEMON_PATHS.DAEMON_DIR, ".spawn-failure");
 const SPAWN_FAIL_TTL_MS = 30_000;
-const PROBE_TIMEOUT_MS  = 800;
+// 1.31.0: 800 ms was too tight — the daemon's first IPC after a launchd
+// (re)start can take a beat while it migrates SQLite, opens broker WSes,
+// and warms up the event loop. False "stale" probes triggered the
+// pointless spawn → "socket did not appear" warning even on a perfectly
+// healthy service-managed daemon. 2500 ms still bounds the worst case.
+const PROBE_TIMEOUT_MS  = 2_500;
+// When the daemon is service-managed (launchd/systemd) and KeepAlive=true,
+// the OS guarantees a restart on death — the CLI must NOT race that with
+// its own spawn. Just wait longer for the service unit to come up.
+const SERVICE_BUDGET_MS = 8_000;

 let lastResultThisProcess: EnsureDaemonResult | null = null;

@@ -91,9 +105,30 @@ async function runEnsureDaemon(opts: EnsureDaemonOpts): Promise<EnsureDaemonResu
  // Step 1 — probe.
  const probe = await probeDaemon();
  if (probe === "up") return { state: "up", durationMs: Date.now() - t0 };
+
+  // Step 2 — service-managed shortcut. When launchd / systemd manages
+  // the daemon and KeepAlive is set, the OS will restart a crashed
+  // daemon on its own; the CLI must NOT race that with its own spawn
+  // (would double-bind the singleton lock and trigger "daemon already
+  // running" errors). Just wait quietly for the service to bring the
+  // socket up.
+  if (isServiceManaged()) {
+    if (probe === "stale") cleanupStaleFiles();
+    const polled = await pollForSocket(SERVICE_BUDGET_MS);
+    if (polled.ok) return { state: "up", durationMs: Date.now() - t0 };
+    const tool = process.platform === "darwin"
+      ? `launchctl print gui/$(id -u)/${SERVICE_LABEL}`
+      : `systemctl --user status ${SYSTEMD_UNIT}`;
+    return {
+      state: "service-not-ready",
+      durationMs: Date.now() - t0,
+      reason: `service-managed daemon not responding within ${SERVICE_BUDGET_MS}ms (run \`${tool}\`)`,
+    };
+  }
+
  if (probe === "stale") cleanupStaleFiles();

-  // Step 2 — auto-spawn unless forbidden.
+  // Step 3 — auto-spawn unless forbidden.
  if (opts.noAutoSpawn) {
    return { state: "down", durationMs: Date.now() - t0, reason: "auto-spawn disabled" };
  }
@@ -105,17 +140,37 @@ async function runEnsureDaemon(opts: EnsureDaemonOpts): Promise<EnsureDaemonResu
    };
  }

-  // Step 3 — spawn detached.
+  // Step 4 — spawn detached.
  const spawnRes = await spawnDaemon(opts);
  if (spawnRes.ok) {
    return { state: "started", durationMs: Date.now() - t0 };
  }

-  // Step 4 — record failure for backoff and report.
+  // Step 5 — record failure for backoff and report.
  markSpawnFailure();
  return { state: "spawn-failed", durationMs: Date.now() - t0, reason: spawnRes.reason };
 }

+const SERVICE_LABEL = "com.claudemesh.daemon";
+const SYSTEMD_UNIT = "claudemesh-daemon.service";
+
+/**
+ * Returns true when the user has installed the daemon as a launchd
+ * agent (macOS) or systemd --user unit (Linux). We detect by file
+ * presence rather than shelling out to launchctl/systemctl on every
+ * CLI invocation — this stays cheap and avoids spurious permission
+ * prompts on locked-down hosts.
+ *
+ * Paths checked, relative to the current user's home directory:
+ *   - darwin: `Library/LaunchAgents/<SERVICE_LABEL>.plist`
+ *   - linux:  `.config/systemd/user/<SYSTEMD_UNIT>`
+ *
+ * This is a file-existence check only: it does not verify that the
+ * unit is loaded, enabled, or running. On any other platform the
+ * function returns false.
+ */
+function isServiceManaged(): boolean {
+  if (process.platform === "darwin") {
+    return existsSync(join(homedir(), "Library", "LaunchAgents", `${SERVICE_LABEL}.plist`));
+  }
+  if (process.platform === "linux") {
+    return existsSync(join(homedir(), ".config", "systemd", "user", SYSTEMD_UNIT));
+  }
+  return false; // neither darwin nor linux: no service-manager detection
+}
+
 async function probeDaemon(): Promise<"up" | "absent" | "stale"> {
  if (!existsSync(DAEMON_PATHS.SOCK_FILE)) return "absent";
  try {
--- a/apps/cli/src/ui/warnings.ts
+++ b/apps/cli/src/ui/warnings.ts
@@ -50,6 +50,9 @@ export function warnDaemonState(
    case "spawn-failed":
      process.stderr.write(`${tag("warn")} daemon spawn failed${res.reason ? `: ${res.reason}` : ""} — using cold path ${hint("(check ~/.claudemesh/daemon/daemon.log)")}\n`);
      return true;
+    case "service-not-ready":
+      process.stderr.write(`${tag("warn")} ${res.reason ?? "service-managed daemon not responding"} — using cold path ${hint("(check ~/.claudemesh/daemon/daemon.log)")}\n`);
+      return true;
  }
  return false;
 }