feat(cli): 1.31.0 — session autoclean + broker verification + service path
Three operability fixes for users running the daemon under launchd or systemd. PID-watcher autoclean ===================== The session reaper already dropped registry entries with dead pids on a 30s loop, but had two real-world gaps: - 30s sweep let stale presence linger on the broker for half a minute - bare process.kill(pid, 0) trusts a recycled pid; a registry entry could survive its real owner's death whenever the OS rolled the pid number forward to a new program Process-exit IPC from claude-code is best-effort and skipped on SIGKILL / OOM / segfault / panic, so it cannot replace the sweep. Fix: - New process-info.ts captures opaque per-process start-times via ps -o lstart= (works on macOS and Linux, ~1 ms per call) - registerSession stores the start-time alongside the pid - reapDead drops entries when pid is dead OR start-time changed since register - Sweep cadence 30s -> 5s - Best-effort fallback to bare liveness when start-time capture fails at register time Registry hooks already close the per-session broker WS on deregister, so peer list rebuilds within one sweep of any session exit. Service-managed daemon: no more "spawn failed" false alarms =========================================================== After claudemesh install (which writes a launchd plist or systemd unit with KeepAlive=true), users routinely saw [claudemesh] warn daemon spawn failed: socket did not appear within 3000ms even when the daemon was running fine. Two contributing causes: 1. Probe timeout was 800ms — the first IPC after a launchd-driven restart can take longer (SQLite migration + broker WS opens) and tripped it. Bumped to 2500ms. 2. On a failed probe the CLI tried its own detached spawn, which collided with launchd's KeepAlive restart cycle (singleton lock fails, child exits) and we'd then time out polling for a socket that was actually about to come up. Now: when the launchd plist or systemd unit exists, the CLI does not attempt a spawn. 
It waits up to 8s for the OS-managed unit to bring the socket up. New service-not-ready state distinguishes "OS hasn't restarted it yet" from "we tried to spawn and it failed". Install verifies broker connectivity, not just process start ============================================================ Previously install ended once launchctl reported the unit loaded — a daemon that boots but cannot reach the broker (blocked :443, expired TLS, DNS, broker outage) only surfaced on the user's first peer list or send. /v1/health now includes per-mesh broker WS state. install polls it for up to 15s after service boot and prints either "broker connected (mesh=...)" or a warning naming the meshes still in connecting state, with a hint at common causes. The verification is best-effort and does not fail the install — it just surfaces the issue early. Tests ===== 4 new vitest cases cover the reaper paths: dead pid, live pid plus matching start-time, live pid plus mismatched start-time (PID reuse), and the no-start-time fallback. 83 of 83 pass. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -204,7 +204,17 @@ function makeHandler(opts: {
|
||||
}
|
||||
|
||||
if (req.method === "GET" && url.pathname === "/v1/health") {
|
||||
respond(res, 200, { ok: true, pid: process.pid });
|
||||
// 1.31.0: include per-mesh broker WS state so callers can verify
|
||||
// functional connectivity, not just that the daemon process is
|
||||
// running. Used by `claudemesh install` post-flight to wait for
|
||||
// at least one broker to be `open` before declaring success —
|
||||
// catches dead WS / DNS / TLS / outbound-blocked-port issues at
|
||||
// install time instead of when the user's first message fails.
|
||||
const brokers: Record<string, string> = {};
|
||||
if (opts.brokers) {
|
||||
for (const [slug, client] of opts.brokers) brokers[slug] = client.status;
|
||||
}
|
||||
respond(res, 200, { ok: true, pid: process.pid, brokers });
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
46
apps/cli/src/daemon/process-info.ts
Normal file
46
apps/cli/src/daemon/process-info.ts
Normal file
@@ -0,0 +1,46 @@
|
||||
/**
|
||||
* Process-info helpers used by the session reaper to detect dead-pid AND
|
||||
* pid-reuse safely.
|
||||
*
|
||||
* `process.kill(pid, 0)` alone is insufficient: a recently-recycled pid
|
||||
* passes the liveness check even though the process registered under it
|
||||
* is long gone. To avoid mistakenly trusting a recycled pid, we capture
|
||||
* a stable per-process start-time at register, and compare it on each
|
||||
* sweep — if it changed, treat the original process as dead.
|
||||
*
|
||||
* macOS + Linux both expose `ps -o lstart=` returning a fixed-format
|
||||
* timestamp ("Mon May  4 09:14:00 2026"). Equality is the only operation
|
||||
* the reaper needs, so we keep the value as an opaque string.
|
||||
*/
|
||||
|
||||
import { execFileSync } from "node:child_process";
|
||||
|
||||
/**
 * Returns an opaque process-start identifier for `pid`, or null when the
 * pid is not a positive integer, the process cannot be found, or `ps`
 * could not be run (missing binary, non-zero exit, timeout).
 *
 * The value is the trimmed output of `ps -o lstart= -p <pid>`. It is only
 * meant to be compared for equality against an earlier capture for the
 * same pid; callers should not parse it.
 *
 * `ps` renders the start time as local wall-clock text, so the child is
 * run with `TZ=UTC` and `LC_ALL=C`. Without that, a system timezone or
 * locale change between two captures would alter the string for an
 * unchanged process and be misread as pid reuse.
 *
 * Runs `ps` synchronously, bounded by a 1 s timeout.
 *
 * @param pid - Process id to inspect.
 * @returns The start-time string, or null when it cannot be determined.
 */
export function getProcessStartTime(pid: number): string | null {
  // Reject NaN, Infinity, fractions and non-positive values before forking.
  if (!Number.isInteger(pid) || pid <= 0) return null;
  try {
    const out = execFileSync("ps", ["-o", "lstart=", "-p", String(pid)], {
      encoding: "utf8",
      timeout: 1_000,
      // stderr is discarded: "no such process" is an expected outcome.
      stdio: ["ignore", "pipe", "ignore"],
      // Pin timezone and locale so the text depends only on the process.
      env: { ...process.env, LC_ALL: "C", TZ: "UTC" },
    }).trim();
    return out.length > 0 ? out : null;
  } catch {
    // Best-effort probe: any failure means "unknown", never an exception.
    return null;
  }
}
|
||||
|
||||
/**
 * Liveness-only probe: sends signal 0 to `pid`, which performs the
 * existence and permission checks without delivering a signal.
 *
 * An `EPERM` failure counts as alive: the process exists, we are just
 * not permitted to signal it. Any other failure (for example `ESRCH`,
 * or an argument error from `process.kill`) counts as not alive.
 *
 * A `true` result does not prove the pid still belongs to the process it
 * was first seen with; pair this with a start-time comparison to detect
 * pid reuse.
 *
 * @param pid - Process id to probe.
 * @returns Whether a process with this pid currently exists.
 */
export function isPidAlive(pid: number): boolean {
  if (!Number.isInteger(pid) || pid <= 0) return false;
  try {
    process.kill(pid, 0);
    return true;
  } catch (err: unknown) {
    // EPERM means "exists but not ours to signal" — still alive.
    return (
      typeof err === "object" &&
      err !== null &&
      (err as { code?: unknown }).code === "EPERM"
    );
  }
}
|
||||
@@ -10,7 +10,9 @@
|
||||
* Lifecycle:
|
||||
* - register replaces any prior entry under the same `sessionId`
|
||||
* (handles re-launch and `--resume` flows cleanly).
|
||||
* - reaper polls every 30 s and drops entries whose pid is dead.
|
||||
* - reaper polls every 5 s. An entry is dropped when its pid is dead
|
||||
* OR when its captured start-time no longer matches the running
|
||||
* process (PID reuse — original is gone, OS recycled the number).
|
||||
* - hard ttl ceiling of 24 h is a leak guard for forgotten sessions.
|
||||
*
|
||||
* Persistence: in-memory only for v1. A daemon restart clears the
|
||||
@@ -20,6 +22,8 @@
|
||||
* session have no token to begin with.
|
||||
*/
|
||||
|
||||
import { getProcessStartTime, isPidAlive } from "./process-info.js";
|
||||
|
||||
/**
|
||||
* Optional per-launch presence material. Carried opaquely through the
|
||||
* registry; the daemon's session-broker subsystem (1.30.0+) reads it to
|
||||
@@ -51,6 +55,16 @@ export interface SessionInfo {
|
||||
groups?: string[];
|
||||
/** 1.30.0+: per-launch presence material. */
|
||||
presence?: SessionPresence;
|
||||
/**
|
||||
* 1.31.0+: opaque per-process start-time captured at register. The
|
||||
* reaper compares the live value against this on every sweep — a
|
||||
* mismatch means the original process exited and the pid was reused
|
||||
* by an unrelated program, so the registry entry must be dropped.
|
||||
* `undefined` when capture failed (process already dead at register
|
||||
* time, ps unavailable, etc.) — the reaper falls back to bare
|
||||
* liveness in that case.
|
||||
*/
|
||||
startTime?: string;
|
||||
registeredAt: number;
|
||||
}
|
||||
|
||||
@@ -61,7 +75,7 @@ export interface RegistryHooks {
|
||||
}
|
||||
|
||||
const TTL_MS = 24 * 60 * 60 * 1000;
|
||||
const REAPER_INTERVAL_MS = 30 * 1000;
|
||||
const REAPER_INTERVAL_MS = 5 * 1000;
|
||||
|
||||
const byToken = new Map<string, SessionInfo>();
|
||||
const bySessionId = new Map<string, string>();
|
||||
@@ -98,7 +112,12 @@ export function registerSession(info: Omit<SessionInfo, "registeredAt">): Sessio
|
||||
}
|
||||
}
|
||||
|
||||
const stored: SessionInfo = { ...info, registeredAt: Date.now() };
|
||||
// Capture start-time at register so the reaper can detect PID reuse.
|
||||
// Caller may pre-fill info.startTime (tests do this); only probe ps
|
||||
// when the field is absent so we don't fork shell subprocesses in
|
||||
// unit tests for fake pids.
|
||||
const startTime = info.startTime ?? getProcessStartTime(info.pid) ?? undefined;
|
||||
const stored: SessionInfo = { ...info, startTime, registeredAt: Date.now() };
|
||||
byToken.set(info.token, stored);
|
||||
bySessionId.set(info.sessionId, info.token);
|
||||
try { hooks.onRegister?.(stored); } catch { /* see above */ }
|
||||
@@ -132,11 +151,24 @@ function reapDead(): void {
|
||||
const dead: string[] = [];
|
||||
for (const [token, info] of byToken.entries()) {
|
||||
if (Date.now() - info.registeredAt > TTL_MS) { dead.push(token); continue; }
|
||||
try { process.kill(info.pid, 0); } catch { dead.push(token); }
|
||||
if (!isPidAlive(info.pid)) { dead.push(token); continue; }
|
||||
// PID reuse guard: process is alive, but if its start-time changed
|
||||
// since register the original is gone and the OS recycled the pid
|
||||
// for an unrelated program. Skip when we never captured a start-
|
||||
// time (best-effort fallback to bare liveness above).
|
||||
if (info.startTime !== undefined) {
|
||||
const live = getProcessStartTime(info.pid);
|
||||
if (live !== null && live !== info.startTime) { dead.push(token); continue; }
|
||||
}
|
||||
}
|
||||
for (const t of dead) deregisterByToken(t);
|
||||
}
|
||||
|
||||
/**
 * Test helper: performs exactly one reaper sweep, synchronously, by
 * calling `reapDead()` directly instead of waiting for the interval.
 */
export function _runReaperOnce(): void {
  reapDead();
}
|
||||
|
||||
/** Test helper. */
|
||||
export function _resetRegistry(): void {
|
||||
byToken.clear();
|
||||
|
||||
Reference in New Issue
Block a user