feat(cli): 1.31.0 — session autoclean + broker verification + service path
Three operability fixes for users running the daemon under launchd or systemd.

PID-watcher autoclean
=====================

The session reaper already dropped registry entries with dead pids on a 30s loop, but had two real-world gaps:

- 30s sweep let stale presence linger on the broker for half a minute
- bare process.kill(pid, 0) trusts a recycled pid; a registry entry could survive its real owner's death whenever the OS rolled the pid number forward to a new program

Process-exit IPC from claude-code is best-effort and skipped on SIGKILL / OOM / segfault / panic, so it cannot replace the sweep.

Fix:

- New process-info.ts captures opaque per-process start-times via ps -o lstart= (works on macOS and Linux, ~1 ms per call)
- registerSession stores the start-time alongside the pid
- reapDead drops entries when pid is dead OR start-time changed since register
- Sweep cadence 30s -> 5s
- Best-effort fallback to bare liveness when start-time capture fails at register time

Registry hooks already close the per-session broker WS on deregister, so peer list rebuilds within one sweep of any session exit.

Service-managed daemon: no more "spawn failed" false alarms
===========================================================

After claudemesh install (which writes a launchd plist or systemd unit with KeepAlive=true), users routinely saw

    [claudemesh] warn daemon spawn failed: socket did not appear within 3000ms

even when the daemon was running fine. Two contributing causes:

1. Probe timeout was 800ms — the first IPC after a launchd-driven restart can take longer (SQLite migration + broker WS opens) and tripped it. Bumped to 2500ms.
2. On a failed probe the CLI tried its own detached spawn, which collided with launchd's KeepAlive restart cycle (singleton lock fails, child exits) and we'd then time out polling for a socket that was actually about to come up.

Now: when the launchd plist or systemd unit exists, the CLI does not attempt a spawn. It waits up to 8s for the OS-managed unit to bring the socket up. New service-not-ready state distinguishes "OS hasn't restarted it yet" from "we tried to spawn and it failed".

Install verifies broker connectivity, not just process start
============================================================

Previously install ended once launchctl reported the unit loaded — a daemon that boots but cannot reach the broker (blocked :443, expired TLS, DNS, broker outage) only surfaced on the user's first peer list or send.

/v1/health now includes per-mesh broker WS state. install polls it for up to 15s after service boot and prints either "broker connected (mesh=...)" or a warning naming the meshes still in connecting state, with a hint at common causes. The verification is best-effort and does not fail the install — it just surfaces the issue early.

Tests
=====

4 new vitest cases cover the reaper paths: dead pid, live pid plus matching start-time, live pid plus mismatched start-time (PID reuse), and the no-start-time fallback. 83 of 83 pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -30,6 +30,7 @@
|
||||
*/
|
||||
|
||||
import { existsSync, readFileSync, statSync, unlinkSync, writeFileSync } from "node:fs";
|
||||
import { homedir } from "node:os";
|
||||
import { join } from "node:path";
|
||||
|
||||
import { ipc, IpcError } from "~/daemon/ipc/client.js";
|
||||
@@ -40,7 +41,11 @@ export type DaemonReadyState =
|
||||
| "started"
|
||||
| "down"
|
||||
| "spawn-failed"
|
||||
| "spawn-suppressed";
|
||||
| "spawn-suppressed"
|
||||
/** 1.31.0+: launchd / systemd manages the daemon and it didn't respond
|
||||
* within the service budget. Distinct from spawn-failed: the CLI did
|
||||
* not attempt to spawn (the OS owns the lifecycle). */
|
||||
| "service-not-ready";
|
||||
|
||||
export interface EnsureDaemonResult {
|
||||
state: DaemonReadyState;
|
||||
@@ -62,7 +67,16 @@ export interface EnsureDaemonOpts {
|
||||
const SPAWN_LOCK_FILE = () => join(DAEMON_PATHS.DAEMON_DIR, ".spawn.lock");
|
||||
const SPAWN_FAIL_FILE = () => join(DAEMON_PATHS.DAEMON_DIR, ".spawn-failure");
|
||||
const SPAWN_FAIL_TTL_MS = 30_000;
|
||||
const PROBE_TIMEOUT_MS = 800;
|
||||
// 1.31.0: 800 ms was too tight — the daemon's first IPC after a launchd
|
||||
// (re)start can take a beat while it migrates SQLite, opens broker WSes,
|
||||
// and warms up the event loop. False "stale" probes triggered the
|
||||
// pointless spawn → "socket did not appear" warning even on a perfectly
|
||||
// healthy service-managed daemon. 2500 ms still bounds the worst case.
|
||||
const PROBE_TIMEOUT_MS = 2_500;
|
||||
// When the daemon is service-managed (launchd/systemd) and KeepAlive=true,
|
||||
// the OS guarantees a restart on death — the CLI must NOT race that with
|
||||
// its own spawn. Just wait longer for the service unit to come up.
|
||||
const SERVICE_BUDGET_MS = 8_000;
|
||||
|
||||
let lastResultThisProcess: EnsureDaemonResult | null = null;
|
||||
|
||||
@@ -91,9 +105,30 @@ async function runEnsureDaemon(opts: EnsureDaemonOpts): Promise<EnsureDaemonResu
|
||||
// Step 1 — probe.
|
||||
const probe = await probeDaemon();
|
||||
if (probe === "up") return { state: "up", durationMs: Date.now() - t0 };
|
||||
|
||||
// Step 2 — service-managed shortcut. When launchd / systemd manages
|
||||
// the daemon and KeepAlive is set, the OS will restart a crashed
|
||||
// daemon on its own; the CLI must NOT race that with its own spawn
|
||||
// (would double-bind the singleton lock and trigger "daemon already
|
||||
// running" errors). Just wait quietly for the service to bring the
|
||||
// socket up.
|
||||
if (isServiceManaged()) {
|
||||
if (probe === "stale") cleanupStaleFiles();
|
||||
const polled = await pollForSocket(SERVICE_BUDGET_MS);
|
||||
if (polled.ok) return { state: "up", durationMs: Date.now() - t0 };
|
||||
const tool = process.platform === "darwin"
|
||||
? `launchctl print gui/$(id -u)/${SERVICE_LABEL}`
|
||||
: `systemctl --user status ${SYSTEMD_UNIT}`;
|
||||
return {
|
||||
state: "service-not-ready",
|
||||
durationMs: Date.now() - t0,
|
||||
reason: `service-managed daemon not responding within ${SERVICE_BUDGET_MS}ms (run \`${tool}\`)`,
|
||||
};
|
||||
}
|
||||
|
||||
if (probe === "stale") cleanupStaleFiles();
|
||||
|
||||
// Step 2 — auto-spawn unless forbidden.
|
||||
// Step 3 — auto-spawn unless forbidden.
|
||||
if (opts.noAutoSpawn) {
|
||||
return { state: "down", durationMs: Date.now() - t0, reason: "auto-spawn disabled" };
|
||||
}
|
||||
@@ -105,17 +140,37 @@ async function runEnsureDaemon(opts: EnsureDaemonOpts): Promise<EnsureDaemonResu
|
||||
};
|
||||
}
|
||||
|
||||
// Step 3 — spawn detached.
|
||||
// Step 4 — spawn detached.
|
||||
const spawnRes = await spawnDaemon(opts);
|
||||
if (spawnRes.ok) {
|
||||
return { state: "started", durationMs: Date.now() - t0 };
|
||||
}
|
||||
|
||||
// Step 4 — record failure for backoff and report.
|
||||
// Step 5 — record failure for backoff and report.
|
||||
markSpawnFailure();
|
||||
return { state: "spawn-failed", durationMs: Date.now() - t0, reason: spawnRes.reason };
|
||||
}
|
||||
|
||||
// launchd label for the daemon: names the LaunchAgents plist that
// isServiceManaged() checks for, and the `launchctl print` target shown
// in the service-not-ready hint.
const SERVICE_LABEL = "com.claudemesh.daemon";
|
||||
// systemd --user unit file name: the file isServiceManaged() checks for,
// and the `systemctl --user status` target shown in the service-not-ready
// hint.
const SYSTEMD_UNIT = "claudemesh-daemon.service";
|
||||
|
||||
/**
|
||||
* Returns true when the user has installed the daemon as a launchd
|
||||
* agent (macOS) or systemd --user unit (Linux). We detect by file
|
||||
* presence rather than shelling out to launchctl/systemctl on every
|
||||
* CLI invocation — this stays cheap and avoids spurious permission
|
||||
* prompts on locked-down hosts.
|
||||
*/
|
||||
function isServiceManaged(): boolean {
  // Resolve the per-platform unit file first, then do a single existence
  // check. Platforms without a known service manager short-circuit to
  // false before the home directory is ever looked up.
  let unitFile: string;
  switch (process.platform) {
    case "darwin":
      // launchd agent plist in the user's LaunchAgents directory.
      unitFile = join(homedir(), "Library", "LaunchAgents", `${SERVICE_LABEL}.plist`);
      break;
    case "linux":
      // systemd --user unit in the user's config directory.
      unitFile = join(homedir(), ".config", "systemd", "user", SYSTEMD_UNIT);
      break;
    default:
      return false;
  }
  return existsSync(unitFile);
}
|
||||
|
||||
async function probeDaemon(): Promise<"up" | "absent" | "stale"> {
|
||||
if (!existsSync(DAEMON_PATHS.SOCK_FILE)) return "absent";
|
||||
try {
|
||||
|
||||
Reference in New Issue
Block a user