feat(cli): 1.31.0 — session autoclean + broker verification + service path
Three operability fixes for users running the daemon under launchd or
systemd.

PID-watcher autoclean
=====================

The session reaper already dropped registry entries with dead pids on a
30s loop, but had two real-world gaps:

- 30s sweep let stale presence linger on the broker for half a minute
- bare process.kill(pid, 0) trusts a recycled pid; a registry entry
  could survive its real owner's death whenever the OS rolled the pid
  number forward to a new program

Process-exit IPC from claude-code is best-effort and skipped on SIGKILL
/ OOM / segfault / panic, so it cannot replace the sweep.

Fix:

- New process-info.ts captures opaque per-process start-times via
  ps -o lstart= (works on macOS and Linux, ~1 ms per call)
- registerSession stores the start-time alongside the pid
- reapDead drops entries when pid is dead OR start-time changed since
  register
- Sweep cadence 30s -> 5s
- Best-effort fallback to bare liveness when start-time capture fails
  at register time

Registry hooks already close the per-session broker WS on deregister,
so peer list rebuilds within one sweep of any session exit.

Service-managed daemon: no more "spawn failed" false alarms
===========================================================

After claudemesh install (which writes a launchd plist or systemd unit
with KeepAlive=true), users routinely saw

    [claudemesh] warn daemon spawn failed: socket did not appear within 3000ms

even when the daemon was running fine. Two contributing causes:

1. Probe timeout was 800ms — the first IPC after a launchd-driven
   restart can take longer (SQLite migration + broker WS opens) and
   tripped it. Bumped to 2500ms.
2. On a failed probe the CLI tried its own detached spawn, which
   collided with launchd's KeepAlive restart cycle (singleton lock
   fails, child exits) and we'd then time out polling for a socket that
   was actually about to come up.

Now: when the launchd plist or systemd unit exists, the CLI does not
attempt a spawn. It waits up to 8s for the OS-managed unit to bring the
socket up. New service-not-ready state distinguishes "OS hasn't
restarted it yet" from "we tried to spawn and it failed".

Install verifies broker connectivity, not just process start
============================================================

Previously install ended once launchctl reported the unit loaded — a
daemon that boots but cannot reach the broker (blocked :443, expired
TLS, DNS, broker outage) only surfaced on the user's first peer list or
send.

/v1/health now includes per-mesh broker WS state. install polls it for
up to 15s after service boot and prints either
"broker connected (mesh=...)" or a warning naming the meshes still in
connecting state, with a hint at common causes. The verification is
best-effort and does not fail the install — it just surfaces the issue
early.

Tests
=====

4 new vitest cases cover the reaper paths: dead pid, live pid plus
matching start-time, live pid plus mismatched start-time (PID reuse),
and the no-start-time fallback. 83 of 83 pass.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -434,7 +434,7 @@ function installStatusLine(): { installed: boolean } {
|
||||
return { installed: true };
|
||||
}
|
||||
|
||||
export function runInstall(args: string[] = []): void {
|
||||
export async function runInstall(args: string[] = []): Promise<void> {
|
||||
const skipHooks = args.includes("--no-hooks");
|
||||
const skipSkill = args.includes("--no-skill");
|
||||
const skipService = args.includes("--no-service");
|
||||
@@ -559,7 +559,7 @@ export function runInstall(args: string[] = []): void {
|
||||
// install-service --mesh <slug>` explicitly.
|
||||
if (!skipService && hasMeshes) {
|
||||
try {
|
||||
installDaemonService(entry);
|
||||
await installDaemonService(entry);
|
||||
} catch (e) {
|
||||
render.warn(
|
||||
`daemon service install failed: ${e instanceof Error ? e.message : String(e)}`,
|
||||
@@ -603,7 +603,7 @@ export function runInstall(args: string[] = []): void {
|
||||
* the user knows there's a problem before it shows up as "no messages
|
||||
* arriving."
|
||||
*/
|
||||
function installDaemonService(binaryEntry: string): void {
|
||||
async function installDaemonService(binaryEntry: string): Promise<void> {
|
||||
const {
|
||||
installService,
|
||||
detectPlatform,
|
||||
@@ -652,7 +652,52 @@ function installDaemonService(binaryEntry: string): void {
|
||||
`daemon service installed but failed to start: ${e instanceof Error ? e.message : String(e)}`,
|
||||
`Run manually: ${r.bootCommand}`,
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
// 1.31.0 — post-flight: verify the daemon actually establishes a
|
||||
// broker WebSocket. Boots that fail silently here (DNS, expired TLS,
|
||||
// outbound :443 blocked, broker outage) used to surface only when
|
||||
// the user's first `peer list` or `send` failed half an hour later.
|
||||
// Polling /v1/health gives a clear, install-time signal.
|
||||
await verifyBrokerConnectivity();
|
||||
}
|
||||
|
||||
/**
 * Post-install check that the daemon has at least one broker connection
 * in the `open` state.
 *
 * Polls the daemon's `/v1/health` IPC endpoint every 500 ms (2 s timeout
 * per call) for up to 15 s. Prints an ok line naming the first mesh that
 * reports `open`, or a warning describing the last observed state once
 * the budget is exhausted. IPC failures while polling are swallowed so a
 * daemon that is still starting up does not abort the check.
 *
 * @returns Resolves once a result line has been rendered.
 */
async function verifyBrokerConnectivity(): Promise<void> {
  const VERIFY_BUDGET_MS = 15_000;
  const POLL_INTERVAL_MS = 500;
  const { ipc } = await import("~/daemon/ipc/client.js");
  const deadline = Date.now() + VERIFY_BUDGET_MS;
  let lastBrokers: Record<string, string> = {};
  // Tracked separately from `lastBrokers`: an empty broker map is not the
  // same failure as a daemon that never answered the health probe.
  let daemonResponded = false;

  while (Date.now() < deadline) {
    try {
      const res = await ipc<{ ok: boolean; brokers?: Record<string, string> }>({
        path: "/v1/health",
        timeoutMs: 2_000,
      });
      if (res.body != null) daemonResponded = true;
      lastBrokers = res.body?.brokers ?? {};

      const entries = Object.entries(lastBrokers);
      const openMesh = entries.find(([, state]) => state === "open");
      if (openMesh) {
        // Only meshes that are not yet open are reported as "attaching".
        const pending = entries.filter(([, state]) => state !== "open").length;
        const tail = pending > 0 ? `, ${pending} other mesh${pending === 1 ? "" : "es"} attaching` : "";
        render.ok(`broker connected (mesh=${openMesh[0]}${tail})`);
        return;
      }
    } catch { /* daemon may still be starting up; keep polling */ }

    // Never sleep past the deadline; the timeout warning should not be
    // delayed by a poll interval that can no longer lead to another probe.
    const remaining = deadline - Date.now();
    if (remaining <= 0) break;
    await new Promise<void>((resolve) => setTimeout(resolve, Math.min(POLL_INTERVAL_MS, remaining)));
  }

  // Timed out without a single broker reaching `open`. Surface what we
  // saw last so the user can act at install time rather than at first send.
  let states: string;
  if (!daemonResponded) {
    states = "no health response from daemon";
  } else if (Object.keys(lastBrokers).length === 0) {
    states = "daemon responded but reported no brokers";
  } else {
    states = Object.entries(lastBrokers).map(([mesh, state]) => `${mesh}=${state}`).join(", ");
  }
  render.warn(
    `broker did not reach open within ${Math.round(VERIFY_BUDGET_MS / 1000)}s (${states})`,
    "Check ~/.claudemesh/daemon/daemon.log for connect errors. Common causes: outbound :443 blocked, expired TLS, DNS resolution.",
  );
}
|
||||
|
||||
export function runUninstall(): void {
|
||||
|
||||
Reference in New Issue
Block a user