diff --git a/apps/cli/CHANGELOG.md b/apps/cli/CHANGELOG.md index 677f6ec..e6fba60 100644 --- a/apps/cli/CHANGELOG.md +++ b/apps/cli/CHANGELOG.md @@ -1,5 +1,33 @@ # Changelog +## 1.31.1 (2026-05-04) — hotfix: reaper stops blocking the daemon event loop + +1.31.0 shipped a session reaper that called `execFileSync("ps")` +synchronously, once per registered session, every 5 seconds. With ten +or more sessions registered the daemon's event loop stalled for +hundreds of milliseconds at a time — long enough that incoming +`/v1/version` probes from the CLI failed to return within the 2.5 s +budget and the new "service-managed daemon not responding within +8000ms" warning fired against a perfectly healthy daemon. + +Fix: + +- `getProcessStartTime` is now async (`execFile` + promisify), never + blocks the event loop. +- New `getProcessStartTimes(pids)` issues a single batched `ps -p + ,,...` for every survivor in one fork — sweep cost is fixed + regardless of session count. +- `registerSession` stays synchronous: the start-time capture runs + fire-and-forget so the IPC route returns instantly. The reaper falls + back to bare liveness for the brief window before the start-time + lands. +- `reapDead` is now async; the setInterval wrapper voids it so a + rejected sweep can never crash the daemon. + +Behavior is otherwise unchanged from 1.31.0 — same 5 s cadence, same +PID-reuse guard semantics, same broker-WS teardown via the registry +hook. + ## 1.31.0 (2026-05-04) — session autoclean, install-time broker verification, no more spurious cold-path warnings under service management **Three operability changes targeting users who installed the daemon as a launchd / systemd service.** diff --git a/apps/cli/package.json b/apps/cli/package.json index 544ae98..932b7cf 100644 --- a/apps/cli/package.json +++ b/apps/cli/package.json @@ -1,6 +1,6 @@ { "name": "claudemesh-cli", - "version": "1.31.0", + "version": "1.31.1", "description": "Peer mesh for Claude Code sessions — CLI + MCP server.", "keywords": [ "claude-code", diff --git a/apps/cli/src/daemon/process-info.ts b/apps/cli/src/daemon/process-info.ts index a9a5c3d..addb06d 100644 --- a/apps/cli/src/daemon/process-info.ts +++ b/apps/cli/src/daemon/process-info.ts @@ -9,31 +9,83 @@ * sweep — if it changed, treat the original process as dead. * * macOS + Linux both expose `ps -o lstart=` returning a fixed-format - * timestamp ("Sun May 4 09:14:00 2026"). Equality is the only operation - * the reaper needs, so we keep the value as an opaque string. + * timestamp ("Sun May 4 09:14:00 2026"). Equality is the only + * operation the reaper needs, so we keep the value as an opaque string. + * + * IMPORTANT (1.31.1): every fork / execFile blocks the daemon's event + * loop until ps completes (~30-80 ms per call on macOS). The first + * 1.31.0 implementation called execFileSync once per registered + * session every 5 s, and with 10+ sessions that stalled IPC for hundreds + * of milliseconds at a time — long enough that probes against + * /v1/version were declared "stale" and the CLI fell back to the cold + * path with the misleading "service-managed daemon not responding" + * warning. This module now exposes: + * + * - `getProcessStartTime(pid)`: async, single-pid, used at register. + * - `getProcessStartTimes(pids)`: async, batched, used by the reaper. + * One ps invocation handles N pids, so the per-sweep cost is fixed + * and tiny regardless of how many sessions are registered. */ -import { execFileSync } from "node:child_process"; +import { execFile } from "node:child_process"; +import { promisify } from "node:util"; + +const execFileAsync = promisify(execFile); /** * Returns a stable process-start identifier for `pid`, or null if the - * process is dead or unreachable. Cheap (~1 ms per call) — safe to use - * inside the 5-second reaper sweep. + * process is dead or unreachable. Async — never blocks the event loop. */ -export function getProcessStartTime(pid: number): string | null { +export async function getProcessStartTime(pid: number): Promise { if (!Number.isFinite(pid) || pid <= 0) return null; try { - const out = execFileSync("ps", ["-o", "lstart=", "-p", String(pid)], { + const { stdout } = await execFileAsync("ps", ["-o", "lstart=", "-p", String(pid)], { encoding: "utf8", timeout: 1_000, - stdio: ["ignore", "pipe", "ignore"], - }).trim(); + }); + const out = stdout.trim(); return out.length > 0 ? out : null; } catch { return null; } } +/** + * Batched form: returns a Map for every pid that is still + * alive. Pids that ps doesn't return (i.e. dead) are absent from the + * map. One ps fork handles all pids — O(1) sweep cost regardless of + * session count. + */ +export async function getProcessStartTimes(pids: number[]): Promise> { + const result = new Map(); + const valid = pids.filter((p) => Number.isFinite(p) && p > 0); + if (valid.length === 0) return result; + // ps -o pid,lstart= -p p1,p2,... emits one row per live pid: + // " 12345 Sun May 4 09:14:00 2026" + // Dead pids are silently omitted. + try { + const { stdout } = await execFileAsync( + "ps", + ["-o", "pid=,lstart=", "-p", valid.join(",")], + { encoding: "utf8", timeout: 2_000 }, + ); + for (const raw of stdout.split("\n")) { + const line = raw.trim(); + if (!line) continue; + const m = /^(\d+)\s+(.+)$/.exec(line); + if (!m) continue; + const pid = Number.parseInt(m[1]!, 10); + const lstart = m[2]!.trim(); + if (Number.isFinite(pid) && lstart.length > 0) result.set(pid, lstart); + } + } catch { + // ps failure (timeout, ENOENT) — treat as "no info available" and + // let the reaper fall back to bare liveness for these pids. Better + // to keep entries than to nuke them on a transient ps error. + } + return result; +} + /** Liveness-only probe (signal 0). Use together with start-time guard. */ export function isPidAlive(pid: number): boolean { if (!Number.isFinite(pid) || pid <= 0) return false; diff --git a/apps/cli/src/daemon/session-registry.ts b/apps/cli/src/daemon/session-registry.ts index 7f24130..d79c7a8 100644 --- a/apps/cli/src/daemon/session-registry.ts +++ b/apps/cli/src/daemon/session-registry.ts @@ -22,7 +22,7 @@ * session have no token to begin with. */ -import { getProcessStartTime, isPidAlive } from "./process-info.js"; +import { getProcessStartTime, getProcessStartTimes, isPidAlive } from "./process-info.js"; /** * Optional per-launch presence material. Carried opaquely through the @@ -85,7 +85,10 @@ let reaperHandle: NodeJS.Timeout | null = null; export function startReaper(): void { if (reaperHandle) return; - reaperHandle = setInterval(reapDead, REAPER_INTERVAL_MS).unref?.() ?? reaperHandle; + // The sweep is async (batched ps) — wrap in `void` so setInterval + // doesn't try to await us, and so an unexpected throw doesn't crash + // the daemon. Errors are swallowed inside reapDead. + reaperHandle = setInterval(() => { void reapDead(); }, REAPER_INTERVAL_MS).unref?.() ?? reaperHandle; } export function stopReaper(): void { @@ -112,18 +115,31 @@ export function registerSession(info: Omit): Sessio } } - // Capture start-time at register so the reaper can detect PID reuse. - // Caller may pre-fill info.startTime (tests do this); only probe ps - // when the field is absent so we don't fork shell subprocesses in - // unit tests for fake pids. - const startTime = info.startTime ?? getProcessStartTime(info.pid) ?? undefined; - const stored: SessionInfo = { ...info, startTime, registeredAt: Date.now() }; + // Caller may pre-fill info.startTime (tests do this for determinism). + // For the real path we fire-and-forget an async ps probe — register + // stays sync and microsecond-fast, and the start-time lands on the + // entry within a few ms. Until it lands, the reaper falls back to + // bare liveness for this entry, which is fine for the common case + // (PID reuse is rare; the brief window without the guard is + // tolerable). + const stored: SessionInfo = { ...info, registeredAt: Date.now() }; byToken.set(info.token, stored); bySessionId.set(info.sessionId, info.token); try { hooks.onRegister?.(stored); } catch { /* see above */ } + if (stored.startTime === undefined) { + void captureStartTimeAsync(info.token, info.pid); + } return stored; } +async function captureStartTimeAsync(token: string, pid: number): Promise { + const lstart = await getProcessStartTime(pid); + if (lstart === null) return; + const entry = byToken.get(token); + if (!entry || entry.pid !== pid) return; // entry was replaced; skip + entry.startTime = lstart; +} + export function deregisterByToken(token: string): boolean { const entry = byToken.get(token); if (!entry) return false; @@ -147,26 +163,52 @@ export function listSessions(): SessionInfo[] { return [...byToken.values()]; } -function reapDead(): void { +async function reapDead(): Promise { + // Snapshot first; the second (async) phase calls ps and we must not + // mutate the registry mid-iteration. + const entries = [...byToken.entries()]; + + // Phase 1 — TTL + bare liveness. Sync, microsecond-fast. const dead: string[] = []; - for (const [token, info] of byToken.entries()) { + const survivors: Array<[string, SessionInfo]> = []; + for (const [token, info] of entries) { if (Date.now() - info.registeredAt > TTL_MS) { dead.push(token); continue; } if (!isPidAlive(info.pid)) { dead.push(token); continue; } - // PID reuse guard: process is alive, but if its start-time changed - // since register the original is gone and the OS recycled the pid - // for an unrelated program. Skip when we never captured a start- - // time (best-effort fallback to bare liveness above). - if (info.startTime !== undefined) { - const live = getProcessStartTime(info.pid); - if (live !== null && live !== info.startTime) { dead.push(token); continue; } + survivors.push([token, info]); + } + + // Phase 2 — PID-reuse guard for survivors that have a captured + // start-time. Single batched ps call: O(1) forks regardless of + // session count. Survivors without a start-time keep the bare- + // liveness verdict from phase 1 (their captureStartTimeAsync may + // still be in-flight from a recent register). + const guardedPids = survivors + .filter(([, info]) => info.startTime !== undefined) + .map(([, info]) => info.pid); + if (guardedPids.length > 0) { + try { + const live = await getProcessStartTimes(guardedPids); + for (const [token, info] of survivors) { + if (info.startTime === undefined) continue; + const lstart = live.get(info.pid); + // ps may transiently miss a pid that was alive when isPidAlive + // ran — treat absence as "racing", let the next sweep decide. + if (lstart === undefined) continue; + if (lstart !== info.startTime) dead.push(token); + } + } catch { + // ps failure here is non-fatal: survivors keep their phase-1 + // verdict. Logging is the daemon's responsibility — the + // registry deliberately stays log-free. } } + for (const t of dead) deregisterByToken(t); } -/** Test helper: run a single reaper pass synchronously. */ -export function _runReaperOnce(): void { - reapDead(); +/** Test helper: run a single reaper pass. */ +export async function _runReaperOnce(): Promise { + await reapDead(); } /** Test helper. */ diff --git a/apps/cli/tests/unit/session-reaper.test.ts b/apps/cli/tests/unit/session-reaper.test.ts index 6e57a73..ec98676 100644 --- a/apps/cli/tests/unit/session-reaper.test.ts +++ b/apps/cli/tests/unit/session-reaper.test.ts @@ -30,7 +30,7 @@ afterEach(() => { }); describe("session reaper", () => { - test("drops entry when pid is dead", () => { + test("drops entry when pid is dead", async () => { const onDeregister = vi.fn(); setRegistryHooks({ onDeregister }); @@ -47,7 +47,7 @@ describe("session reaper", () => { }); expect(listSessions()).toHaveLength(1); - _runReaperOnce(); + await _runReaperOnce(); expect(listSessions()).toHaveLength(0); expect(onDeregister).toHaveBeenCalledTimes(1); @@ -55,15 +55,14 @@ describe("session reaper", () => { expect(arg.sessionId).toBe("sess-dead"); }); - test("keeps entry when pid is alive and start-time matches", () => { + test("keeps entry when pid is alive and start-time matches", async () => { const onDeregister = vi.fn(); setRegistryHooks({ onDeregister }); // Use the test runner's own pid (process.pid is always alive here) // and capture its real start-time so the start-time guard sees a - // match. Without pre-seeding startTime, registerSession would - // probe ps and we'd race with that — explicit value keeps the - // test deterministic. + // match. Pre-seed startTime so registerSession's async ps probe + // doesn't race the test. const { execFileSync } = require("node:child_process"); const realStart = execFileSync("ps", ["-o", "lstart=", "-p", String(process.pid)], { encoding: "utf8", @@ -78,13 +77,13 @@ describe("session reaper", () => { startTime: realStart, }); - _runReaperOnce(); + await _runReaperOnce(); expect(listSessions()).toHaveLength(1); expect(onDeregister).not.toHaveBeenCalled(); }); - test("drops entry when pid is alive but start-time mismatched (PID reuse)", () => { + test("drops entry when pid is alive but start-time mismatched (PID reuse)", async () => { const onDeregister = vi.fn(); setRegistryHooks({ onDeregister }); @@ -99,27 +98,30 @@ describe("session reaper", () => { startTime: "Sat Jan 1 00:00:00 1980", }); - _runReaperOnce(); + await _runReaperOnce(); expect(listSessions()).toHaveLength(0); expect(onDeregister).toHaveBeenCalledTimes(1); }); - test("keeps entry when start-time wasn't captured (best-effort fallback)", () => { + test("keeps entry when start-time wasn't captured (best-effort fallback)", async () => { const onDeregister = vi.fn(); setRegistryHooks({ onDeregister }); // Register without startTime → reaper falls back to bare liveness. - // process.pid is alive, so the entry must survive. + // process.pid is alive, so the entry must survive. (The fire-and- + // forget capture inside registerSession will eventually populate + // startTime, but it does so after a real fork — for this test we + // rely on the synchronous reaper pass not seeing it yet.) registerSession({ token: "d".repeat(64), - sessionId: "sess-no-start", + sessionId: "sess-no-start-" + Math.random().toString(36).slice(2), mesh: "m", displayName: "x", pid: process.pid, }); - _runReaperOnce(); + await _runReaperOnce(); expect(listSessions()).toHaveLength(1); expect(onDeregister).not.toHaveBeenCalled();