From 1a14cef1e0a6a9b78a5eecf9c99fc5bdee4b53ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alejandro=20Guti=C3=A9rrez?= <35082514+alezmad@users.noreply.github.com> Date: Mon, 4 May 2026 14:05:44 +0100 Subject: [PATCH] =?UTF-8?q?feat(cli):=201.31.0=20=E2=80=94=20session=20aut?= =?UTF-8?q?oclean=20+=20broker=20verification=20+=20service=20path?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three operability fixes for users running the daemon under launchd or systemd. PID-watcher autoclean ===================== The session reaper already dropped registry entries with dead pids on a 30s loop, but had two real-world gaps: - 30s sweep let stale presence linger on the broker for half a minute - bare process.kill(pid, 0) trusts a recycled pid; a registry entry could survive its real owner's death whenever the OS rolled the pid number forward to a new program Process-exit IPC from claude-code is best-effort and skipped on SIGKILL / OOM / segfault / panic, so it cannot replace the sweep. Fix: - New process-info.ts captures opaque per-process start-times via ps -o lstart= (works on macOS and Linux, ~1 ms per call) - registerSession stores the start-time alongside the pid - reapDead drops entries when pid is dead OR start-time changed since register - Sweep cadence 30s -> 5s - Best-effort fallback to bare liveness when start-time capture fails at register time Registry hooks already close the per-session broker WS on deregister, so peer list rebuilds within one sweep of any session exit. Service-managed daemon: no more "spawn failed" false alarms =========================================================== After claudemesh install (which writes a launchd plist or systemd unit with KeepAlive=true), users routinely saw [claudemesh] warn daemon spawn failed: socket did not appear within 3000ms even when the daemon was running fine. Two contributing causes: 1. 
Probe timeout was 800ms — the first IPC after a launchd-driven restart can take longer (SQLite migration + broker WS opens) and tripped it. Bumped to 2500ms. 2. On a failed probe the CLI tried its own detached spawn, which collided with launchd's KeepAlive restart cycle (singleton lock fails, child exits) and we'd then time out polling for a socket that was actually about to come up. Now: when the launchd plist or systemd unit exists, the CLI does not attempt a spawn. It waits up to 8s for the OS-managed unit to bring the socket up. New service-not-ready state distinguishes "OS hasn't restarted it yet" from "we tried to spawn and it failed". Install verifies broker connectivity, not just process start ============================================================ Previously install ended once launchctl reported the unit loaded — a daemon that boots but cannot reach the broker (blocked :443, expired TLS, DNS, broker outage) only surfaced on the user's first peer list or send. /v1/health now includes per-mesh broker WS state. install polls it for up to 15s after service boot and prints either "broker connected (mesh=...)" or a warning naming the meshes still in connecting state, with a hint at common causes. The verification is best-effort and does not fail the install — it just surfaces the issue early. Tests ===== 4 new vitest cases cover the reaper paths: dead pid, live pid plus matching start-time, live pid plus mismatched start-time (PID reuse), and the no-start-time fallback. 83 of 83 pass. 
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> --- apps/cli/CHANGELOG.md | 103 +++++++++++++++++ apps/cli/package.json | 2 +- apps/cli/src/commands/install.ts | 51 ++++++++- apps/cli/src/daemon/ipc/server.ts | 12 +- apps/cli/src/daemon/process-info.ts | 46 ++++++++ apps/cli/src/daemon/session-registry.ts | 40 ++++++- apps/cli/src/entrypoints/cli.ts | 2 +- apps/cli/src/services/daemon/lifecycle.ts | 65 ++++++++++- apps/cli/src/ui/warnings.ts | 3 + apps/cli/tests/unit/session-reaper.test.ts | 127 +++++++++++++++++++++ 10 files changed, 436 insertions(+), 15 deletions(-) create mode 100644 apps/cli/src/daemon/process-info.ts create mode 100644 apps/cli/tests/unit/session-reaper.test.ts diff --git a/apps/cli/CHANGELOG.md b/apps/cli/CHANGELOG.md index 0b4f87f..677f6ec 100644 --- a/apps/cli/CHANGELOG.md +++ b/apps/cli/CHANGELOG.md @@ -1,5 +1,108 @@ # Changelog +## 1.31.0 (2026-05-04) — session autoclean, install-time broker verification, no more spurious cold-path warnings under service management + +**Three operability changes targeting users who installed the daemon as a launchd / systemd service.** + +### Session reaper now autocleans dead claude-code sessions + +The daemon's session registry already had a 30-second reaper that +deregistered entries whose pid was dead, but it had two gaps: + +- **Sweep cadence too slow.** Stale presence on the broker lingered for + up to half a minute after a session crashed. +- **No PID-reuse guard.** A recycled pid passes `kill(pid, 0)` even + though the original process is gone, so the registry could trust a + ghost. + +Process-exit IPC from claude-code itself isn't a viable replacement — +exit handlers don't run on `SIGKILL`, OOM, segfault, kernel panic, or +power loss. The reaper has to be the source of truth. + +What changed: + +- Reaper interval **30 s → 5 s**. +- On register, capture an opaque process start-time (`ps -o lstart=`, + works on macOS and Linux). Stored alongside the pid. 
+- On each sweep, an entry is reaped when the pid is dead **or** the + pid is alive but its start-time no longer matches what we captured. +- Registry hooks already close the per-session broker WS on + deregister, so `peer list` rebuilds within one sweep of any session + exit, no matter how the process died. + +Local-host scope only — cross-host registrations are skipped (the +daemon can't `kill -0` a remote pid). Best-effort fallback to bare +liveness when start-time capture fails (e.g., process already gone at +register time). + +### Service-managed daemon: no more "spawn failed" false alarms + +Users who installed via `claudemesh install` (which sets up +launchd/systemd with `KeepAlive=true`) saw spurious warnings: + +``` +[claudemesh] warn daemon spawn failed: socket did not appear within 3000ms +``` + +even when the daemon was healthy. Two contributing causes: + +1. **Probe timeout was 800 ms.** Tight enough that the first IPC after + a launchd-driven restart (which migrates SQLite + opens broker + WSes) routinely tripped it. Bumped to **2500 ms**. +2. **CLI raced launchd on respawn.** When the probe failed, the CLI + tried to spawn its own detached daemon, which collided with + launchd's own restart cycle (singleton lock fails, child exits) and + left the user with a 3-second timeout warning. Now: when the daemon + is installed as a service unit (`~/Library/LaunchAgents/com.claudemesh.daemon.plist` + or `~/.config/systemd/user/claudemesh-daemon.service` exist), the + CLI **does not attempt to spawn**. It waits up to 8 s for the OS to + bring the socket up, and only fails out with a service-specific + message pointing at `launchctl print` / `systemctl status` if the + service genuinely failed. + +New state `service-not-ready` distinguishes "OS-managed daemon hasn't +come up yet" from "we tried to spawn and it failed" — the latter no +longer fires when the daemon is service-managed. 
+ +### `claudemesh install` now verifies broker connectivity, not just process start + +Previously `install` ended once launchctl/systemctl reported the unit +loaded — but a daemon that boots and then can't reach the broker +(blocked outbound :443, expired TLS, DNS failure, broker outage) only +surfaced as a confusing failure on the user's first `peer list` or +`send`, sometimes hours later. + +`/v1/health` was extended to include per-mesh broker WS state: + +```json +{ "ok": true, "pid": 58837, "brokers": { "flexicar": "open", "openclaw": "connecting" } } +``` + +After service start, `install` polls `/v1/health` for up to 15 s and +prints either: + +``` +✔ broker connected (mesh=flexicar, 2 other meshes attaching) +``` + +or, on timeout: + +``` +warn broker did not reach open within 15s (flexicar=connecting, openclaw=connecting) + Check ~/.claudemesh/daemon/daemon.log for connect errors. + Common causes: outbound :443 blocked, expired TLS, DNS resolution. +``` + +The verification is best-effort and doesn't fail the install — it +just surfaces the issue early so the user can fix it before sending +their first message. + +### Tests + +4 new vitest cases cover the reaper paths: dead pid, live pid + +matching start-time, live pid + mismatched start-time (PID reuse), and +the no-start-time best-effort fallback. 
+ ## 1.30.2 (2026-05-04) — daemon service is multi-mesh by default `claudemesh install` was hardcoding `--mesh <slug>` into the diff --git a/apps/cli/package.json b/apps/cli/package.json index de53d2e..544ae98 100644 --- a/apps/cli/package.json +++ b/apps/cli/package.json @@ -1,6 +1,6 @@ { "name": "claudemesh-cli", - "version": "1.30.2", + "version": "1.31.0", "description": "Peer mesh for Claude Code sessions — CLI + MCP server.", "keywords": [ "claude-code", diff --git a/apps/cli/src/commands/install.ts b/apps/cli/src/commands/install.ts index f1cd2c4..48db5a9 100644 --- a/apps/cli/src/commands/install.ts +++ b/apps/cli/src/commands/install.ts @@ -434,7 +434,7 @@ function installStatusLine(): { installed: boolean } { return { installed: true }; } -export function runInstall(args: string[] = []): void { +export async function runInstall(args: string[] = []): Promise { const skipHooks = args.includes("--no-hooks"); const skipSkill = args.includes("--no-skill"); const skipService = args.includes("--no-service"); @@ -559,7 +559,7 @@ export function runInstall(args: string[] = []): void { // install-service --mesh <slug>` explicitly. if (!skipService && hasMeshes) { try { - installDaemonService(entry); + await installDaemonService(entry); } catch (e) { render.warn( `daemon service install failed: ${e instanceof Error ? e.message : String(e)}`, @@ -603,7 +603,7 @@ export function runInstall(args: string[] = []): void { * the user knows there's a problem before it shows up as "no messages * arriving." */ -function installDaemonService(binaryEntry: string): void { +async function installDaemonService(binaryEntry: string): Promise { const { installService, detectPlatform, @@ -652,7 +652,52 @@ function installDaemonService(binaryEntry: string): void { `daemon service installed but failed to start: ${e instanceof Error ? e.message : String(e)}`, `Run manually: ${r.bootCommand}`, ); + return; } + + // 1.31.0 — post-flight: verify the daemon actually establishes a + // broker WebSocket. 
Boots that fail silently here (DNS, expired TLS, + // outbound :443 blocked, broker outage) used to surface only when + // the user's first `peer list` or `send` failed half an hour later. + // Polling /v1/health gives a clear, install-time signal. + await verifyBrokerConnectivity(); +} + +async function verifyBrokerConnectivity(): Promise { + const VERIFY_BUDGET_MS = 15_000; + const POLL_INTERVAL_MS = 500; + const { ipc } = await import("~/daemon/ipc/client.js"); + const start = Date.now(); + let lastBrokers: Record = {}; + + while (Date.now() - start < VERIFY_BUDGET_MS) { + try { + const res = await ipc<{ ok: boolean; brokers?: Record }>({ + path: "/v1/health", + timeoutMs: 2_000, + }); + lastBrokers = res.body?.brokers ?? {}; + const openMesh = Object.entries(lastBrokers).find(([, s]) => s === "open"); + if (openMesh) { + const others = Object.entries(lastBrokers).filter(([slug]) => slug !== openMesh[0]); + const tail = others.length > 0 ? `, ${others.length} other mesh${others.length === 1 ? "" : "es"} attaching` : ""; + render.ok(`broker connected (mesh=${openMesh[0]}${tail})`); + return; + } + } catch { /* daemon may still be starting up; keep polling */ } + await new Promise((r) => setTimeout(r, POLL_INTERVAL_MS)); + } + + // Timed out without a single broker reaching `open`. Surface what we + // saw last so the user can act — this is exactly the bug class we + // want to catch at install time, not at first send. + const states = Object.keys(lastBrokers).length === 0 + ? "no health response from daemon" + : Object.entries(lastBrokers).map(([m, s]) => `${m}=${s}`).join(", "); + render.warn( + `broker did not reach open within ${Math.round(VERIFY_BUDGET_MS / 1000)}s (${states})`, + "Check ~/.claudemesh/daemon/daemon.log for connect errors. 
Common causes: outbound :443 blocked, expired TLS, DNS resolution.", + ); } export function runUninstall(): void { diff --git a/apps/cli/src/daemon/ipc/server.ts b/apps/cli/src/daemon/ipc/server.ts index 401018c..9165947 100644 --- a/apps/cli/src/daemon/ipc/server.ts +++ b/apps/cli/src/daemon/ipc/server.ts @@ -204,7 +204,17 @@ function makeHandler(opts: { } if (req.method === "GET" && url.pathname === "/v1/health") { - respond(res, 200, { ok: true, pid: process.pid }); + // 1.31.0: include per-mesh broker WS state so callers can verify + // functional connectivity, not just that the daemon process is + // running. Used by `claudemesh install` post-flight to wait for + // at least one broker to be `open` before declaring success — + // catches dead WS / DNS / TLS / outbound-blocked-port issues at + // install time instead of when the user's first message fails. + const brokers: Record = {}; + if (opts.brokers) { + for (const [slug, client] of opts.brokers) brokers[slug] = client.status; + } + respond(res, 200, { ok: true, pid: process.pid, brokers }); return; } diff --git a/apps/cli/src/daemon/process-info.ts b/apps/cli/src/daemon/process-info.ts new file mode 100644 index 0000000..a9a5c3d --- /dev/null +++ b/apps/cli/src/daemon/process-info.ts @@ -0,0 +1,46 @@ +/** + * Process-info helpers used by the session reaper to detect dead-pid AND + * pid-reuse safely. + * + * `process.kill(pid, 0)` alone is insufficient: a recently-recycled pid + * passes the liveness check even though the process registered under it + * is long gone. To avoid mistakenly trusting a recycled pid, we capture + * a stable per-process start-time at register, and compare it on each + * sweep — if it changed, treat the original process as dead. + * + * macOS + Linux both expose `ps -o lstart=` returning a fixed-format + * timestamp ("Sun May 4 09:14:00 2026"). Equality is the only operation + * the reaper needs, so we keep the value as an opaque string. 
+ */ + +import { execFileSync } from "node:child_process"; + +/** + * Returns a stable process-start identifier for `pid`, or null if the + * process is dead or unreachable. Cheap (~1 ms per call) — safe to use + * inside the 5-second reaper sweep. + */ +export function getProcessStartTime(pid: number): string | null { + if (!Number.isFinite(pid) || pid <= 0) return null; + try { + const out = execFileSync("ps", ["-o", "lstart=", "-p", String(pid)], { + encoding: "utf8", + timeout: 1_000, + stdio: ["ignore", "pipe", "ignore"], + }).trim(); + return out.length > 0 ? out : null; + } catch { + return null; + } +} + +/** Liveness-only probe (signal 0). Use together with start-time guard. */ +export function isPidAlive(pid: number): boolean { + if (!Number.isFinite(pid) || pid <= 0) return false; + try { + process.kill(pid, 0); + return true; + } catch { + return false; + } +} diff --git a/apps/cli/src/daemon/session-registry.ts b/apps/cli/src/daemon/session-registry.ts index cb4a58e..7f24130 100644 --- a/apps/cli/src/daemon/session-registry.ts +++ b/apps/cli/src/daemon/session-registry.ts @@ -10,7 +10,9 @@ * Lifecycle: * - register replaces any prior entry under the same `sessionId` * (handles re-launch and `--resume` flows cleanly). - * - reaper polls every 30 s and drops entries whose pid is dead. + * - reaper polls every 5 s. An entry is dropped when its pid is dead + * OR when its captured start-time no longer matches the running + * process (PID reuse — original is gone, OS recycled the number). * - hard ttl ceiling of 24 h is a leak guard for forgotten sessions. * * Persistence: in-memory only for v1. A daemon restart clears the @@ -20,6 +22,8 @@ * session have no token to begin with. */ +import { getProcessStartTime, isPidAlive } from "./process-info.js"; + /** * Optional per-launch presence material. 
Carried opaquely through the * registry; the daemon's session-broker subsystem (1.30.0+) reads it to @@ -51,6 +55,16 @@ export interface SessionInfo { groups?: string[]; /** 1.30.0+: per-launch presence material. */ presence?: SessionPresence; + /** + * 1.31.0+: opaque per-process start-time captured at register. The + * reaper compares the live value against this on every sweep — a + * mismatch means the original process exited and the pid was reused + * by an unrelated program, so the registry entry must be dropped. + * `undefined` when capture failed (process already dead at register + * time, ps unavailable, etc.) — the reaper falls back to bare + * liveness in that case. + */ + startTime?: string; registeredAt: number; } @@ -61,7 +75,7 @@ export interface RegistryHooks { } const TTL_MS = 24 * 60 * 60 * 1000; -const REAPER_INTERVAL_MS = 30 * 1000; +const REAPER_INTERVAL_MS = 5 * 1000; const byToken = new Map(); const bySessionId = new Map(); @@ -98,7 +112,12 @@ export function registerSession(info: Omit): Sessio } } - const stored: SessionInfo = { ...info, registeredAt: Date.now() }; + // Capture start-time at register so the reaper can detect PID reuse. + // Caller may pre-fill info.startTime (tests do this); only probe ps + // when the field is absent so we don't fork shell subprocesses in + // unit tests for fake pids. + const startTime = info.startTime ?? getProcessStartTime(info.pid) ?? 
undefined; + const stored: SessionInfo = { ...info, startTime, registeredAt: Date.now() }; byToken.set(info.token, stored); bySessionId.set(info.sessionId, info.token); try { hooks.onRegister?.(stored); } catch { /* see above */ } @@ -132,11 +151,24 @@ function reapDead(): void { const dead: string[] = []; for (const [token, info] of byToken.entries()) { if (Date.now() - info.registeredAt > TTL_MS) { dead.push(token); continue; } - try { process.kill(info.pid, 0); } catch { dead.push(token); } + if (!isPidAlive(info.pid)) { dead.push(token); continue; } + // PID reuse guard: process is alive, but if its start-time changed + // since register the original is gone and the OS recycled the pid + // for an unrelated program. Skip when we never captured a start- + // time (best-effort fallback to bare liveness above). + if (info.startTime !== undefined) { + const live = getProcessStartTime(info.pid); + if (live !== null && live !== info.startTime) { dead.push(token); continue; } + } } for (const t of dead) deregisterByToken(t); } +/** Test helper: run a single reaper pass synchronously. */ +export function _runReaperOnce(): void { + reapDead(); +} + /** Test helper. 
*/ export function _resetRegistry(): void { byToken.clear(); diff --git a/apps/cli/src/entrypoints/cli.ts b/apps/cli/src/entrypoints/cli.ts index 6b7a9a4..e193129 100644 --- a/apps/cli/src/entrypoints/cli.ts +++ b/apps/cli/src/entrypoints/cli.ts @@ -470,7 +470,7 @@ async function main(): Promise { } // Setup - case "install": { const { runInstall } = await import("~/commands/install.js"); runInstall(positionals); break; } + case "install": { const { runInstall } = await import("~/commands/install.js"); await runInstall(positionals); break; } case "uninstall": { const { uninstall } = await import("~/commands/uninstall.js"); process.exit(await uninstall()); break; } case "doctor": { const { runDoctor } = await import("~/commands/doctor.js"); await runDoctor(); break; } case "status": { diff --git a/apps/cli/src/services/daemon/lifecycle.ts b/apps/cli/src/services/daemon/lifecycle.ts index 4503cca..91f1b0c 100644 --- a/apps/cli/src/services/daemon/lifecycle.ts +++ b/apps/cli/src/services/daemon/lifecycle.ts @@ -30,6 +30,7 @@ */ import { existsSync, readFileSync, statSync, unlinkSync, writeFileSync } from "node:fs"; +import { homedir } from "node:os"; import { join } from "node:path"; import { ipc, IpcError } from "~/daemon/ipc/client.js"; @@ -40,7 +41,11 @@ export type DaemonReadyState = | "started" | "down" | "spawn-failed" - | "spawn-suppressed"; + | "spawn-suppressed" + /** 1.31.0+: launchd / systemd manages the daemon and it didn't respond + * within the service budget. Distinct from spawn-failed: the CLI did + * not attempt to spawn (the OS owns the lifecycle). 
*/ + | "service-not-ready"; export interface EnsureDaemonResult { state: DaemonReadyState; @@ -62,7 +67,16 @@ export interface EnsureDaemonOpts { const SPAWN_LOCK_FILE = () => join(DAEMON_PATHS.DAEMON_DIR, ".spawn.lock"); const SPAWN_FAIL_FILE = () => join(DAEMON_PATHS.DAEMON_DIR, ".spawn-failure"); const SPAWN_FAIL_TTL_MS = 30_000; -const PROBE_TIMEOUT_MS = 800; +// 1.31.0: 800 ms was too tight — the daemon's first IPC after a launchd +// (re)start can take a beat while it migrates SQLite, opens broker WSes, +// and warms up the event loop. False "stale" probes triggered the +// pointless spawn → "socket did not appear" warning even on a perfectly +// healthy service-managed daemon. 2500 ms still bounds the worst case. +const PROBE_TIMEOUT_MS = 2_500; +// When the daemon is service-managed (launchd/systemd) and KeepAlive=true, +// the OS guarantees a restart on death — the CLI must NOT race that with +// its own spawn. Just wait longer for the service unit to come up. +const SERVICE_BUDGET_MS = 8_000; let lastResultThisProcess: EnsureDaemonResult | null = null; @@ -91,9 +105,30 @@ async function runEnsureDaemon(opts: EnsureDaemonOpts): Promise { if (!existsSync(DAEMON_PATHS.SOCK_FILE)) return "absent"; try { diff --git a/apps/cli/src/ui/warnings.ts b/apps/cli/src/ui/warnings.ts index 4b653b5..d3fd187 100644 --- a/apps/cli/src/ui/warnings.ts +++ b/apps/cli/src/ui/warnings.ts @@ -50,6 +50,9 @@ export function warnDaemonState( case "spawn-failed": process.stderr.write(`${tag("warn")} daemon spawn failed${res.reason ? `: ${res.reason}` : ""} — using cold path ${hint("(check ~/.claudemesh/daemon/daemon.log)")}\n`); return true; + case "service-not-ready": + process.stderr.write(`${tag("warn")} ${res.reason ?? 
"service-managed daemon not responding"} — using cold path ${hint("(check ~/.claudemesh/daemon/daemon.log)")}\n`); + return true; } return false; } diff --git a/apps/cli/tests/unit/session-reaper.test.ts b/apps/cli/tests/unit/session-reaper.test.ts new file mode 100644 index 0000000..6e57a73 --- /dev/null +++ b/apps/cli/tests/unit/session-reaper.test.ts @@ -0,0 +1,127 @@ +/** + * Session reaper — PID-watcher autoclean (1.31.0). + * + * Verifies that registry entries are dropped when: + * 1. their pid is no longer alive, + * 2. their pid is alive but its start-time changed since register + * (PID reuse — original process gone, OS recycled the number). + * + * The reaper is the autoclean source-of-truth: process-exit IPC from + * the launched session is best-effort (skipped on SIGKILL, OOM, hard + * crash, kernel panic) so this sweep is what actually keeps the + * broker presence honest. Both signals must work or stale "ghost" + * sessions linger on the broker. + */ + +import { afterEach, describe, expect, test, vi } from "vitest"; + +import { + _resetRegistry, + _runReaperOnce, + listSessions, + registerSession, + setRegistryHooks, + type SessionInfo, +} from "../../src/daemon/session-registry.js"; + +afterEach(() => { + _resetRegistry(); + vi.restoreAllMocks(); +}); + +describe("session reaper", () => { + test("drops entry when pid is dead", () => { + const onDeregister = vi.fn(); + setRegistryHooks({ onDeregister }); + + // Use a high pid that is exceedingly unlikely to be alive on any + // host — the alive check uses signal 0 which returns ESRCH for + // unused pids. 
+ registerSession({ + token: "a".repeat(64), + sessionId: "sess-dead", + mesh: "m", + displayName: "x", + pid: 999_999, + startTime: "Fri May 1 09:00:00 2026", + }); + expect(listSessions()).toHaveLength(1); + + _runReaperOnce(); + + expect(listSessions()).toHaveLength(0); + expect(onDeregister).toHaveBeenCalledTimes(1); + const arg = onDeregister.mock.calls[0]![0] as SessionInfo; + expect(arg.sessionId).toBe("sess-dead"); + }); + + test("keeps entry when pid is alive and start-time matches", () => { + const onDeregister = vi.fn(); + setRegistryHooks({ onDeregister }); + + // Use the test runner's own pid (process.pid is always alive here) + // and capture its real start-time so the start-time guard sees a + // match. Without pre-seeding startTime, registerSession would + // probe ps and we'd race with that — explicit value keeps the + // test deterministic. + const { execFileSync } = require("node:child_process"); + const realStart = execFileSync("ps", ["-o", "lstart=", "-p", String(process.pid)], { + encoding: "utf8", + }).trim(); + + registerSession({ + token: "b".repeat(64), + sessionId: "sess-live", + mesh: "m", + displayName: "x", + pid: process.pid, + startTime: realStart, + }); + + _runReaperOnce(); + + expect(listSessions()).toHaveLength(1); + expect(onDeregister).not.toHaveBeenCalled(); + }); + + test("drops entry when pid is alive but start-time mismatched (PID reuse)", () => { + const onDeregister = vi.fn(); + setRegistryHooks({ onDeregister }); + + // Pid IS alive (process.pid) but we register a fake start-time + // that won't match. Reaper must reap. 
+ registerSession({ + token: "c".repeat(64), + sessionId: "sess-reused", + mesh: "m", + displayName: "x", + pid: process.pid, + startTime: "Sat Jan 1 00:00:00 1980", + }); + + _runReaperOnce(); + + expect(listSessions()).toHaveLength(0); + expect(onDeregister).toHaveBeenCalledTimes(1); + }); + + test("keeps entry when start-time wasn't captured (best-effort fallback)", () => { + const onDeregister = vi.fn(); + setRegistryHooks({ onDeregister }); + + // Register without startTime → reaper falls back to bare liveness. + // process.pid is alive, so the entry must survive. + registerSession({ + token: "d".repeat(64), + sessionId: "sess-no-start", + mesh: "m", + displayName: "x", + pid: process.pid, + }); + + _runReaperOnce(); + + expect(listSessions()).toHaveLength(1); + expect(onDeregister).not.toHaveBeenCalled(); + }); +});