Files
claudemesh/apps/cli/src/daemon/session-registry.ts
Alejandro Gutiérrez 15b7920b2a
Some checks failed
CI / Lint (push) Has been cancelled
CI / Typecheck (push) Has been cancelled
CI / Broker tests (Postgres) (push) Has been cancelled
CI / Docker build (linux/amd64) (push) Has been cancelled
fix(cli): 1.31.1 — reaper no longer blocks the daemon event loop
1.31.0 introduced a session reaper that called execFileSync(ps) once
per registered session every 5s. With many sessions registered, the
daemon's event loop stalled for hundreds of ms — long enough that
incoming /v1/version probes from the CLI timed out against a healthy
daemon and the new service-managed warning fired.

Fix:

- getProcessStartTime is now async (execFile + promisify); never
  blocks the event loop
- New getProcessStartTimes(pids) issues one batched ps for all
  survivors instead of N separate forks. Sweep cost is fixed
  regardless of session count.
- registerSession stays sync; start-time capture is fire-and-forget
- reapDead is now async; the setInterval wrapper voids it so a
  rejected sweep cannot crash the daemon

Behavior is otherwise unchanged from 1.31.0: same 5s cadence, same
PID-reuse guard semantics, same broker-WS teardown via the registry
hook. 83/83 tests still green.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-04 14:15:48 +01:00

221 lines
8.0 KiB
TypeScript

/**
* In-memory per-token session registry kept by the daemon.
*
* `claudemesh launch` POSTs `/v1/sessions/register` with the token it
* minted plus session metadata (sessionId, mesh, displayName, pid,
* cwd, role, groups). Subsequent CLI invocations from inside that
* session present the token via `Authorization: ClaudeMesh-Session
* <hex>` and the daemon's IPC auth middleware resolves it here in O(1).
*
* Lifecycle:
* - register replaces any prior entry under the same `sessionId`
* (handles re-launch and `--resume` flows cleanly).
* - reaper polls every 5 s. An entry is dropped when its pid is dead
* OR when its captured start-time no longer matches the running
* process (PID reuse — original is gone, OS recycled the number).
* - hard TTL ceiling of 24 h is a leak guard for forgotten sessions;
* it is enforced by the reaper and, lazily, on every token lookup.
*
* Persistence: in-memory only for v1. A daemon restart clears the
* registry — every launched session needs to re-register. That's fine
* for now because launch.ts re-registers on `ensureDaemonRunning`'s
* success path, and most ad-hoc CLI invocations from outside a launched
* session have no token to begin with.
*/
import { getProcessStartTime, getProcessStartTimes, isPidAlive } from "./process-info.js";
/**
 * Optional per-launch presence material. Carried opaquely through the
 * registry; the daemon's session-broker subsystem (1.30.0+) reads it to
 * open a long-lived broker WebSocket per session. Absent on older CLIs
 * — register accepts payloads without it for backward compat.
 *
 * The registry code visible in this file never inspects or validates
 * these fields; it stores them and hands them back as-is.
 */
export interface SessionPresence {
/** Hex ed25519 pubkey, 64 chars. */
sessionPubkey: string;
/** Hex ed25519 secret key (held in-memory only; never disk). */
sessionSecretKey: string;
/** Parent-member-signed attestation; see signParentAttestation. */
parentAttestation: {
/** Session pubkey the attestation was issued for — presumably equal to the outer `sessionPubkey`; TODO confirm. */
sessionPubkey: string;
/** Pubkey of the parent member that signed — presumably hex like the keys above; TODO confirm. */
parentMemberPubkey: string;
/** Expiry of the attestation. NOTE(review): the unit (ms vs s since epoch) is not visible from this file — confirm. */
expiresAt: number;
/** Signature over the attestation; see signParentAttestation for the signed payload (not visible from this file). */
signature: string;
};
}
/**
 * One registry entry: what the daemon remembers about a registered
 * session. Entries are looked up by `token`, and at most one entry is
 * kept per `sessionId`.
 */
export interface SessionInfo {
/** Session token; key of the primary lookup map. */
token: string;
/** Session identifier; registering again under the same id with a new token replaces the old entry. */
sessionId: string;
/** Mesh the session belongs to. Stored as metadata; not interpreted by the registry. */
mesh: string;
/** Display name. Stored as metadata; not interpreted by the registry. */
displayName: string;
/** Process id the reaper probes for liveness (and, with `startTime`, for PID reuse). */
pid: number;
/** Working directory of the session, when supplied. Not interpreted by the registry. */
cwd?: string;
/** Role of the session, when supplied. Not interpreted by the registry. */
role?: string;
/** Groups of the session, when supplied. Not interpreted by the registry. */
groups?: string[];
/** 1.30.0+: per-launch presence material. */
presence?: SessionPresence;
/**
 * 1.31.0+: opaque per-process start-time. Either pre-filled by the
 * caller of registerSession or filled in shortly after registration
 * by an asynchronous probe. The reaper compares the live value against
 * this on every sweep — a mismatch means the original process exited
 * and the pid was reused by an unrelated program, so the registry
 * entry must be dropped.
 * `undefined` while the probe is still in flight or when capture
 * failed (process already dead at register time, ps unavailable,
 * etc.) — the reaper falls back to bare liveness in that case.
 */
startTime?: string;
/** `Date.now()` at registration (ms since epoch); the 24 h TTL is measured from this value. */
registeredAt: number;
}
/**
 * Lifecycle callbacks invoked synchronously after registry mutation.
 *
 * Both are optional. The registry wraps every hook call in try/catch
 * and discards whatever the hook throws, so a failing hook cannot
 * interrupt a registry operation.
 */
export interface RegistryHooks {
/** Called with the freshly stored entry, after it has been added to the lookup maps. */
onRegister?: (info: SessionInfo) => void;
/** Called with the removed entry, after it has been deleted from the token map. */
onDeregister?: (info: SessionInfo) => void;
}
/** Hard ceiling on an entry's lifetime (24 h), measured from `registeredAt`. */
const TTL_MS = 1000 * 60 * 60 * 24;
/** Delay between two reaper sweeps (5 s). */
const REAPER_INTERVAL_MS = 1000 * 5;
/** Primary index: session token → registry entry. */
const byToken: Map<string, SessionInfo> = new Map();
/** Secondary index: sessionId → token of the entry registered under that id. */
const bySessionId: Map<string, string> = new Map();
/** Currently bound lifecycle hooks; mutated in place, never reassigned. */
const hooks: RegistryHooks = {};
/** Timer driving the periodic reaper, or `null` while it is not running. */
let reaperHandle: NodeJS.Timeout | null = null;
/**
 * Start the periodic reaper. A no-op when it is already running.
 *
 * The timer is unref'ed (where the runtime supports it) so that it
 * never keeps the process alive on its own.
 */
export function startReaper(): void {
  if (reaperHandle) return;
  const handle = setInterval(() => {
    // The sweep is async. `void` alone only discards the promise — it
    // does NOT handle a rejection, and an unhandled rejection terminates
    // Node by default. Attach an explicit handler so a failed sweep can
    // never take the daemon down; the next tick simply retries.
    void reapDead().catch(() => { /* best-effort sweep; registry stays log-free */ });
  }, REAPER_INTERVAL_MS);
  // Keep the handle regardless of whether `unref` exists. Chaining
  // `.unref?.() ?? …` would lose it on runtimes without `unref`, leaving
  // an interval that stopReaper() can no longer clear.
  handle.unref?.();
  reaperHandle = handle;
}
/** Stop the periodic reaper if it is running; calling it again is harmless. */
export function stopReaper(): void {
  if (!reaperHandle) return;
  clearInterval(reaperHandle);
  reaperHandle = null;
}
/**
 * Bind the daemon-level lifecycle hooks.
 *
 * Both slots are overwritten on every call, so a hook missing from
 * `next` is unbound — passing `{}` clears everything. Safe to call
 * repeatedly (tests re-bind between cases).
 */
export function setRegistryHooks(next: RegistryHooks): void {
  const { onRegister, onDeregister } = next;
  hooks.onRegister = onRegister;
  hooks.onDeregister = onDeregister;
}
/**
 * Add a session to the registry and return the stored entry.
 *
 * - If another token currently owns `info.sessionId`, that entry is
 *   evicted first and `onDeregister` fires for it (re-launch and
 *   `--resume` flows).
 * - If `info.token` is already registered, its entry is overwritten in
 *   place; no `onDeregister` fires for the overwritten entry.
 * - `registeredAt` is stamped with `Date.now()`.
 * - When `info.startTime` is not supplied, an asynchronous probe fills
 *   it in later; registration itself stays synchronous.
 *
 * Hook exceptions are swallowed.
 *
 * @param info Session data without `registeredAt`.
 * @returns The stored entry (a fresh object, not `info` itself).
 */
export function registerSession(info: Omit<SessionInfo, "registeredAt">): SessionInfo {
  // Replace any prior entry under the same sessionId.
  const priorToken = bySessionId.get(info.sessionId);
  if (priorToken !== undefined && priorToken !== info.token) {
    const prior = byToken.get(priorToken);
    if (prior) {
      byToken.delete(priorToken);
      try { hooks.onDeregister?.(prior); } catch { /* hook errors must never break the registry */ }
    }
  }
  // Same token re-registered under a *different* sessionId: drop the old
  // sessionId → token mapping. Left in place it would leak, and a later
  // register under the old sessionId would evict this unrelated entry.
  const overwritten = byToken.get(info.token);
  if (
    overwritten &&
    overwritten.sessionId !== info.sessionId &&
    bySessionId.get(overwritten.sessionId) === info.token
  ) {
    bySessionId.delete(overwritten.sessionId);
  }
  // Caller may pre-fill info.startTime (tests do this for determinism).
  // For the real path we fire-and-forget an async probe — register stays
  // sync, and the start-time lands on the entry once the probe resolves.
  // Until then the reaper falls back to bare liveness for this entry,
  // which is tolerable because PID reuse within that window is rare.
  const stored: SessionInfo = { ...info, registeredAt: Date.now() };
  byToken.set(info.token, stored);
  bySessionId.set(info.sessionId, info.token);
  try { hooks.onRegister?.(stored); } catch { /* see above */ }
  if (stored.startTime === undefined) {
    // `void` only discards the promise; the explicit catch is what keeps
    // a failed probe from surfacing as an unhandled rejection.
    void captureStartTimeAsync(info.token, info.pid).catch(() => { /* entry keeps bare-liveness checks */ });
  }
  return stored;
}
/**
 * Probe the start-time of `pid` and record it on the entry registered
 * under `token`.
 *
 * Designed to be called fire-and-forget, so it never rejects: a failed
 * probe simply leaves the entry without a start-time. The result is
 * discarded when the entry has meanwhile disappeared, been replaced by
 * one with a different pid, or already carries a start-time.
 *
 * @param token Token of the entry to update.
 * @param pid   Process id that was registered with that token.
 */
async function captureStartTimeAsync(token: string, pid: number): Promise<void> {
  let lstart: Awaited<ReturnType<typeof getProcessStartTime>>;
  try {
    lstart = await getProcessStartTime(pid);
  } catch {
    // Probe failed; the reaper falls back to bare liveness for this entry.
    return;
  }
  if (lstart === null) return;
  // The registry may have changed while the probe was in flight.
  const entry = byToken.get(token);
  if (!entry || entry.pid !== pid) return; // entry was removed or replaced; skip
  // Never clobber a value that is already there (pre-filled by a
  // re-register, or written by an earlier probe).
  if (entry.startTime !== undefined) return;
  entry.startTime = lstart;
}
/**
 * Remove the entry registered under `token`.
 *
 * The sessionId mapping is removed only when it still points at this
 * token. `onDeregister` fires after removal; anything it throws is
 * swallowed.
 *
 * @returns `true` when an entry was removed, `false` when the token was unknown.
 */
export function deregisterByToken(token: string): boolean {
  const removed = byToken.get(token);
  if (removed === undefined) {
    return false;
  }
  byToken.delete(token);
  const { sessionId } = removed;
  const ownsSessionId = bySessionId.get(sessionId) === token;
  if (ownsSessionId) {
    bySessionId.delete(sessionId);
  }
  try {
    hooks.onDeregister?.(removed);
  } catch {
    /* hook errors must never break the registry */
  }
  return true;
}
/**
 * Look up the session registered under `token`.
 *
 * An entry older than the TTL is deregistered on the spot and reported
 * as missing.
 *
 * @returns The live entry, or `null` when the token is unknown or expired.
 */
export function resolveToken(token: string): SessionInfo | null {
  const found = byToken.get(token);
  if (!found) return null;
  const ageMs = Date.now() - found.registeredAt;
  const expired = ageMs > TTL_MS;
  if (!expired) return found;
  deregisterByToken(token);
  return null;
}
/**
 * Return all registered entries as a new array.
 *
 * The array is a copy, but the entries are the registry's own objects.
 * No TTL check is applied here.
 */
export function listSessions(): SessionInfo[] {
  return Array.from(byToken.values());
}
/**
 * One reaper sweep: drop entries that are past the TTL, whose process
 * is gone, or whose pid now belongs to a different process.
 *
 * A failure of the batched ps call is swallowed — entries then keep
 * their liveness-only verdict for this sweep.
 */
async function reapDead(): Promise<void> {
  // Snapshot first: phase 2 awaits ps, and the registry may be mutated
  // by other callers while we wait.
  const snapshot = [...byToken.entries()];
  const now = Date.now();

  // Phase 1 — TTL + bare liveness. Synchronous.
  const dead: Array<[string, SessionInfo]> = [];
  const survivors: Array<[string, SessionInfo]> = [];
  for (const pair of snapshot) {
    const info = pair[1];
    const expired = now - info.registeredAt > TTL_MS;
    if (expired || !isPidAlive(info.pid)) dead.push(pair);
    else survivors.push(pair);
  }

  // Phase 2 — PID-reuse guard for survivors that have a captured
  // start-time. One batched ps call regardless of session count.
  // Survivors without a start-time keep their phase-1 verdict (their
  // start-time probe may still be in flight from a recent register).
  const guarded = survivors.filter(([, info]) => info.startTime !== undefined);
  if (guarded.length > 0) {
    try {
      const live = await getProcessStartTimes(guarded.map(([, info]) => info.pid));
      for (const pair of guarded) {
        const info = pair[1];
        const lstart = live.get(info.pid);
        // ps may transiently miss a pid that was alive when isPidAlive
        // ran — treat absence as "racing", let the next sweep decide.
        if (lstart === undefined) continue;
        if (lstart !== info.startTime) dead.push(pair);
      }
    } catch {
      // ps failure is non-fatal: survivors keep their phase-1 verdict.
      // Logging is the daemon's responsibility — the registry
      // deliberately stays log-free.
    }
  }

  // Every verdict above was reached on the snapshot. Only drop an entry
  // if it is still the very object we judged: a token that was removed
  // or re-registered while ps was running must not be dropped on a
  // stale verdict.
  for (const [token, info] of dead) {
    if (byToken.get(token) === info) deregisterByToken(token);
  }
}
/**
 * Test helper: perform exactly one reaper sweep and resolve once it
 * has finished, independent of the interval timer.
 */
export async function _runReaperOnce(): Promise<void> {
  return reapDead();
}
/**
 * Test helper: empty both lookup maps and unbind both hooks.
 *
 * No `onDeregister` hook fires for the cleared entries, and the reaper
 * timer is left untouched.
 */
export function _resetRegistry(): void {
  for (const index of [byToken, bySessionId]) {
    index.clear();
  }
  hooks.onRegister = undefined;
  hooks.onDeregister = undefined;
}