refactor(cli): m1 lifecycle + role-aware peer list
Foundational cleanups before agentic-comms architecture work (.artifacts/specs/2026-05-04-agentic-comms-architecture-v2.md). All behavior-preserving. 1. Extract `connectWsWithBackoff` into apps/cli/src/daemon/ws-lifecycle.ts. Both DaemonBrokerClient and SessionBrokerClient now share one lifecycle implementation (connect, hello-handshake, ack-timeout, close + backoff reconnect). Each client provides its own buildHello / isHelloAck / onMessage hooks and keeps its own RPC bookkeeping (pendingAcks, peerListResolvers, onPush). Composition over inheritance per Codex's review; no protocol shape changes. 2. Drop daemon-WS ephemeral session pubkey. DaemonBrokerClient no longer mints + sends a per-reconnect ephemeral keypair in its hello. Session-targeted DMs land on SessionBrokerClient since 1.32.1, not the member-keyed daemon-WS, so the field was vestigial. Send-encrypt path now signs DMs with the stable mesh member secret. handleBrokerPush invocations from daemon-WS only pass the member secret — session decryption is the session-WS's job. 3. Role-aware peer list. `peer list` now hides peers whose broker-emitted `role` is `'control-plane'`. `--all` opts back in. JSON output emits `role` at top level. Older brokers that don't emit role yet default to 'session', so legacy peer rows stay visible without the broker-side change shipped first. Replaces the prior `peerType === 'claudemesh-daemon'` channel-name hack. Typecheck + tests + build all green.
This commit is contained in:
234
apps/cli/src/daemon/ws-lifecycle.ts
Normal file
234
apps/cli/src/daemon/ws-lifecycle.ts
Normal file
@@ -0,0 +1,234 @@
|
||||
/**
|
||||
* Shared WS lifecycle helper for the daemon's two broker clients.
|
||||
*
|
||||
* Both `DaemonBrokerClient` (member-keyed, one per joined mesh) and
|
||||
* `SessionBrokerClient` (session-keyed, one per launched session) used
|
||||
* to inline the same connect/hello/ack-timeout/close-reconnect logic.
|
||||
* They drifted apart subtly — different ack-timeout names, different
|
||||
* reconnect log messages, slightly different status flips — and that's
|
||||
* how 1.32.x bugs shipped (push handler attached to the wrong client,
|
||||
* etc).
|
||||
*
|
||||
* This helper owns ONLY the lifecycle:
|
||||
* - new WebSocket(url), wire up open/message/close/error
|
||||
* - on open → call buildHello() and send the result
|
||||
* - start an ack-timeout timer; if it fires before the hello ack
|
||||
* arrives, close the socket and reject the connect promise
|
||||
* - on message, gate on isHelloAck(); when true, flip status to
|
||||
* "open", clear the ack timer, resolve. All other messages are
|
||||
* forwarded to onMessage()
|
||||
* - on close, schedule a backoff reconnect (unless explicitly closed)
|
||||
*
|
||||
* Each client keeps its own concerns: DaemonBrokerClient still owns
|
||||
* pendingAcks / peerListResolvers / etc; SessionBrokerClient still owns
|
||||
* its onPush callback. The helper just hands them an open WS and a
|
||||
* stable status field, and reconnects under their feet on disconnect.
|
||||
*
|
||||
* Composition over inheritance — callers receive a `WsLifecycle` handle
|
||||
* with `send` / `close` / `status`, NOT a subclass.
|
||||
*/
|
||||
|
||||
import WebSocket from "ws";
|
||||
|
||||
export type WsStatus = "connecting" | "open" | "closed" | "reconnecting";
|
||||
|
||||
export type WsLogLevel = "info" | "warn" | "error";
|
||||
export type WsLog = (level: WsLogLevel, msg: string, meta?: Record<string, unknown>) => void;
|
||||
|
||||
export interface WsLifecycleOptions {
|
||||
/** Broker URL (e.g. wss://ic.claudemesh.com/ws). */
|
||||
url: string;
|
||||
/**
|
||||
* Build the hello frame to send right after the WS opens. Async because
|
||||
* signing the hello may need libsodium initialization. Whatever this
|
||||
* returns is JSON.stringified and sent verbatim — the helper does NOT
|
||||
* inspect or modify it.
|
||||
*/
|
||||
buildHello: () => Promise<unknown>;
|
||||
/**
|
||||
* Returns true iff `msg` is the hello ack the helper should treat as
|
||||
* "broker accepted us; flip status to open". Both daemon-WS and
|
||||
* session-WS use `{ type: "hello_ack" }` today, but keeping this a
|
||||
* predicate lets either client narrow further (e.g. on a `code` field)
|
||||
* without leaking client-specific shape into the helper.
|
||||
*/
|
||||
isHelloAck: (msg: Record<string, unknown>) => boolean;
|
||||
/**
|
||||
* Called for every parsed message that is NOT the hello ack. The
|
||||
* helper does NOT decide which messages are pushes vs RPCs vs errors;
|
||||
* that's the caller's concern.
|
||||
*/
|
||||
onMessage: (msg: Record<string, unknown>) => void;
|
||||
onStatusChange?: (s: WsStatus) => void;
|
||||
/**
|
||||
* How long to wait for the broker's hello ack before giving up and
|
||||
* forcing a close. Defaults 5s — same as both pre-refactor clients.
|
||||
*/
|
||||
helloAckTimeoutMs?: number;
|
||||
/**
|
||||
* Reconnect backoff schedule. Defaults [1s, 2s, 4s, 8s, 16s, 30s] —
|
||||
* matches both pre-refactor clients exactly.
|
||||
*/
|
||||
backoffCapsMs?: readonly number[];
|
||||
log?: WsLog;
|
||||
/**
|
||||
* Hook for the close path BEFORE the helper schedules a reconnect.
|
||||
* Used by DaemonBrokerClient to fail its in-flight pendingAcks map
|
||||
* with a "broker_disconnected_<code>" reason. The helper passes the
|
||||
* raw close code so the caller can shape its rejection text.
|
||||
*
|
||||
* Returns nothing — close handling continues regardless.
|
||||
*/
|
||||
onBeforeReconnect?: (code: number, reason: string) => void;
|
||||
}
|
||||
|
||||
export interface WsLifecycle {
|
||||
/** Current connection status. Updated synchronously before onStatusChange fires. */
|
||||
readonly status: WsStatus;
|
||||
/** Underlying socket. Exposed for callers that need OPEN-state checks
|
||||
* before sending (mirrors the pre-refactor `this.ws.readyState` checks). */
|
||||
readonly ws: WebSocket | null;
|
||||
/** Send a JSON payload over the open WS. Throws if not open — callers
|
||||
* that need queue-while-disconnected semantics should layer that
|
||||
* themselves (DaemonBrokerClient does, via its `opens` deferred-fn array). */
|
||||
send(payload: unknown): void;
|
||||
/** Close the WS and stop reconnecting. Idempotent. */
|
||||
close(): Promise<void>;
|
||||
}
|
||||
|
||||
const DEFAULT_HELLO_ACK_TIMEOUT_MS = 5_000;
|
||||
const DEFAULT_BACKOFF_CAPS_MS: readonly number[] = [1_000, 2_000, 4_000, 8_000, 16_000, 30_000];
|
||||
|
||||
const defaultLog: WsLog = (level, msg, meta) => {
|
||||
const line = JSON.stringify({ level, msg, ...meta, ts: new Date().toISOString() });
|
||||
if (level === "info") process.stdout.write(line + "\n");
|
||||
else process.stderr.write(line + "\n");
|
||||
};
|
||||
|
||||
/**
|
||||
* Connect a WebSocket with hello-handshake, ack-timeout, and reconnect
|
||||
* with exponential backoff. Resolves once the broker accepts the hello;
|
||||
* rejects if the first connect closes before the ack lands.
|
||||
*
|
||||
* Subsequent automatic reconnects are silent — they fire on the close
|
||||
* handler's backoff timer and surface only via onStatusChange (and any
|
||||
* caller-installed log).
|
||||
*/
|
||||
export function connectWsWithBackoff(opts: WsLifecycleOptions): Promise<WsLifecycle> {
|
||||
const helloAckTimeoutMs = opts.helloAckTimeoutMs ?? DEFAULT_HELLO_ACK_TIMEOUT_MS;
|
||||
const backoffCapsMs = opts.backoffCapsMs ?? DEFAULT_BACKOFF_CAPS_MS;
|
||||
const log: WsLog = opts.log ?? defaultLog;
|
||||
|
||||
let ws: WebSocket | null = null;
|
||||
let status: WsStatus = "closed";
|
||||
let closed = false;
|
||||
let reconnectAttempt = 0;
|
||||
let reconnectTimer: NodeJS.Timeout | null = null;
|
||||
let helloTimer: NodeJS.Timeout | null = null;
|
||||
|
||||
const setStatus = (s: WsStatus) => {
|
||||
if (status === s) return;
|
||||
status = s;
|
||||
opts.onStatusChange?.(s);
|
||||
};
|
||||
|
||||
/**
|
||||
* Open one WS attempt. Returns a promise that resolves on hello ack
|
||||
* or rejects if the socket closes before we get one. Used by both the
|
||||
* initial connect and the close-handler backoff timer (which awaits
|
||||
* but ignores the rejection — by then the close handler has already
|
||||
* scheduled its own reconnect).
|
||||
*/
|
||||
const openOnce = (): Promise<void> => {
|
||||
if (closed) return Promise.reject(new Error("client_closed"));
|
||||
setStatus("connecting");
|
||||
|
||||
const sock = new WebSocket(opts.url);
|
||||
ws = sock;
|
||||
|
||||
return new Promise<void>((resolve, reject) => {
|
||||
sock.on("open", () => {
|
||||
// Build and send the hello inside a microtask so any sync
|
||||
// throws from buildHello() reject this connect attempt cleanly.
|
||||
(async () => {
|
||||
try {
|
||||
const hello = await opts.buildHello();
|
||||
sock.send(JSON.stringify(hello));
|
||||
helloTimer = setTimeout(() => {
|
||||
log("warn", "hello_ack_timeout", { url: opts.url });
|
||||
try { sock.close(); } catch { /* ignore */ }
|
||||
reject(new Error("hello_ack_timeout"));
|
||||
}, helloAckTimeoutMs);
|
||||
} catch (e) {
|
||||
reject(e instanceof Error ? e : new Error(String(e)));
|
||||
}
|
||||
})();
|
||||
});
|
||||
|
||||
sock.on("message", (raw) => {
|
||||
let msg: Record<string, unknown>;
|
||||
try { msg = JSON.parse(raw.toString()) as Record<string, unknown>; }
|
||||
catch { return; }
|
||||
|
||||
if (opts.isHelloAck(msg)) {
|
||||
if (helloTimer) { clearTimeout(helloTimer); helloTimer = null; }
|
||||
setStatus("open");
|
||||
reconnectAttempt = 0;
|
||||
resolve();
|
||||
// Don't forward hello_ack to onMessage — both pre-refactor
|
||||
// clients consumed it inline and never delegated.
|
||||
return;
|
||||
}
|
||||
|
||||
opts.onMessage(msg);
|
||||
});
|
||||
|
||||
sock.on("close", (code, reason) => {
|
||||
if (helloTimer) { clearTimeout(helloTimer); helloTimer = null; }
|
||||
const reasonStr = reason.toString("utf8");
|
||||
opts.onBeforeReconnect?.(code, reasonStr);
|
||||
|
||||
if (closed) {
|
||||
setStatus("closed");
|
||||
return;
|
||||
}
|
||||
setStatus("reconnecting");
|
||||
const wait = backoffCapsMs[Math.min(reconnectAttempt, backoffCapsMs.length - 1)] ?? 30_000;
|
||||
reconnectAttempt++;
|
||||
log("info", "ws_reconnect_scheduled", { url: opts.url, wait_ms: wait, code, reason: reasonStr });
|
||||
reconnectTimer = setTimeout(
|
||||
() => openOnce().catch((err) => log("warn", "ws_reconnect_failed", { url: opts.url, err: String(err) })),
|
||||
wait,
|
||||
);
|
||||
// First attempt failure (still in connecting) also rejects the
|
||||
// initial connect promise so callers can surface it.
|
||||
if (status === "connecting" || status === "reconnecting") {
|
||||
reject(new Error(`closed_before_hello_${code}`));
|
||||
}
|
||||
});
|
||||
|
||||
sock.on("error", (err) => log("warn", "ws_error", { url: opts.url, err: err.message }));
|
||||
});
|
||||
};
|
||||
|
||||
return openOnce().then(() => {
|
||||
const handle: WsLifecycle = {
|
||||
get status() { return status; },
|
||||
get ws() { return ws; },
|
||||
send(payload: unknown) {
|
||||
if (!ws || ws.readyState !== ws.OPEN) {
|
||||
throw new Error("ws_not_open");
|
||||
}
|
||||
ws.send(JSON.stringify(payload));
|
||||
},
|
||||
async close() {
|
||||
closed = true;
|
||||
if (reconnectTimer) { clearTimeout(reconnectTimer); reconnectTimer = null; }
|
||||
if (helloTimer) { clearTimeout(helloTimer); helloTimer = null; }
|
||||
try { ws?.close(); } catch { /* ignore */ }
|
||||
setStatus("closed");
|
||||
},
|
||||
};
|
||||
return handle;
|
||||
});
|
||||
}
|
||||
Reference in New Issue
Block a user