feat(cli,broker): 1.34.14 + 1.34.15 — env-var fallback, peer list scope, kick refuses control-plane

Three follow-ups from the 1.34.x multi-session correctness train,
all backwards-compatible.

1.34.14 — stale CLAUDEMESH_CONFIG_DIR falls back. The launch flow
exposes CLAUDEMESH_CONFIG_DIR=<tmpdir> to its spawned claude; if a
later claudemesh invocation inherited that env (Bash tool inside
Claude Code, tmux update-environment, exported var), the inherited
path pointed at a tmpdir that no longer existed and readConfig()
silently returned empty. paths.ts now memoizes resolution: env unset
→ default; env points at a real dir → trust it; env set but dir gone
→ TTY-only stderr warning with shell-specific unset hint, fall back
to ~/.claudemesh.

1.34.15 — peer list --mesh actually scopes. peers.ts and launch.ts
were calling tryListPeersViaDaemon() with no argument; the daemon's
?mesh= filter (server-side, since 1.26.0) was already correct, the
CLI just wasn't passing the slug. Forwarding fixed in both sites;
send.ts cross-mesh hex-prefix resolution intentionally untouched.

1.34.15 — kick refuses no-op kicks on control-plane. Pre-1.34.15
kicking a daemon's member-WS just closed the socket and triggered
auto-reconnect — a no-op with a misleading "session ended" message.
Broker now skips peers where peerRole === "control-plane" and
surfaces them in a new additive ack field skipped_control_plane;
the CLI reads it and prints a clearer hint pointing at ban / daemon
down. Soft disconnect verb keeps old behavior. PeerConn gains a
peerRole slot populated at both connections.set sites.

Tests: 4 new for paths-stale-env, 5 for kick-control-plane-skip.
CLI 87/87 green; broker 55/55 unit green (integration tests fail on
this machine because of a pre-existing infra failure).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-05-04 21:59:06 +01:00
parent 6780899185
commit a2a53ff355
9 changed files with 376 additions and 10 deletions

View File

@@ -156,6 +156,11 @@ interface PeerConn {
bio?: string;
capabilities?: string[];
};
/** v2 agentic-comms presence taxonomy. Mirrors the value passed to
* `recordPresence`. Used by the kick handler to refuse no-op kicks
* on long-lived control-plane connections (daemon, dashboard) that
* would just auto-reconnect. */
peerRole: "control-plane" | "session" | "service";
}
const connections = new Map<string, PeerConn>();
@@ -1797,6 +1802,7 @@ async function handleHello(
groups: initialGroups,
visible: saved?.visible ?? true,
profile: saved?.profile ?? {},
peerRole: "control-plane",
});
incMeshCount(hello.meshId);
void audit(hello.meshId, "peer_joined", member.id, effectiveDisplayName, {
@@ -2022,6 +2028,7 @@ async function handleSessionHello(
groups: initialGroups,
visible: true,
profile: {},
peerRole: "session",
});
incMeshCount(hello.meshId);
void audit(hello.meshId, "peer_joined", member.id, effectiveDisplayName, {
@@ -4645,11 +4652,30 @@ function handleConnection(ws: WebSocket): void {
}
const affected: string[] = [];
// 1.34.15 (gap #3a): kick was a no-op against long-lived
// control-plane connections (daemon, dashboard) — closing
// their WS just triggered the auto-reconnect loop, the
// kicker's CLI rendered "Their Claude Code session ended"
// (which was misleading), and the user-visible state was
// unchanged seconds later. We now refuse to close control-
// plane WSes and surface the skipped peers in a new
// additive ack field. Pre-1.34.15 CLI clients only read
// `kicked`/`affected`, so this stays back-compat.
//
// This applies to `kick` only: the soft `disconnect` verb still
// closes control-plane WSes intentionally — that's what users want
// when they nudge a peer to re-authenticate.
const skippedControlPlane: string[] = [];
const skipControlPlane = isKick;
const now = Date.now();
if (km.all) {
for (const [pid, peer] of connections) {
if (peer.meshId !== conn.meshId || pid === presenceId) continue;
if (skipControlPlane && peer.peerRole === "control-plane") {
skippedControlPlane.push(peer.displayName || pid);
continue;
}
try { peer.ws.close(closeCode, closeReason); } catch {}
connections.delete(pid);
void disconnectPresence(pid);
@@ -4661,6 +4687,10 @@ function handleConnection(ws: WebSocket): void {
if (peer.meshId !== conn.meshId || pid === presenceId) continue;
const [pres] = await db.select({ lastPingAt: presence.lastPingAt }).from(presence).where(eq(presence.id, pid)).limit(1);
if (pres && pres.lastPingAt && pres.lastPingAt.getTime() < cutoff) {
if (skipControlPlane && peer.peerRole === "control-plane") {
skippedControlPlane.push(peer.displayName || pid);
continue;
}
try { peer.ws.close(closeCode, `${closeReason}_stale`); } catch {}
connections.delete(pid);
void disconnectPresence(pid);
@@ -4671,6 +4701,10 @@ function handleConnection(ws: WebSocket): void {
for (const [pid, peer] of connections) {
if (peer.meshId !== conn.meshId) continue;
if (peer.displayName === km.target || peer.memberPubkey === km.target || peer.memberPubkey.startsWith(km.target)) {
if (skipControlPlane && peer.peerRole === "control-plane") {
skippedControlPlane.push(peer.displayName || pid);
continue;
}
try { peer.ws.close(closeCode, closeReason); } catch {}
connections.delete(pid);
void disconnectPresence(pid);
@@ -4679,8 +4713,20 @@ function handleConnection(ws: WebSocket): void {
}
}
conn.ws.send(JSON.stringify({ type: ackType, kicked: affected, affected, _reqId: km._reqId }));
log.info(`ws ${closeReason}`, { presence_id: presenceId, count: affected.length, target: km.target ?? km.stale ?? "all" });
conn.ws.send(JSON.stringify({
type: ackType,
kicked: affected,
affected,
// Additive — older CLI clients ignore this field.
...(skippedControlPlane.length > 0 ? { skipped_control_plane: skippedControlPlane } : {}),
_reqId: km._reqId,
}));
log.info(`ws ${closeReason}`, {
presence_id: presenceId,
count: affected.length,
target: km.target ?? km.stale ?? "all",
skipped_control_plane: skippedControlPlane.length,
});
break;
}

View File

@@ -0,0 +1,47 @@
/**
* Kick control-plane skip: 1.34.15 (gap #3a) refuses to close
* long-lived control-plane connections (claudemesh daemon, dashboard)
* via `kick`, because they auto-reconnect within seconds and the verb
* was effectively a no-op. The soft `disconnect` verb keeps the old
* behavior so users can still nudge a control-plane peer to
* re-authenticate.
*
* Pure-logic test — mirrors the branch inside the kick case of the
* broker's handleConnection without spinning up a broker. Same pattern
* as grants-enforcement.test.ts.
*/
import { describe, expect, test } from "vitest";
type PeerRole = "control-plane" | "session" | "service";

/**
 * Test-local copy of the skip predicate used by the kick handler.
 *
 * @param args.verb - The verb being applied: `"kick"` or the soft
 *   `"disconnect"`.
 * @param args.peerRole - Presence role of the peer being targeted.
 * @returns `true` only when the verb is `"kick"` and the target is a
 *   `"control-plane"` peer; `false` for every other combination.
 */
function shouldSkipKick(args: {
  verb: "kick" | "disconnect";
  peerRole: PeerRole;
}): boolean {
  const { verb, peerRole } = args;

  // Only `kick` ever skips a peer; `disconnect` always goes through.
  if (verb !== "kick") {
    return false;
  }

  return peerRole === "control-plane";
}
describe("kick control-plane skip (gap #3a)", () => {
  // Table of verb/role pairings and whether the predicate should report
  // the peer as skipped. One vitest case is registered per row, in the
  // order listed, with the row's title as the test name.
  const scenarios: ReadonlyArray<{
    title: string;
    verb: "kick" | "disconnect";
    peerRole: PeerRole;
    skipped: boolean;
  }> = [
    {
      title: "kick on control-plane → skipped (would auto-reconnect)",
      verb: "kick",
      peerRole: "control-plane",
      skipped: true,
    },
    {
      title: "kick on session → not skipped (closes user session)",
      verb: "kick",
      peerRole: "session",
      skipped: false,
    },
    {
      title: "kick on service → not skipped",
      verb: "kick",
      peerRole: "service",
      skipped: false,
    },
    {
      title: "disconnect on control-plane → not skipped (intentional nudge)",
      verb: "disconnect",
      peerRole: "control-plane",
      skipped: false,
    },
    {
      title: "disconnect on session → not skipped",
      verb: "disconnect",
      peerRole: "session",
      skipped: false,
    },
  ];

  for (const { title, verb, peerRole, skipped } of scenarios) {
    test(title, () => {
      expect(shouldSkipKick({ verb, peerRole })).toBe(skipped);
    });
  }
});