feat(broker): m1 — two-phase claim/deliver + client_ack + role-tagged presence

Three correctness fixes on top of the m1 schema migration:

1) Fix the drainForMember claim-then-push race
   ----------------------------------------------------------------
   Previously the claim CTE set delivered_at = NOW() *before* the WS
   send. If readyState !== OPEN at push time, the row was marked
   delivered and the message dropped silently — at-most-once with no
   retry hook.

   The new flow:
     - claim sets (claimed_at, claim_id, claim_expires_at = NOW()+30s)
     - delivered_at stays NULL until the recipient acks
     - re-eligibility predicate now also accepts rows whose lease
       expired, so dropped pushes redeliver (at-least-once)

   Adds two helpers:
     - markDelivered() — scoped to (mesh_id, recipient pubkey) so a
       peer can only ack its own messages
     - sweepExpiredClaims() — clears expired (claimed_at, claim_id,
       claim_expires_at) every 15s, wired into startSweepers

2) Accept `client_ack` from recipients
   ----------------------------------------------------------------
   New WS message type handled in the dispatcher right after `send`.
   The ack may identify the message by clientMessageId or
   brokerMessageId; either is sufficient. Until the daemon (apps/cli,
   separate worktree) starts emitting acks, leases will simply expire
   and re-deliver — which is the desired retry behaviour.

3) Tag presence rows with `role`
   ----------------------------------------------------------------
   handleHello (member-keyed, used by the long-lived daemon WS) →
     role: 'control-plane'
   handleSessionHello (per-Claude-Code session WS) →
     role: 'session'

   listPeersInMesh exposes the new field, and the peers_list response
   surfaces it. The WSPeersListMessage type adds an optional `role` plus
   the long-undocumented `memberPubkey`. The CLI-side filter swap from
   peerType to role lands in a follow-up worktree, which is why the CLI
   is untouched here, per the M1 spec.

Typechecks clean (apps/broker tsc --noEmit, packages/db tsc --noEmit).
The test suite needs a real DB, so it wasn't run in this worktree. The
existing dup-delivery and broker tests call drainForMember positionally,
and the new claimerPresenceId argument is optional, so they should
continue to pass.
This commit is contained in:
Alejandro Gutiérrez
2026-05-04 18:10:25 +01:00
parent 5a8db796a0
commit b57e47ed65
3 changed files with 210 additions and 1 deletions

View File

@@ -49,6 +49,7 @@ import {
listFiles,
listPeersInMesh,
listState,
markDelivered,
listTasks,
queueMessage,
recallMemory,
@@ -546,6 +547,7 @@ async function maybePushQueuedMessages(
conn.sessionPubkey ?? undefined,
excludeSenderSessionPubkey,
conn.groups.map((g) => g.name),
presenceId,
);
log.info("maybePush", {
presence_id: presenceId,
@@ -1772,6 +1774,11 @@ async function handleHello(
pid: hello.pid,
cwd: hello.cwd,
groups: initialGroups,
// v2 agentic-comms (M1): the regular member-keyed `hello` path is
// used by long-lived control-plane connections (claudemesh daemon,
// dashboard, automation). Per-Claude-Code sessions go through
// `session_hello` and get role='session'.
role: "control-plane",
});
const effectiveDisplayName = hello.displayName || member.displayName;
connections.set(presenceId, {
@@ -1796,6 +1803,7 @@ async function handleHello(
pubkey: hello.pubkey,
groups: initialGroups,
restored: !!saved,
role: "control-plane",
});
log.info("ws hello", {
mesh_id: hello.meshId,
@@ -1993,6 +2001,9 @@ async function handleSessionHello(
pid: hello.pid,
cwd: hello.cwd,
groups: initialGroups,
// v2 agentic-comms (M1): per-Claude-Code session WS — these are the
// user-facing peers shown in `claudemesh peer list`.
role: "session",
});
const effectiveDisplayName = hello.displayName || member.displayName;
connections.set(presenceId, {
@@ -2018,6 +2029,7 @@ async function handleSessionHello(
session_pubkey: hello.sessionPubkey,
groups: initialGroups,
via: "session_hello",
role: "session",
});
log.info("ws session_hello", {
mesh_id: hello.meshId,
@@ -2567,6 +2579,39 @@ function handleConnection(ws: WebSocket): void {
case "send":
await handleSend(conn, msg);
break;
case "client_ack": {
// v2 agentic-comms (M1): close out a previously pushed message.
// Lookup is scoped to (mesh_id, recipient pubkey) so a peer can
// only ack messages addressed to itself.
//
// Narrow the generic inbound message to the client_ack variant so the
// two optional id fields can be read.
const ack = msg as Extract<WSClientMessage, { type: "client_ack" }>;
if (!ack.clientMessageId && !ack.brokerMessageId) {
// Nothing to do; don't error — the daemon may speculatively
// ack and we'd rather be lenient than break a CLI release.
break;
}
try {
// The recipient identity is taken from this connection's own state
// (conn.*), never from the ack payload; only the message ids come
// from the client. Absent ids / session pubkey are passed as null.
const n = await markDelivered({
meshId: conn.meshId,
recipientMemberId: conn.memberId,
recipientMemberPubkey: conn.memberPubkey,
recipientSessionPubkey: conn.sessionPubkey ?? null,
clientMessageId: ack.clientMessageId ?? null,
brokerMessageId: ack.brokerMessageId ?? null,
});
// `marked` is whatever markDelivered returned — presumably the number
// of rows updated; confirm against markDelivered's definition.
log.debug("ws client_ack", {
presence_id: presenceId,
client_message_id: ack.clientMessageId,
broker_message_id: ack.brokerMessageId,
marked: n,
});
} catch (e) {
// Best-effort: a failed ack is logged at warn level and not rethrown,
// so it does not propagate out of this case.
// NOTE(review): an un-acked message is presumably re-delivered once
// its claim lease expires — confirm against drainForMember.
log.warn("ws client_ack failed", {
presence_id: presenceId,
error: e instanceof Error ? e.message : String(e),
});
}
break;
}
case "set_status":
await writeStatus(presenceId, msg.status, "manual", new Date());
log.info("ws set_status", {
@@ -2604,6 +2649,10 @@ function handleConnection(ws: WebSocket): void {
sessionId: p.sessionId,
connectedAt: p.connectedAt.toISOString(),
cwd: pc?.cwd ?? p.cwd,
// v2 agentic-comms (M1): typed connection role. CLI uses
// this to hide control-plane daemons from user-facing
// peer lists (filter swap from peerType happens CLI-side).
role: p.role,
...(pc?.hostname ? { hostname: pc.hostname } : {}),
...(pc?.peerType ? { peerType: pc.peerType } : {}),
...(pc?.channel ? { channel: pc.channel } : {}),