feat(broker): m1 — two-phase claim/deliver + client_ack + role-tagged presence
Three correctness fixes on top of the m1 schema migration:
1) Fix the drainForMember claim-then-push race
----------------------------------------------------------------
Previously the claim CTE set delivered_at = NOW() *before* the WS
send. If readyState !== OPEN at push time, the row was marked
delivered and the message dropped silently — at-most-once with no
retry hook.
The new flow:
- claim sets (claimed_at, claim_id, claim_expires_at = NOW()+30s)
- delivered_at stays NULL until the recipient acks
- re-eligibility predicate now also accepts rows whose lease has
expired, so dropped pushes are redelivered (at-least-once)
Adds two helpers:
- markDelivered() — scoped to (mesh_id, recipient pubkey) so a
peer can only ack its own messages
- sweepExpiredClaims() — clears expired (claimed_at, claim_id,
claim_expires_at) every 15s, wired into startSweepers
2) Accept `client_ack` from recipients
----------------------------------------------------------------
New WS message type handled in the dispatcher right after `send`.
The lookup is by clientMessageId or brokerMessageId; either one is
sufficient. Until the daemon (apps/cli, separate worktree) starts emitting
acks, leases will simply expire and the messages will be re-delivered —
which is the desired retry behaviour.
3) Tag presence rows with `role`
----------------------------------------------------------------
handleHello (member-keyed, used by the long-lived daemon WS) →
role: 'control-plane'
handleSessionHello (per-Claude-Code session WS) →
role: 'session'
listPeersInMesh exposes the new field; the peers_list response
surfaces it. WSPeersListMessage type adds an optional `role` plus the
long-undocumented `memberPubkey`. CLI-side filter swap from peerType
to role lands in a follow-up worktree — that's why the CLI is
untouched here per the M1 spec.
Typechecks clean (apps/broker tsc --noEmit, packages/db tsc --noEmit).
The test suite needs a real DB, so it wasn't run in this worktree; existing
dup-delivery and broker tests use drainForMember positionally and the
new claimerPresenceId arg is optional, so they should continue to pass.
This commit is contained in:
@@ -49,6 +49,7 @@ import {
|
||||
listFiles,
|
||||
listPeersInMesh,
|
||||
listState,
|
||||
markDelivered,
|
||||
listTasks,
|
||||
queueMessage,
|
||||
recallMemory,
|
||||
@@ -546,6 +547,7 @@ async function maybePushQueuedMessages(
|
||||
conn.sessionPubkey ?? undefined,
|
||||
excludeSenderSessionPubkey,
|
||||
conn.groups.map((g) => g.name),
|
||||
presenceId,
|
||||
);
|
||||
log.info("maybePush", {
|
||||
presence_id: presenceId,
|
||||
@@ -1772,6 +1774,11 @@ async function handleHello(
|
||||
pid: hello.pid,
|
||||
cwd: hello.cwd,
|
||||
groups: initialGroups,
|
||||
// v2 agentic-comms (M1): the regular member-keyed `hello` path is
|
||||
// used by long-lived control-plane connections (claudemesh daemon,
|
||||
// dashboard, automation). Per-Claude-Code sessions go through
|
||||
// `session_hello` and get role='session'.
|
||||
role: "control-plane",
|
||||
});
|
||||
const effectiveDisplayName = hello.displayName || member.displayName;
|
||||
connections.set(presenceId, {
|
||||
@@ -1796,6 +1803,7 @@ async function handleHello(
|
||||
pubkey: hello.pubkey,
|
||||
groups: initialGroups,
|
||||
restored: !!saved,
|
||||
role: "control-plane",
|
||||
});
|
||||
log.info("ws hello", {
|
||||
mesh_id: hello.meshId,
|
||||
@@ -1993,6 +2001,9 @@ async function handleSessionHello(
|
||||
pid: hello.pid,
|
||||
cwd: hello.cwd,
|
||||
groups: initialGroups,
|
||||
// v2 agentic-comms (M1): per-Claude-Code session WS — these are the
|
||||
// user-facing peers shown in `claudemesh peer list`.
|
||||
role: "session",
|
||||
});
|
||||
const effectiveDisplayName = hello.displayName || member.displayName;
|
||||
connections.set(presenceId, {
|
||||
@@ -2018,6 +2029,7 @@ async function handleSessionHello(
|
||||
session_pubkey: hello.sessionPubkey,
|
||||
groups: initialGroups,
|
||||
via: "session_hello",
|
||||
role: "session",
|
||||
});
|
||||
log.info("ws session_hello", {
|
||||
mesh_id: hello.meshId,
|
||||
@@ -2567,6 +2579,39 @@ function handleConnection(ws: WebSocket): void {
|
||||
case "send":
|
||||
await handleSend(conn, msg);
|
||||
break;
|
||||
case "client_ack": {
|
||||
// v2 agentic-comms (M1): close out a previously pushed message.
|
||||
// Lookup is scoped to (mesh_id, recipient pubkey) so a peer can
|
||||
// only ack messages addressed to itself.
|
||||
const ack = msg as Extract<WSClientMessage, { type: "client_ack" }>;
|
||||
if (!ack.clientMessageId && !ack.brokerMessageId) {
|
||||
// Nothing to do; don't error — the daemon may speculatively
|
||||
// ack and we'd rather be lenient than break a CLI release.
|
||||
break;
|
||||
}
|
||||
try {
|
||||
const n = await markDelivered({
|
||||
meshId: conn.meshId,
|
||||
recipientMemberId: conn.memberId,
|
||||
recipientMemberPubkey: conn.memberPubkey,
|
||||
recipientSessionPubkey: conn.sessionPubkey ?? null,
|
||||
clientMessageId: ack.clientMessageId ?? null,
|
||||
brokerMessageId: ack.brokerMessageId ?? null,
|
||||
});
|
||||
log.debug("ws client_ack", {
|
||||
presence_id: presenceId,
|
||||
client_message_id: ack.clientMessageId,
|
||||
broker_message_id: ack.brokerMessageId,
|
||||
marked: n,
|
||||
});
|
||||
} catch (e) {
|
||||
log.warn("ws client_ack failed", {
|
||||
presence_id: presenceId,
|
||||
error: e instanceof Error ? e.message : String(e),
|
||||
});
|
||||
}
|
||||
break;
|
||||
}
|
||||
case "set_status":
|
||||
await writeStatus(presenceId, msg.status, "manual", new Date());
|
||||
log.info("ws set_status", {
|
||||
@@ -2604,6 +2649,10 @@ function handleConnection(ws: WebSocket): void {
|
||||
sessionId: p.sessionId,
|
||||
connectedAt: p.connectedAt.toISOString(),
|
||||
cwd: pc?.cwd ?? p.cwd,
|
||||
// v2 agentic-comms (M1): typed connection role. CLI uses
|
||||
// this to hide control-plane daemons from user-facing
|
||||
// peer lists (filter swap from peerType happens CLI-side).
|
||||
role: p.role,
|
||||
...(pc?.hostname ? { hostname: pc.hostname } : {}),
|
||||
...(pc?.peerType ? { peerType: pc.peerType } : {}),
|
||||
...(pc?.channel ? { channel: pc.channel } : {}),
|
||||
|
||||
Reference in New Issue
Block a user