Files
claudemesh/apps/cli/src/daemon/drain.ts
Alejandro Gutiérrez 6780899185
Some checks failed
CI / Lint (push) Has been cancelled
CI / Typecheck (push) Has been cancelled
CI / Broker tests (Postgres) (push) Has been cancelled
CI / Docker build (linux/amd64) (push) Has been cancelled
feat(cli): 1.34.7 → 1.34.13 — multi-session correctness train
Seven-ship sequence that took the daemon from "works for one session"
to "internally consistent for N sessions on one daemon." Architecture
invariant after 1.34.13: every shared store / channel scopes by
recipient (SSE demux at bind layer + token forwarding, inbox per-
recipient columns, outbox sender-session routing).

- 1.34.7  inbox flush + delete commands
- 1.34.8  seen_at column + TTL prune + first echo guard
- 1.34.9  broader echo guard + system-event polish + staleness warning
- 1.34.10 per-session SSE demux (SseFilterOptions) + universal daemon
          (--mesh / --name deprecated) + daemon_started version stamp
- 1.34.11 inbox per-recipient column (storage half of 1.34.10)
- 1.34.12 daemon up detaches by default (logs to ~/.claudemesh/daemon/
          daemon.log; service units explicitly pass --foreground)
- 1.34.13 MCP forwards session token on /v1/events — the actual fix
          that activates 1.34.10's demux. Without this header the
          daemon's session resolved null, filter was empty, every MCP
          received the unfiltered global stream.

Roadmap entry at docs/roadmap.md captures the timeline + the four
known gaps tracked for follow-ups (launch env-var leak, broker
listPeers mesh-filter, kick on control-plane no-op, session caps as
first-class concept).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-04 21:10:07 +01:00

266 lines
9.7 KiB
TypeScript

// Outbox drain worker. Walks `outbox.pending` rows, sends them to the
// broker via DaemonBrokerClient, and transitions row state per spec §4.6.1.
//
// Lifecycle per row:
//   pending → inflight → done             (broker accepted)
//                      → pending+backoff  (transient broker error)
//                      → dead             (permanent broker error or
//                                          attempt cap reached)
//
// Wakeable: insertPending in the IPC handler can call wake() to skip the
// idle interval. We use a simple promise-replacing pattern instead of a
// pollable signal.
import type { SqliteDb } from "./db/sqlite.js";
import type { DaemonBrokerClient } from "./broker.js";
import type { SessionBrokerClient } from "./session-broker.js";
import type { OutboxStatus } from "./db/outbox.js";
// Longest time the worker loop sleeps between drain passes when nobody
// calls wake().
const POLL_INTERVAL_MS = 500;
// Attempt count at which a transiently failing row is marked dead instead
// of being scheduled for another retry.
const MAX_ATTEMPTS_PER_ROW = 25;
// Retry delay is this base doubled once per attempt (exponent capped at 12
// in backoffPending) ...
const BACKOFF_BASE_MS = 500;
// ... and never longer than this cap. markInflight also adds this value to
// `now` when it stamps next_attempt_at on a row it claims.
const BACKOFF_CAP_MS = 30_000;
/**
 * One `outbox` row as returned by the SELECT in drainOnce; the field names
 * match the selected column names.
 */
interface PendingRow {
/** Row key; every follow-up UPDATE addresses the row with `WHERE id = ?`. */
id: string;
/** Forwarded unchanged to the broker as `client_message_id`. */
client_message_id: string;
/** Raw fingerprint bytes; hex-encoded and sent as `request_fingerprint_hex`. */
request_fingerprint: Uint8Array;
/** Raw payload bytes; base64-encoded and sent as the ciphertext only when
 * target_spec / nonce / ciphertext are not all present. */
payload: Uint8Array;
/** Attempt counter as read by the SELECT, i.e. before markInflight
 * increments it for the current pass. */
attempts: number;
/** Sprint 4 routing fields. NULL on legacy v0.9.0 rows → broadcast fallback. */
target_spec: string | null;
nonce: string | null;
ciphertext: string | null;
/** Only "now" and "low" are passed through; any other value is sent as "next". */
priority: string | null;
/** Key into DrainOptions.brokers. When NULL, the drain uses the sole broker
 * if the map has exactly one entry and otherwise marks the row dead. */
mesh: string | null;
/** 1.34.0: hex pubkey of the originating session — drain prefers
 * routing via that session's WS so broker fan-out attributes the
 * push to the session pubkey. NULL on cold-path / pre-1.34.0 rows. */
sender_session_pubkey: string | null;
}
/** Dependencies handed to startDrainWorker. */
export interface DrainOptions {
/** Database handle; the drain reads and updates the `outbox` table through
 * `prepare(...).all(...)` and `prepare(...).run(...)`. */
db: SqliteDb;
/** v1.26.0: per-mesh broker map. Drain dispatches each row to the
 * broker keyed by its `mesh` column. Single-mesh daemons pass a
 * Map of size 1; multi-mesh daemons pass one entry per joined mesh. */
brokers: Map<string, DaemonBrokerClient>;
/**
 * 1.34.0: lookup for the per-session WS keyed by hex session pubkey.
 * When an outbox row has `sender_session_pubkey` set and this lookup
 * returns an open client, the drain routes via the session-WS so the
 * broker fan-out attributes the push to the session pubkey instead
 * of the daemon's stable member pubkey.
 *
 * Returning `undefined` (or an unopened client) signals "no session
 * WS available" — the drain backs off and retries; it does NOT fall
 * back to the daemon-WS, because the row was encrypted with the
 * session secret and would fail to decrypt on the recipient side
 * if attribution silently changed mid-flight.
 *
 * Omitting the lookup entirely has the same effect for session-attributed
 * rows: they are backed off, never sent over the daemon-WS.
 */
getSessionBrokerByPubkey?: (sessionPubkey: string) => SessionBrokerClient | undefined;
/** Optional structured logger. When omitted, startDrainWorker uses
 * defaultLog, which writes one JSON line per event to stdout ("info")
 * or stderr ("warn" / "error"). */
log?: (level: "info" | "warn" | "error", msg: string, meta?: Record<string, unknown>) => void;
}
/** Handle returned by startDrainWorker for controlling the background loop. */
export interface DrainHandle {
/** Cut the current idle sleep short so the next drain pass starts right away. */
wake(): void;
/** Ask the loop to stop; no new drain pass begins once this has been called. */
close(): Promise<void>;
}
/**
 * Start the background outbox drain loop.
 *
 * The loop alternates between one `drainOnce` pass and an idle sleep of at
 * most POLL_INTERVAL_MS. A failure thrown by a pass is logged as
 * `drain_tick_failed` and the loop carries on.
 *
 * @param opts Database, broker map, optional session-broker lookup and
 *             optional logger (defaults to `defaultLog`).
 * @returns A handle whose `wake()` ends the current idle sleep early and whose
 *          `close()` stops the loop. `close()` resolves only after the loop
 *          has exited, i.e. after any pass that was already running has
 *          finished, so the caller can safely release the database afterwards.
 */
export function startDrainWorker(opts: DrainOptions): DrainHandle {
  const log = opts.log ?? defaultLog;
  let stopped = false;

  // Promise-replacing wake signal: `wakePromise` is what the loop sleeps on,
  // `wakeResolve` is its resolver until it has been fired once.
  let wakeResolve: (() => void) | null = null;
  let wakePromise = new Promise<void>((resolve) => { wakeResolve = resolve; });

  const wake = (): void => {
    const resolve = wakeResolve;
    if (resolve) {
      wakeResolve = null;
      resolve();
    }
  };

  const tick = async (): Promise<void> => {
    while (!stopped) {
      try {
        await drainOnce(opts, log);
      } catch (e) {
        log("warn", "drain_tick_failed", { err: String(e) });
      }

      // Sleep up to POLL_INTERVAL_MS, but wake immediately on signal. A wake()
      // that arrived while the pass was running has already resolved
      // `wakePromise`, so the race returns at once and nothing is missed.
      let timer: ReturnType<typeof setTimeout> | undefined;
      const timeout = new Promise<void>((resolve) => {
        timer = setTimeout(resolve, POLL_INTERVAL_MS);
      });
      try {
        await Promise.race([wakePromise, timeout]);
      } finally {
        // Without this, every early wake would leave a live timer behind and
        // a pending timer would keep the process alive after close().
        clearTimeout(timer);
      }

      // Arm a fresh wake promise for the next iteration.
      wakePromise = new Promise<void>((resolve) => { wakeResolve = resolve; });
    }
  };

  // Keep the loop promise so close() can wait for the loop to really finish.
  const loop = tick();

  return {
    wake,
    close: async () => {
      stopped = true;
      wake();
      await loop;
    },
  };
}
/**
 * Run one drain pass: pick up to 32 due `pending` rows (oldest first), claim
 * each one, send it to the broker and record the outcome.
 *
 * Outcome per claimed row:
 * - broker accepted                 → `markDone`
 * - broker reported a permanent error, or no broker is known for the row's
 *   mesh                            → `markDead`
 * - transient broker error or a thrown exception → `backoffPending`, unless
 *   this was attempt MAX_ATTEMPTS_PER_ROW or later, in which case `markDead`
 * - session-attributed row whose session-WS is not open → `backoffPending`
 *   (deliberately not subject to the attempt cap: the session may reconnect)
 *
 * Everything that can throw after a row has been claimed — building the send
 * arguments, the session lookup and the send itself — runs inside one `try`,
 * so a failure backs off that single row instead of aborting the pass and
 * leaving the row in `inflight`.
 *
 * @param opts Database, broker map and optional session-broker lookup.
 * @param log  Logger used for per-row diagnostics.
 */
async function drainOnce(opts: DrainOptions, log: NonNullable<DrainOptions["log"]>): Promise<void> {
  const now = Date.now();
  const rows = opts.db.prepare(`
SELECT id, client_message_id, request_fingerprint, payload, attempts,
target_spec, nonce, ciphertext, priority, mesh,
sender_session_pubkey
FROM outbox
WHERE status = 'pending' AND next_attempt_at <= ?
ORDER BY enqueued_at
LIMIT 32
`).all<PendingRow>(now);
  if (rows.length === 0) return;

  for (const row of rows) {
    if (markInflight(opts.db, row.id, now) === 0) continue; // raced with another drainer

    // v1.26.0: pick the daemon-WS broker keyed by the row's mesh.
    // Legacy rows (mesh=NULL) fall back to the only broker if there's
    // exactly one; otherwise mark dead because we don't know where to
    // send them.
    let daemonBroker: DaemonBrokerClient | undefined;
    if (row.mesh) {
      daemonBroker = opts.brokers.get(row.mesh);
    } else if (opts.brokers.size === 1) {
      daemonBroker = opts.brokers.values().next().value;
    }
    if (!daemonBroker) {
      log("warn", "drain_no_broker_for_mesh", { id: row.id, mesh: row.mesh ?? "(null)" });
      markDead(opts.db, row.id, `no_broker_for_mesh:${row.mesh ?? "null"}`);
      continue;
    }

    // `row.attempts` was read before markInflight bumped the column, so this
    // is the number of the attempt being made right now.
    const attempt = row.attempts + 1;

    let res;
    try {
      const sendArgs = await toSendArgs(row);
      if (row.sender_session_pubkey) {
        // 1.34.0: session-attributed row. Its ciphertext was produced with
        // the session secret, so it MUST go out on that session's WS; we
        // never fall back to the daemon-WS because the recipient's decrypt
        // would fail. If the session-WS is missing or closed (reconnecting /
        // session terminated), back off and retry.
        const sessionBroker = opts.getSessionBrokerByPubkey?.(row.sender_session_pubkey);
        if (!sessionBroker || !sessionBroker.isOpen()) {
          log("info", "drain_session_ws_not_ready", {
            id: row.id, session_pubkey: row.sender_session_pubkey.slice(0, 12),
          });
          backoffPending(opts.db, row.id, attempt, "session_ws_not_open", "session_ws_not_open");
          continue;
        }
        res = await sessionBroker.send(sendArgs);
      } else {
        res = await daemonBroker.send(sendArgs);
      }
    } catch (e) {
      const err = String(e);
      log("warn", "drain_send_threw", { id: row.id, err });
      if (attempt >= MAX_ATTEMPTS_PER_ROW) {
        // A row whose send keeps throwing must not retry forever.
        log("warn", "drain_max_attempts", { id: row.id, err });
        markDead(opts.db, row.id, `max_attempts: ${err}`);
      } else {
        backoffPending(opts.db, row.id, attempt, "exception", err);
      }
      continue;
    }

    if (res.ok) {
      markDone(opts.db, row.id, res.messageId, Date.now());
    } else if (res.permanent) {
      log("warn", "drain_permanent_failure", { id: row.id, err: res.error });
      markDead(opts.db, row.id, res.error);
    } else if (attempt >= MAX_ATTEMPTS_PER_ROW) {
      log("warn", "drain_max_attempts", { id: row.id, err: res.error });
      markDead(opts.db, row.id, `max_attempts: ${res.error}`);
    } else {
      backoffPending(opts.db, row.id, attempt, "retry", res.error);
    }
  }
}

/**
 * Build the broker send arguments for one outbox row.
 *
 * Sprint 4 rows carry their resolved target, nonce and ciphertext and are
 * sent as stored (priority is passed through only for "now" / "low",
 * anything else becomes "next"). Legacy v0.9.0 rows, where those columns are
 * NULL, fall back to the broadcast smoke-test shape: target "*", a fresh
 * random nonce and the base64 of the raw payload.
 */
async function toSendArgs(row: PendingRow): Promise<{
  targetSpec: string;
  priority: "now" | "next" | "low";
  nonce: string;
  ciphertext: string;
  client_message_id: string;
  request_fingerprint_hex: string;
}> {
  const identity = {
    client_message_id: row.client_message_id,
    request_fingerprint_hex: bufferToHex(row.request_fingerprint),
  };
  if (row.target_spec && row.nonce && row.ciphertext) {
    return {
      targetSpec: row.target_spec,
      priority: (row.priority === "now" || row.priority === "low") ? row.priority : "next",
      nonce: row.nonce,
      ciphertext: row.ciphertext,
      ...identity,
    };
  }
  return {
    targetSpec: "*",
    priority: "next",
    nonce: await randomNonce(),
    ciphertext: Buffer.from(row.payload).toString("base64"),
    ...identity,
  };
}
/**
 * Claim a row for sending: flip it from `pending` to `inflight`, bump its
 * attempt counter and move next_attempt_at to `now + BACKOFF_CAP_MS`.
 *
 * @returns The number of rows the UPDATE changed; 0 means the row was no
 *          longer `pending` when the claim ran.
 */
function markInflight(db: SqliteDb, id: string, now: number): number {
  const nextAttemptAt = now + BACKOFF_CAP_MS;
  const claim = db.prepare(`
UPDATE outbox
SET status = 'inflight', attempts = attempts + 1, next_attempt_at = ?
WHERE id = ? AND status = 'pending'
`);
  const outcome = claim.run(nextAttemptAt, id);
  return Number(outcome.changes);
}
/**
 * Record a successful delivery: set the row to `done`, store the delivery
 * time and the broker's message id, and clear any previous error.
 */
function markDone(db: SqliteDb, id: string, brokerMessageId: string, now: number) {
  const finish = db.prepare(`
UPDATE outbox
SET status = 'done', delivered_at = ?, broker_message_id = ?, last_error = NULL
WHERE id = ?
`);
  finish.run(now, brokerMessageId, id);
}
/** Give up on a row: set it to `dead` and store the reason in last_error. */
function markDead(db: SqliteDb, id: string, err: string) {
  const bury = db.prepare(`UPDATE outbox SET status = 'dead', last_error = ? WHERE id = ?`);
  bury.run(err, id);
}
/**
 * Put a row back to `pending` with an exponential retry delay.
 *
 * The delay is BACKOFF_BASE_MS doubled `attempts` times (exponent limited to
 * 12) and then clamped to BACKOFF_CAP_MS. The row's attempt counter and
 * last_error are overwritten with the given values. `_kind` is accepted but
 * not used.
 */
function backoffPending(db: SqliteDb, id: string, attempts: number, _kind: string, err: string) {
  const doublings = Math.min(attempts, 12);
  const uncapped = BACKOFF_BASE_MS * Math.pow(2, doublings);
  const delayMs = Math.min(BACKOFF_CAP_MS, uncapped);
  const nextAttemptAt = Date.now() + delayMs;
  const reschedule = db.prepare(`
UPDATE outbox
SET status = 'pending', attempts = ?, next_attempt_at = ?, last_error = ?
WHERE id = ?
`);
  reschedule.run(attempts, nextAttemptAt, err, id);
}
/** Render bytes as lowercase hex, two characters per byte. */
function bufferToHex(b: Uint8Array): string {
  const pairs = Array.from(b, (byte) => byte.toString(16).padStart(2, "0"));
  return pairs.join("");
}
/** Produce 24 random bytes from node:crypto, encoded as base64. */
async function randomNonce(): Promise<string> {
  const nodeCrypto = await import("node:crypto");
  const raw = nodeCrypto.randomBytes(24);
  return raw.toString("base64");
}
/**
 * Fallback logger: emit one JSON line per event containing `level`, `msg`,
 * the caller's `meta` fields and an ISO-8601 `ts` (which always wins over a
 * `ts` key in `meta`). "info" goes to stdout, everything else to stderr.
 */
function defaultLog(level: "info" | "warn" | "error", msg: string, meta?: Record<string, unknown>) {
  const record = { level, msg, ...meta, ts: new Date().toISOString() };
  const sink = level === "info" ? process.stdout : process.stderr;
  sink.write(`${JSON.stringify(record)}\n`);
}
// This is the only place in the file that references the type-only
// OutboxStatus import; it exists to suppress the unused-status warning under
// strict tsc. The array is discarded via `void` and never read at runtime.
const _statuses: OutboxStatus[] = ["pending", "inflight", "done", "dead", "aborted"];
void _statuses;