fix(broker): balance per-mesh connection counter — stop capacity-leak that bricks meshes
`connectionsPerMesh` (the in-memory counter enforcing MAX_CONNECTIONS_PER_MESH=100) was incremented on every successful hello (incMeshCount at member + session paths) but decremented ONLY inside evictPresenceFully. Every other removal path — session-id dedup on reconnect (the common one), kick, and ban — deleted the entry from `connections` without decrementing, leaking +1 each time. Because the counter is in-memory and only resets on broker restart, it crept up to 100 over hours/days of normal reconnect churn (network blips, sleep/wake, relaunches) until the mesh hit capacity and rejected ALL new connections with `1008 "capacity"` — bricking it until the broker process was restarted. A user with <10 sessions saw "mesh at connection capacity" because the 100 were leaked phantoms, not live connections. Fix: route every non-evict removal through a new dropConnection() helper that deletes from `connections` AND decMeshCount()s, so the counter tracks map membership exactly. The replaced socket's own close handler then no-ops (entry already gone, guarded by `if (!conn) return`), so the decrement happens exactly once — no double-count. evictPresenceFully keeps its existing balanced delete+dec. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1635,6 +1635,30 @@ function decMeshCount(meshId: string): void {
|
|||||||
metrics.connectionsActive.set(connections.size);
|
metrics.connectionsActive.set(connections.size);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Remove a connection from the in-memory map AND keep the per-mesh
|
||||||
|
* connection counter balanced.
|
||||||
|
*
|
||||||
|
* THE counter (`connectionsPerMesh`) is incremented once per successful
|
||||||
|
* hello (incMeshCount). Before this helper, it was decremented ONLY in
|
||||||
|
* evictPresenceFully — so every other removal path (session-id dedup on
|
||||||
|
* reconnect, kick, ban) deleted the entry from `connections` without
|
||||||
|
* decrementing, leaking +1 each time. Over many reconnects the counter
|
||||||
|
* crept up to MAX_CONNECTIONS_PER_MESH and the mesh got bricked with
|
||||||
|
* `capacity` rejections until the broker process restarted (the counter
|
||||||
|
* is in-memory). Routing every non-evict removal through here makes the
|
||||||
|
* counter track `connections` membership exactly. The replaced socket's
|
||||||
|
* own close handler then no-ops (entry already gone), so the decrement
|
||||||
|
* happens exactly once. Returns true if an entry was actually removed.
|
||||||
|
*/
|
||||||
|
function dropConnection(presenceId: string): boolean {
|
||||||
|
const c = connections.get(presenceId);
|
||||||
|
if (!c) return false; // already removed (e.g. evicted) — don't double-decrement
|
||||||
|
connections.delete(presenceId);
|
||||||
|
decMeshCount(c.meshId);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
function sendError(
|
function sendError(
|
||||||
ws: WebSocket,
|
ws: WebSocket,
|
||||||
code: string,
|
code: string,
|
||||||
@@ -1954,7 +1978,7 @@ async function handleHello(
|
|||||||
if (oldConn.meshId === hello.meshId && oldConn.sessionId === hello.sessionId) {
|
if (oldConn.meshId === hello.meshId && oldConn.sessionId === hello.sessionId) {
|
||||||
log.info("hello dedup", { old_presence: oldPid, session_id: hello.sessionId });
|
log.info("hello dedup", { old_presence: oldPid, session_id: hello.sessionId });
|
||||||
try { oldConn.ws.close(1000, "session_replaced"); } catch { /* already dead */ }
|
try { oldConn.ws.close(1000, "session_replaced"); } catch { /* already dead */ }
|
||||||
connections.delete(oldPid);
|
dropConnection(oldPid);
|
||||||
void disconnectPresence(oldPid);
|
void disconnectPresence(oldPid);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -2241,7 +2265,7 @@ async function handleSessionHello(
|
|||||||
if (oldConn.meshId === hello.meshId && oldConn.sessionId === hello.sessionId) {
|
if (oldConn.meshId === hello.meshId && oldConn.sessionId === hello.sessionId) {
|
||||||
log.info("session_hello dedup", { old_presence: oldPid, session_id: hello.sessionId });
|
log.info("session_hello dedup", { old_presence: oldPid, session_id: hello.sessionId });
|
||||||
try { oldConn.ws.close(1000, "session_replaced"); } catch { /* already dead */ }
|
try { oldConn.ws.close(1000, "session_replaced"); } catch { /* already dead */ }
|
||||||
connections.delete(oldPid);
|
dropConnection(oldPid);
|
||||||
void disconnectPresence(oldPid);
|
void disconnectPresence(oldPid);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -4932,7 +4956,7 @@ function handleConnection(ws: WebSocket): void {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
try { peer.ws.close(closeCode, closeReason); } catch {}
|
try { peer.ws.close(closeCode, closeReason); } catch {}
|
||||||
connections.delete(pid);
|
dropConnection(pid);
|
||||||
void disconnectPresence(pid);
|
void disconnectPresence(pid);
|
||||||
affected.push(peer.displayName || pid);
|
affected.push(peer.displayName || pid);
|
||||||
}
|
}
|
||||||
@@ -4947,7 +4971,7 @@ function handleConnection(ws: WebSocket): void {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
try { peer.ws.close(closeCode, `${closeReason}_stale`); } catch {}
|
try { peer.ws.close(closeCode, `${closeReason}_stale`); } catch {}
|
||||||
connections.delete(pid);
|
dropConnection(pid);
|
||||||
void disconnectPresence(pid);
|
void disconnectPresence(pid);
|
||||||
affected.push(peer.displayName || pid);
|
affected.push(peer.displayName || pid);
|
||||||
}
|
}
|
||||||
@@ -4961,7 +4985,7 @@ function handleConnection(ws: WebSocket): void {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
try { peer.ws.close(closeCode, closeReason); } catch {}
|
try { peer.ws.close(closeCode, closeReason); } catch {}
|
||||||
connections.delete(pid);
|
dropConnection(pid);
|
||||||
void disconnectPresence(pid);
|
void disconnectPresence(pid);
|
||||||
affected.push(peer.displayName || pid);
|
affected.push(peer.displayName || pid);
|
||||||
}
|
}
|
||||||
@@ -5017,7 +5041,7 @@ function handleConnection(ws: WebSocket): void {
|
|||||||
for (const [pid, peer] of connections) {
|
for (const [pid, peer] of connections) {
|
||||||
if (peer.meshId === conn.meshId && peer.memberPubkey === targetMember.peerPubkey) {
|
if (peer.meshId === conn.meshId && peer.memberPubkey === targetMember.peerPubkey) {
|
||||||
try { peer.ws.close(4002, "banned"); } catch {}
|
try { peer.ws.close(4002, "banned"); } catch {}
|
||||||
connections.delete(pid);
|
dropConnection(pid);
|
||||||
void disconnectPresence(pid);
|
void disconnectPresence(pid);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user