feat(broker): production hardening — caps, limits, metrics, logging
Adds the minimum ops surface area for a production broker without
over-engineering. All new config knobs are env-var driven with sane
defaults.
New modules:
- logger.ts: structured JSON logs (one line, stderr, ready for
Loki/Datadog ingestion without preprocessing)
- metrics.ts: in-process Prometheus counters + gauges, exposed at
GET /metrics. Tracks connections, messages, queue depth, TTL
sweeps, hook requests, DB health.
- rate-limit.ts: token-bucket rate limiter keyed by (pid, cwd).
Applied to POST /hook/set-status at 30/min default.
- db-health.ts: Postgres ping loop with exponential-backoff retry.
GET /health returns 503 while DB is down.
- build-info.ts: version + gitSha (from GIT_SHA env or `git rev-parse`
fallback) + uptime, surfaced on /health.
Behavior changes:
- Connection caps: MAX_CONNECTIONS_PER_MESH (default 100). Exceed →
close(1008, "capacity") + metric increment.
- Message size: MAX_MESSAGE_BYTES (default 65536). WS applies it via
`ws.maxPayload`; hook POST bodies over the limit are rejected with 413.
- Structured logs everywhere replacing the old `log()` helper.
- Env validation stricter: DATABASE_URL required + regex-checked for
postgres:// prefix.
New endpoints:
- GET /health → {status, db, version, gitSha, uptime}. 503 if DB down.
- GET /metrics → Prometheus text format.
Verified: 21/21 tests still pass. Hit /health + /metrics live —
gitSha resolves correctly via `git rev-parse --short HEAD` in dev.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
61
apps/broker/src/rate-limit.ts
Normal file
61
apps/broker/src/rate-limit.ts
Normal file
@@ -0,0 +1,61 @@
/**
 * Token-bucket rate limiter keyed by an arbitrary string.
 *
 * Used to cap POST /hook/set-status at a sane per-session rate
 * (hook scripts legitimately fire every turn; anything faster is
 * either a loop or a compromised agent).
 *
 * In-process only. If we scale to multiple broker instances this
 * moves to Redis, but for the single-instance broker it's enough.
 */

/** Mutable per-key state tracked by {@link TokenBucket}. */
interface Bucket {
  /** Tokens currently available; fractional while a token is partly refilled. */
  tokens: number;
  /** Timestamp (ms, same clock as `now`) up to which refill has been credited. */
  lastRefill: number;
}

/**
 * In-memory token-bucket rate limiter. Each distinct key gets its own
 * bucket that starts full, is drained by one token per allowed call, and
 * refills continuously at a fixed rate up to `capacity`.
 *
 * State lives in a plain `Map`, so buckets are neither shared between
 * instances nor persisted. Call {@link TokenBucket.sweep} periodically to
 * drop idle keys; nothing in this class schedules it.
 */
export class TokenBucket {
  private readonly buckets = new Map<string, Bucket>();
  private readonly refillPerMs: number;

  /**
   * @param capacity Maximum number of tokens a bucket can hold (burst size).
   * @param refillPerMinute Tokens added back per minute of elapsed time.
   * @throws RangeError If either argument is NaN or negative. A NaN would
   *   otherwise poison the token count and make every `take()` succeed,
   *   silently disabling the limiter.
   */
  constructor(
    private readonly capacity: number,
    refillPerMinute: number,
  ) {
    if (Number.isNaN(capacity) || capacity < 0) {
      throw new RangeError(
        `TokenBucket capacity must be a non-negative number, got ${capacity}`,
      );
    }
    if (Number.isNaN(refillPerMinute) || refillPerMinute < 0) {
      throw new RangeError(
        `TokenBucket refillPerMinute must be a non-negative number, got ${refillPerMinute}`,
      );
    }
    this.refillPerMs = refillPerMinute / 60_000;
  }

  /**
   * Take one token for `key`.
   *
   * @param key Identifier of the bucket to draw from; created full on first use.
   * @param now Current time in milliseconds; defaults to `Date.now()`.
   * @returns `true` if a token was available and consumed, `false` if the
   *   caller is rate-limited.
   */
  take(key: string, now = Date.now()): boolean {
    const bucket = this.buckets.get(key) ?? {
      tokens: this.capacity,
      lastRefill: now,
    };

    const elapsed = now - bucket.lastRefill;
    if (elapsed > 0) {
      // Credit the time since the last refill, never exceeding capacity.
      bucket.tokens = Math.min(
        this.capacity,
        bucket.tokens + elapsed * this.refillPerMs,
      );
      bucket.lastRefill = now;
    } else if (elapsed < 0) {
      // The clock moved backwards (Date.now() is not monotonic). Re-anchor
      // without crediting tokens so refill resumes from the new time instead
      // of stalling until the clock passes the old timestamp again.
      bucket.lastRefill = now;
    }

    // Store the bucket whether or not the request is allowed, so denied
    // first-time keys are tracked too.
    this.buckets.set(key, bucket);

    if (bucket.tokens < 1) {
      return false;
    }
    bucket.tokens -= 1;
    return true;
  }

  /**
   * Periodic GC: drop buckets whose `lastRefill` is more than `olderThanMs`
   * before `now`.
   *
   * @param olderThanMs Idle threshold in milliseconds (default 10 minutes).
   * @param now Current time in milliseconds; defaults to `Date.now()`.
   */
  sweep(olderThanMs = 10 * 60 * 1000, now = Date.now()): void {
    for (const [key, bucket] of this.buckets) {
      if (now - bucket.lastRefill > olderThanMs) this.buckets.delete(key);
    }
  }

  /** Number of keys that currently have a bucket. */
  get size(): number {
    return this.buckets.size;
  }
}
|
||||
Reference in New Issue
Block a user