feat(broker): production hardening — caps, limits, metrics, logging
Adds the minimum ops surface area for a production broker without
over-engineering. All new config knobs are env-var driven with sane
defaults.
New modules:
- logger.ts: structured JSON logs (one line, stderr, ready for
Loki/Datadog ingestion without preprocessing)
- metrics.ts: in-process Prometheus counters + gauges, exposed at
GET /metrics. Tracks connections, messages, queue depth, TTL
sweeps, hook requests, DB health.
- rate-limit.ts: token-bucket rate limiter keyed by (pid, cwd).
Applied to POST /hook/set-status at 30/min default.
- db-health.ts: Postgres ping loop with exponential-backoff retry.
GET /health returns 503 while DB is down.
- build-info.ts: version + gitSha (from GIT_SHA env or `git rev-parse`
fallback) + uptime, surfaced on /health.
Behavior changes:
- Connection caps: MAX_CONNECTIONS_PER_MESH (default 100). Exceed →
close(1008, "capacity") + metric increment.
- Message size: MAX_MESSAGE_BYTES (default 65536). WS applies it via
`ws.maxPayload`. Hook POST bodies cap out with 413.
- Structured logs everywhere replacing the old `log()` helper.
- Env validation stricter: DATABASE_URL required + regex-checked for
postgres:// prefix.
New endpoints:
- GET /health → {status, db, version, gitSha, uptime}. 503 if DB down.
- GET /metrics → Prometheus text format.
Verified: 21/21 tests still pass. Hit /health + /metrics live —
gitSha resolves correctly via `git rev-parse --short HEAD` in dev.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
70
apps/broker/src/db-health.ts
Normal file
70
apps/broker/src/db-health.ts
Normal file
@@ -0,0 +1,70 @@
|
||||
/**
|
||||
* Postgres connection health check with backoff retry.
|
||||
*
|
||||
* We don't tear down the broker on a transient DB blip — the
|
||||
* surrounding HTTP/WS layer keeps serving, /health flips to 503,
|
||||
* and the metrics gauge reflects reality. New queries will naturally
|
||||
* fail while the DB is down; connectors that have retry logic of
|
||||
* their own (postgres.js does) will recover transparently.
|
||||
*/
|
||||
|
||||
import { sql } from "drizzle-orm";
|
||||
import { db } from "./db";
|
||||
import { log } from "./logger";
|
||||
import { metrics } from "./metrics";
|
||||
|
||||
/** Outcome of the most recent ping; stays false until a ping succeeds. */
let healthy = false;

/** Number of failed pings since the last successful one. */
let consecutiveFailures = 0;

/**
 * Handle of the pending re-poll timer, or null when none is scheduled.
 * The poll loop arms it with `setTimeout`, so it is typed after that API
 * rather than `setInterval`.
 */
let pollTimer: ReturnType<typeof setTimeout> | null = null;
|
||||
|
||||
/**
 * Reports the cached result of the most recent DB ping.
 *
 * This never touches the database itself; it only reads the module-level
 * flag that `pingDb` maintains.
 *
 * @returns `true` if the last ping succeeded, `false` otherwise (including
 *   before any ping has completed).
 */
export function isDbHealthy(): boolean {
  return healthy;
}
|
||||
|
||||
/**
 * Probes the database once with `SELECT 1` and records the outcome.
 *
 * On success the health flag is set, the failure counter is reset and the
 * `dbHealthy` gauge is set to 1. On failure the counter is incremented, the
 * flag is cleared and the gauge is set to 0.
 *
 * Logging is edge-triggered to avoid flooding during an outage: recovery is
 * logged only when the previous state was unhealthy, and a failure is logged
 * only when the previous state was healthy or it is the first failure in a
 * row.
 *
 * @returns `true` if the probe succeeded, `false` if it threw.
 */
export async function pingDb(): Promise<boolean> {
  try {
    await db.execute(sql`SELECT 1`);

    const wasDown = !healthy;
    if (wasDown) {
      log.info("db healthy", { prior_failures: consecutiveFailures });
    }

    healthy = true;
    consecutiveFailures = 0;
    metrics.dbHealthy.set(1);
    return true;
  } catch (err) {
    consecutiveFailures += 1;

    // Only the first failure of a streak (or a healthy -> unhealthy flip)
    // is worth a log line; later failures are visible through the gauge.
    const isNewOutage = healthy || consecutiveFailures === 1;
    if (isNewOutage) {
      const reason = err instanceof Error ? err.message : String(err);
      log.error("db ping failed", {
        consecutive_failures: consecutiveFailures,
        error: reason,
      });
    }

    healthy = false;
    metrics.dbHealthy.set(0);
    return false;
  }
}
|
||||
|
||||
/**
 * Identity of the currently running poll loop, or null when stopped.
 *
 * `pollTimer` alone cannot say whether a loop is running, because it is
 * null while a ping is in flight. Each loop captures its own token and
 * re-checks it after every await, so stopping or restarting reliably
 * retires the old loop.
 */
let activePollLoop: symbol | null = null;

/**
 * Starts the background DB health poll.
 *
 * Pings immediately, then re-polls every 30s while healthy. While unhealthy
 * it backs off exponentially: 500ms * 2^failures, with the exponent capped
 * at 6 and the delay capped at 30s. Calling this while a loop is already
 * running is a no-op, even if that loop's first ping has not resolved yet.
 * Call `stopDbHealth` on shutdown.
 */
export function startDbHealth(): void {
  if (activePollLoop !== null) return;

  const loopToken = Symbol("db-health-loop");
  activePollLoop = loopToken;

  const tick = async (): Promise<void> => {
    await pingDb();

    // The loop may have been stopped (or stopped and restarted) while the
    // ping was in flight; in that case do not re-arm the timer.
    if (activePollLoop !== loopToken) return;

    const next = healthy
      ? 30_000
      : Math.min(30_000, 500 * Math.pow(2, Math.min(consecutiveFailures, 6)));

    pollTimer = setTimeout(() => {
      void tick();
    }, next);
  };

  void tick();
}

/**
 * Stops the background DB health poll.
 *
 * Cancels any pending timer and invalidates the running loop, so a ping
 * that is still in flight will not schedule another poll when it settles.
 * Safe to call when no loop is running.
 */
export function stopDbHealth(): void {
  activePollLoop = null;
  // The cast bridges the declared handle type to clearTimeout's parameter
  // type, which may differ depending on the runtime's type definitions.
  if (pollTimer) clearTimeout(pollTimer as unknown as number);
  pollTimer = null;
}
|
||||
Reference in New Issue
Block a user