feat(broker): production hardening — caps, limits, metrics, logging

Adds the minimum ops surface area for a production broker without over-engineering. All new config knobs are env-var driven with sane defaults.

New modules:
- logger.ts: structured JSON logs (one line, stderr, ready for Loki/Datadog ingestion without preprocessing)
- metrics.ts: in-process Prometheus counters + gauges, exposed at GET /metrics. Tracks connections, messages, queue depth, TTL sweeps, hook requests, DB health.
- rate-limit.ts: token-bucket rate limiter keyed by (pid, cwd). Applied to POST /hook/set-status at 30/min default.
- db-health.ts: Postgres ping loop with exponential-backoff retry. GET /health returns 503 while DB is down.
- build-info.ts: version + gitSha (from GIT_SHA env or `git rev-parse` fallback) + uptime, surfaced on /health.

Behavior changes:
- Connection caps: MAX_CONNECTIONS_PER_MESH (default 100). Exceed → close(1008, "capacity") + metric increment.
- Message size: MAX_MESSAGE_BYTES (default 65536). WS applies it via `ws.maxPayload`. Hook POST bodies cap out with 413.
- Structured logs everywhere replacing the old `log()` helper.
- Env validation stricter: DATABASE_URL required + regex-checked for postgres:// prefix.

New endpoints:
- GET /health → {status, db, version, gitSha, uptime}. 503 if DB down.
- GET /metrics → Prometheus text format.

Verified: 21/21 tests still pass. Hit /health + /metrics live — gitSha resolves correctly via `git rev-parse --short HEAD` in dev.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-04 22:14:31 +01:00
parent 84e14ff410
commit 5bf815b304
8 changed files with 630 additions and 139 deletions
--- a/apps/broker/src/db-health.ts
+++ b/apps/broker/src/db-health.ts
@@ -0,0 +1,70 @@
+/**
+ * Postgres connection health check with backoff retry.
+ *
+ * We don't tear down the broker on a transient DB blip — the
+ * surrounding HTTP/WS layer keeps serving, /health flips to 503,
+ * and the metrics gauge reflects reality. New queries will naturally
+ * fail while the DB is down; connectors that have retry logic of
+ * their own (postgres.js does) will recover transparently.
+ */
+
+import { sql } from "drizzle-orm";
+import { db } from "./db";
+import { log } from "./logger";
+import { metrics } from "./metrics";
+
+// Outcome of the most recent ping. Starts false, so the DB is reported as
+// unhealthy until a ping has actually succeeded.
+let healthy = false;
+// Number of failed pings since the last success; a successful ping resets it.
+let consecutiveFailures = 0;
+// Handle of the most recently scheduled ping; null before the first one is
+// scheduled and after stopDbHealth(). Typed after setTimeout because that is
+// the call that creates the handle (and clearTimeout the one that cancels it).
+let pollTimer: ReturnType<typeof setTimeout> | null = null;
+
+/**
+ * Report the outcome of the most recent DB ping without touching the DB.
+ *
+ * @returns `true` if the last ping succeeded; `false` if it failed or if no
+ *   ping has completed successfully yet.
+ */
+export function isDbHealthy(): boolean {
+  const lastPingSucceeded = healthy;
+  return lastPingSucceeded;
+}
+
+/**
+ * Run a single `SELECT 1` against the database and record the outcome in the
+ * module's health state and the `dbHealthy` metrics gauge.
+ *
+ * Logging is edge-triggered: a recovery is logged once (with the number of
+ * failures that preceded it) and only the first failure of a streak is
+ * logged, so a long outage does not flood the log.
+ *
+ * @returns `true` when the ping succeeded, `false` when it failed. Query
+ *   errors are caught and reported through the return value.
+ */
+export async function pingDb(): Promise<boolean> {
+  try {
+    await db.execute(sql`SELECT 1`);
+
+    const recovering = !healthy;
+    if (recovering) {
+      // Report the streak length before it is reset below.
+      log.info("db healthy", { prior_failures: consecutiveFailures });
+    }
+
+    consecutiveFailures = 0;
+    healthy = true;
+    metrics.dbHealthy.set(1);
+    return true;
+  } catch (err) {
+    consecutiveFailures += 1;
+
+    // Stay quiet on repeat failures within the same outage.
+    const repeatFailure = !healthy && consecutiveFailures !== 1;
+    if (!repeatFailure) {
+      const payload = {
+        consecutive_failures: consecutiveFailures,
+        error: err instanceof Error ? err.message : String(err),
+      };
+      log.error("db ping failed", payload);
+    }
+
+    healthy = false;
+    metrics.dbHealthy.set(0);
+    return false;
+  }
+}
+
+/**
+ * Identity token of the running poll loop, or null when no loop is active.
+ * Each start installs a fresh token. A tick that resumes after its ping and
+ * no longer finds its own token knows the loop was stopped (or replaced by a
+ * newer one) while it was awaiting, and must not reschedule itself.
+ */
+let activePollLoop: object | null = null;
+
+/**
+ * Start polling the DB in the background: every 30s while healthy, and on an
+ * exponential backoff (doubling from 500ms, capped at 30s) while unhealthy.
+ * The first ping is issued immediately. Calling this while a loop is already
+ * running is a no-op. Call stopDbHealth on shutdown.
+ */
+export function startDbHealth(): void {
+  // Guard on the loop token rather than pollTimer: pollTimer is still null
+  // while the very first ping is in flight, so it cannot detect a second
+  // start issued during that window.
+  if (activePollLoop !== null) return;
+  const loopToken = {};
+  activePollLoop = loopToken;
+
+  const tick = async (): Promise<void> => {
+    await pingDb();
+    // stopDbHealth() may have run while the ping was pending; without this
+    // check the loop would quietly re-arm itself after being stopped.
+    if (activePollLoop !== loopToken) return;
+    const next = healthy
+      ? 30_000
+      : Math.min(30_000, 500 * Math.pow(2, Math.min(consecutiveFailures, 6)));
+    pollTimer = setTimeout(() => {
+      void tick();
+    }, next);
+  };
+  void tick();
+}
+
+/**
+ * Stop the background poll started by startDbHealth. Cancels the pending
+ * timer, if any, and invalidates the running loop so that a ping currently
+ * in flight does not schedule a follow-up. Safe to call when not started.
+ */
+export function stopDbHealth(): void {
+  activePollLoop = null;
+  if (pollTimer !== null) clearTimeout(pollTimer);
+  pollTimer = null;
+}