fix(broker): sync with runner on boot instead of re-deploying
Some checks failed
CI / Lint (push) Has been cancelled
CI / Typecheck (push) Has been cancelled
CI / Broker tests (Postgres) (push) Has been cancelled
CI / Docker build (linux/amd64) (push) Has been cancelled

Boot restore now checks runner /health to see what's already running,
then updates DB status to match. Fixes the bug where broker restart
marked running services as 'failed' because it tried to re-deploy
without shared source volume.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Alejandro Gutiérrez
2026-04-08 16:26:43 +01:00
parent 4c385a16cc
commit b6224c4186

View File

@@ -3660,33 +3660,35 @@ function main(): void {
for (const m of allMeshes) {
const running = await getRunningServices(m.id);
if (running.length === 0) continue;
log.info("restoring services for mesh", { mesh_id: m.id, count: running.length });
for (const svc of running) {
try {
const config = (svc.config as Record<string, unknown>) ?? {};
// Decrypt env vars if stored encrypted
let resolvedEnv: Record<string, string> | undefined;
if (config._encryptedEnv) {
const decrypted = decryptFromStorage(config._encryptedEnv as string);
if (decrypted) {
resolvedEnv = JSON.parse(decrypted);
} else {
log.warn("failed to decrypt env for service", { service: svc.name });
}
log.info("syncing services for mesh", { mesh_id: m.id, count: running.length });
// Sync DB status with runner reality instead of re-deploying
try {
const healthRes = await fetch(`${env.RUNNER_URL}/health`);
const health = await healthRes.json() as { ok: boolean; services: Array<{ name: string; status: string; tools: number }> };
const runnerServices = new Map((health.services ?? []).map((s: any) => [s.name, s]));
for (const svc of running) {
const runnerSvc = runnerServices.get(svc.name);
if (runnerSvc && runnerSvc.status === "running") {
// Runner has it running — update DB to match
log.info("service still running on runner", { service: svc.name, mesh_id: m.id });
// Refresh tools from runner
try {
const toolsRes = await fetch(`${env.RUNNER_URL}/list?name=${svc.name}`);
const toolsData = await toolsRes.json() as { tools?: any[] };
if (toolsData.tools) {
await updateServiceStatus(m.id, svc.name, "running", { toolsSchema: toolsData.tools });
}
} catch { /* keep existing tools */ }
} else {
// Runner doesn't have it — mark as stopped
log.warn("service not found on runner, marking stopped", { service: svc.name });
await updateServiceStatus(m.id, svc.name, "stopped");
}
const sourcePath = `${env.CLAUDEMESH_SERVICES_DIR}/${m.id}/${svc.name}/source`;
await serviceManager.deploy({
meshId: m.id,
name: svc.name,
sourcePath,
config: { runtime: svc.runtime as any, ...(config.memory_mb ? { memory_mb: config.memory_mb as number } : {}) },
resolvedEnv,
});
log.info("service restored", { service: svc.name, mesh_id: m.id });
} catch (e) {
log.error("service restore failed", { service: svc.name, error: e instanceof Error ? e.message : String(e) });
await updateServiceStatus(m.id, svc.name, "failed");
}
} catch (e) {
log.warn("runner health check failed during restore", { error: e instanceof Error ? e.message : String(e) });
// Runner might not be up yet — don't mark services as failed
}
}
} catch (e) {