fix(broker): sync with runner on boot instead of re-deploying
Boot restore now checks runner /health to see what's already running, then updates DB status to match. Fixes the bug where broker restart marked running services as 'failed' because it tried to re-deploy without shared source volume. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -3660,33 +3660,35 @@ function main(): void {
|
|||||||
for (const m of allMeshes) {
|
for (const m of allMeshes) {
|
||||||
const running = await getRunningServices(m.id);
|
const running = await getRunningServices(m.id);
|
||||||
if (running.length === 0) continue;
|
if (running.length === 0) continue;
|
||||||
log.info("restoring services for mesh", { mesh_id: m.id, count: running.length });
|
log.info("syncing services for mesh", { mesh_id: m.id, count: running.length });
|
||||||
for (const svc of running) {
|
// Sync DB status with runner reality instead of re-deploying
|
||||||
try {
|
try {
|
||||||
const config = (svc.config as Record<string, unknown>) ?? {};
|
const healthRes = await fetch(`${env.RUNNER_URL}/health`);
|
||||||
// Decrypt env vars if stored encrypted
|
const health = await healthRes.json() as { ok: boolean; services: Array<{ name: string; status: string; tools: number }> };
|
||||||
let resolvedEnv: Record<string, string> | undefined;
|
const runnerServices = new Map((health.services ?? []).map((s: any) => [s.name, s]));
|
||||||
if (config._encryptedEnv) {
|
|
||||||
const decrypted = decryptFromStorage(config._encryptedEnv as string);
|
for (const svc of running) {
|
||||||
if (decrypted) {
|
const runnerSvc = runnerServices.get(svc.name);
|
||||||
resolvedEnv = JSON.parse(decrypted);
|
if (runnerSvc && runnerSvc.status === "running") {
|
||||||
} else {
|
// Runner has it running — update DB to match
|
||||||
log.warn("failed to decrypt env for service", { service: svc.name });
|
log.info("service still running on runner", { service: svc.name, mesh_id: m.id });
|
||||||
}
|
// Refresh tools from runner
|
||||||
|
try {
|
||||||
|
const toolsRes = await fetch(`${env.RUNNER_URL}/list?name=${svc.name}`);
|
||||||
|
const toolsData = await toolsRes.json() as { tools?: any[] };
|
||||||
|
if (toolsData.tools) {
|
||||||
|
await updateServiceStatus(m.id, svc.name, "running", { toolsSchema: toolsData.tools });
|
||||||
|
}
|
||||||
|
} catch { /* keep existing tools */ }
|
||||||
|
} else {
|
||||||
|
// Runner doesn't have it — mark as stopped
|
||||||
|
log.warn("service not found on runner, marking stopped", { service: svc.name });
|
||||||
|
await updateServiceStatus(m.id, svc.name, "stopped");
|
||||||
}
|
}
|
||||||
const sourcePath = `${env.CLAUDEMESH_SERVICES_DIR}/${m.id}/${svc.name}/source`;
|
|
||||||
await serviceManager.deploy({
|
|
||||||
meshId: m.id,
|
|
||||||
name: svc.name,
|
|
||||||
sourcePath,
|
|
||||||
config: { runtime: svc.runtime as any, ...(config.memory_mb ? { memory_mb: config.memory_mb as number } : {}) },
|
|
||||||
resolvedEnv,
|
|
||||||
});
|
|
||||||
log.info("service restored", { service: svc.name, mesh_id: m.id });
|
|
||||||
} catch (e) {
|
|
||||||
log.error("service restore failed", { service: svc.name, error: e instanceof Error ? e.message : String(e) });
|
|
||||||
await updateServiceStatus(m.id, svc.name, "failed");
|
|
||||||
}
|
}
|
||||||
|
} catch (e) {
|
||||||
|
log.warn("runner health check failed during restore", { error: e instanceof Error ? e.message : String(e) });
|
||||||
|
// Runner might not be up yet — don't mark services as failed
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
|
|||||||
Reference in New Issue
Block a user