feat(cli): self-healing daemon lifecycle
every daemon-routed verb now probes the ipc socket via /v1/version (instead of trusting existsSync), cleans up stale sock/pid files left by a crashed daemon, and auto-spawns a detached `claudemesh daemon up` under a file-lock when the daemon is down. polls for liveness up to a budget (3s for ad-hoc verbs, 10s for launch) before falling through to cold path. includes a per-process result cache (script doing 50 sends pays spawn cost at most once), a 30s recently-failed marker (no thundering-herd retries on crash-loop), a spawn-lock (concurrent invocations share one attempt), and a recursion guard env var (nested cli calls inside the daemon process skip auto-spawn). fixes the stale-socket bug where launch's ensureDaemonRunning returned early on a left-over socket file from a crashed daemon, silently breaking the spawned claude session's mcp shim. deferred to 1.28.0: --strict / --no-daemon flags, lazy-loading of cold-path code, per-session ipc tokens. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,5 +1,54 @@
|
|||||||
# Changelog
|
# Changelog
|
||||||
|
|
||||||
|
## 1.27.3 (2026-05-04) — self-healing daemon lifecycle
|
||||||
|
|
||||||
|
The CLI now auto-recovers from a dead daemon on every invocation
|
||||||
|
instead of silently mis-routing through a stale socket.
|
||||||
|
|
||||||
|
### What changed
|
||||||
|
|
||||||
|
- New `services/daemon/lifecycle.ts` — single helper that probes the
|
||||||
|
IPC socket via `/v1/version` (instead of trusting `existsSync`),
|
||||||
|
cleans up stale `daemon.sock` / `daemon.pid` files, and auto-spawns
|
||||||
|
a detached `claudemesh daemon up` under a file-lock when the daemon
|
||||||
|
is missing.
|
||||||
|
- Polls for socket liveness up to a budget (3 s for ad-hoc verbs,
|
||||||
|
10 s for `claudemesh launch`) before falling through.
|
||||||
|
- Recently-failed marker (`~/.claudemesh/daemon/.spawn-failure`,
|
||||||
|
30 s TTL) prevents thundering-herd retries when the daemon
|
||||||
|
crash-loops at startup.
|
||||||
|
- Spawn-lock (`~/.claudemesh/daemon/.spawn.lock`) ensures concurrent
|
||||||
|
CLI invocations share one spawn attempt instead of racing.
|
||||||
|
- Per-process result cache — a script doing 50 sends pays the spawn
|
||||||
|
cost at most once, not 50 times.
|
||||||
|
- Recursion guard via `CLAUDEMESH_INTERNAL_NO_AUTOSPAWN=1` env (set
|
||||||
|
on the spawned daemon's env) so nested CLI calls inside the daemon
|
||||||
|
process don't re-trigger spawn.
|
||||||
|
|
||||||
|
### User-visible behavior
|
||||||
|
|
||||||
|
- `peer list`, `send`, `state get`, etc. now restart the daemon
|
||||||
|
automatically when invoked while the daemon is down.
|
||||||
|
- One-line stderr info on auto-restart:
|
||||||
|
`[claudemesh] info daemon restarted automatically (took 615ms)`.
|
||||||
|
- Cold-path fallback fires only when auto-spawn fails or is
|
||||||
|
suppressed by the recently-failed marker; in those cases a `warn`
|
||||||
|
line points at the daemon log (spawn failed) or at `claudemesh doctor` (suppressed).
|
||||||
|
|
||||||
|
### Bug fixed
|
||||||
|
|
||||||
|
`claudemesh launch`'s `ensureDaemonRunning` previously checked only
|
||||||
|
`existsSync(SOCK_FILE)` and returned early on a stale socket left by
|
||||||
|
a crashed daemon — silently breaking new sessions. Now delegates to
|
||||||
|
the lifecycle helper which probes the socket and recovers.
|
||||||
|
|
||||||
|
### What's not in this patch
|
||||||
|
|
||||||
|
- `--strict` and `--no-daemon` flags (deferred to D in 1.28.0).
|
||||||
|
- Lazy-loading of cold-path code (deferred to 1.28.0).
|
||||||
|
- Per-session IPC tokens (deferred to 1.28.0 alongside D's
|
||||||
|
thin-client conversion).
|
||||||
|
|
||||||
## 1.27.2 (2026-05-04) — skill: full-flag launch templates
|
## 1.27.2 (2026-05-04) — skill: full-flag launch templates
|
||||||
|
|
||||||
Documentation-only ship. `skills/claudemesh/SKILL.md` gains a canonical
|
Documentation-only ship. `skills/claudemesh/SKILL.md` gains a canonical
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "claudemesh-cli",
|
"name": "claudemesh-cli",
|
||||||
"version": "1.27.2",
|
"version": "1.27.3",
|
||||||
"description": "Peer mesh for Claude Code sessions — CLI + MCP server.",
|
"description": "Peer mesh for Claude Code sessions — CLI + MCP server.",
|
||||||
"keywords": [
|
"keywords": [
|
||||||
"claude-code",
|
"claude-code",
|
||||||
|
|||||||
@@ -49,46 +49,28 @@ export interface LaunchFlags {
|
|||||||
*
|
*
|
||||||
* As of 1.24.0 the daemon owns the broker WS and feeds the MCP push-pipe
|
* As of 1.24.0 the daemon owns the broker WS and feeds the MCP push-pipe
|
||||||
* over IPC SSE. If the socket is absent when Claude boots its MCP shim,
|
* over IPC SSE. If the socket is absent when Claude boots its MCP shim,
|
||||||
* the shim bails (no fallback). So we probe for the socket here and, if
|
* the shim bails (no fallback). Delegates to the shared lifecycle helper
|
||||||
* missing, spawn `claudemesh daemon up --mesh <slug>` in the background,
|
* (services/daemon/lifecycle.ts) which probes the socket properly
|
||||||
* waiting briefly for the socket to appear.
|
* (avoiding the stale-socket bug where existsSync was a false positive
|
||||||
*
|
* after a daemon crash), spawns under a file-lock, and polls for liveness.
|
||||||
* Best-effort: if the daemon spawn fails, we surface the error and let
|
|
||||||
* the launch proceed — Claude Code will print the same "daemon not
|
|
||||||
* running" message and the user can fix it manually.
|
|
||||||
*/
|
*/
|
||||||
async function ensureDaemonRunning(meshSlug: string, quiet: boolean): Promise<void> {
|
async function ensureDaemonRunning(meshSlug: string, quiet: boolean): Promise<void> {
|
||||||
const { DAEMON_PATHS } = await import("~/daemon/paths.js");
|
const { ensureDaemonReady } = await import("~/services/daemon/lifecycle.js");
|
||||||
if (existsSync(DAEMON_PATHS.SOCK_FILE)) return;
|
if (!quiet) render.info("ensuring claudemesh daemon is running…");
|
||||||
|
// Larger budget for `launch` — it's a one-shot flow where the user
|
||||||
if (!quiet) render.info("starting claudemesh daemon…");
|
// is actively waiting; cold node start + broker hello can take
|
||||||
const { spawn } = await import("node:child_process");
|
// longer than the default 3s budget for ad-hoc verbs.
|
||||||
const argv0 = process.argv[1] ?? "claudemesh";
|
const res = await ensureDaemonReady({ budgetMs: 10_000, mesh: meshSlug });
|
||||||
let binary = argv0;
|
if (res.state === "up") {
|
||||||
if (/\.ts$/.test(binary) || /node_modules|src\/entrypoints/.test(binary)) {
|
if (!quiet) render.ok("daemon already running");
|
||||||
try {
|
return;
|
||||||
const { execSync } = await import("node:child_process");
|
|
||||||
binary = execSync("which claudemesh", { encoding: "utf8" }).trim();
|
|
||||||
} catch { binary = "claudemesh"; }
|
|
||||||
}
|
}
|
||||||
const child = spawn(binary, ["daemon", "up", "--mesh", meshSlug], {
|
if (res.state === "started") {
|
||||||
detached: true,
|
if (!quiet) render.ok(`daemon ready (${res.durationMs}ms)`);
|
||||||
stdio: "ignore",
|
return;
|
||||||
});
|
|
||||||
child.unref();
|
|
||||||
|
|
||||||
// Wait for the socket to appear. 10 s budget — covers cold node start +
|
|
||||||
// broker hello round-trip on slow links.
|
|
||||||
const start = Date.now();
|
|
||||||
while (Date.now() - start < 10_000) {
|
|
||||||
if (existsSync(DAEMON_PATHS.SOCK_FILE)) {
|
|
||||||
if (!quiet) render.ok("daemon ready");
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
await new Promise((r) => setTimeout(r, 200));
|
|
||||||
}
|
}
|
||||||
render.warn(
|
render.warn(
|
||||||
"daemon failed to start within 10s",
|
`daemon ${res.state}${res.reason ? `: ${res.reason}` : ""}`,
|
||||||
"Run `claudemesh daemon up --mesh " + meshSlug + "` manually, then re-launch.",
|
"Run `claudemesh daemon up --mesh " + meshSlug + "` manually, then re-launch.",
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,21 +1,40 @@
|
|||||||
// Try forwarding a send through the local daemon's IPC. Returns null if
|
// Try forwarding a send through the local daemon's IPC. Returns null if
|
||||||
// the daemon isn't running or the daemon's mesh doesn't match the target
|
// the daemon isn't running or the daemon's mesh doesn't match the target
|
||||||
// mesh — the caller falls back to the bridge or cold path.
|
// mesh — the caller falls back to the bridge or cold path.
|
||||||
|
//
|
||||||
import { existsSync } from "node:fs";
|
// Auto-recovery: when the daemon socket is missing or stale, every
|
||||||
|
// helper here calls into the lifecycle module which probes, spawns
|
||||||
|
// (under a lock), polls, and retries — so cold-path fallback only
|
||||||
|
// fires if auto-spawn failed. The lifecycle module caches its
|
||||||
|
// per-process result, so a script doing 50 sends pays the spawn cost
|
||||||
|
// at most once.
|
||||||
|
|
||||||
import { ipc } from "~/daemon/ipc/client.js";
|
import { ipc } from "~/daemon/ipc/client.js";
|
||||||
import { DAEMON_PATHS } from "~/daemon/paths.js";
|
import { ensureDaemonReady } from "~/services/daemon/lifecycle.js";
|
||||||
|
import { warnDaemonState } from "~/ui/warnings.ts";
|
||||||
|
|
||||||
function meshQuery(mesh?: string): string {
|
function meshQuery(mesh?: string): string {
|
||||||
return mesh ? `?mesh=${encodeURIComponent(mesh)}` : "";
|
return mesh ? `?mesh=${encodeURIComponent(mesh)}` : "";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Shared gate for every daemon-routed helper in this file.
 *
 * Asks the lifecycle module to make the daemon reachable, passes the outcome
 * to the stderr warning helper, and tells the caller whether it may proceed
 * with its IPC request.
 *
 * @returns `true` when the daemon state is "up" or "started"; `false` when
 *   the caller should fall back.
 */
async function daemonReachable(): Promise<boolean> {
  const outcome = await ensureDaemonReady();

  // The helpers in this file cannot see --json / --quiet, so the warning is
  // always requested with default options. JSON callers should use the
  // lifecycle module directly instead of going through this gate.
  // NOTE(review): the original comment states that --quiet handling at the
  // top of each verb already redirects stderr where needed — not visible here.
  warnDaemonState(outcome, {});

  switch (outcome.state) {
    case "up":
    case "started":
      return true;
    default:
      return false;
  }
}
|
||||||
|
|
||||||
/** Try fetching the peer list through the daemon (~1ms warm IPC).
|
/** Try fetching the peer list through the daemon (~1ms warm IPC).
|
||||||
* Returns null when the daemon socket isn't present so the caller can
|
* Returns null when the daemon socket isn't present so the caller can
|
||||||
* fall back to bridge / cold paths. */
|
* fall back to bridge / cold paths. */
|
||||||
export async function tryListPeersViaDaemon(mesh?: string): Promise<unknown[] | null> {
|
export async function tryListPeersViaDaemon(mesh?: string): Promise<unknown[] | null> {
|
||||||
if (!existsSync(DAEMON_PATHS.SOCK_FILE)) return null;
|
if (!(await daemonReachable())) return null;
|
||||||
try {
|
try {
|
||||||
const res = await ipc<{ peers?: unknown[] }>({ path: `/v1/peers${meshQuery(mesh)}`, timeoutMs: 3_000 });
|
const res = await ipc<{ peers?: unknown[] }>({ path: `/v1/peers${meshQuery(mesh)}`, timeoutMs: 3_000 });
|
||||||
if (res.status !== 200) return null;
|
if (res.status !== 200) return null;
|
||||||
@@ -29,7 +48,7 @@ export async function tryListPeersViaDaemon(mesh?: string): Promise<unknown[] |
|
|||||||
|
|
||||||
/** Try fetching mesh-published skills through the daemon. */
|
/** Try fetching mesh-published skills through the daemon. */
|
||||||
export async function tryListSkillsViaDaemon(mesh?: string): Promise<unknown[] | null> {
|
export async function tryListSkillsViaDaemon(mesh?: string): Promise<unknown[] | null> {
|
||||||
if (!existsSync(DAEMON_PATHS.SOCK_FILE)) return null;
|
if (!(await daemonReachable())) return null;
|
||||||
try {
|
try {
|
||||||
const res = await ipc<{ skills?: unknown[] }>({ path: `/v1/skills${meshQuery(mesh)}`, timeoutMs: 3_000 });
|
const res = await ipc<{ skills?: unknown[] }>({ path: `/v1/skills${meshQuery(mesh)}`, timeoutMs: 3_000 });
|
||||||
if (res.status !== 200) return null;
|
if (res.status !== 200) return null;
|
||||||
@@ -43,7 +62,7 @@ export async function tryListSkillsViaDaemon(mesh?: string): Promise<unknown[] |
|
|||||||
|
|
||||||
/** Try fetching one skill body through the daemon. */
|
/** Try fetching one skill body through the daemon. */
|
||||||
export async function tryGetSkillViaDaemon(name: string, mesh?: string): Promise<unknown | null> {
|
export async function tryGetSkillViaDaemon(name: string, mesh?: string): Promise<unknown | null> {
|
||||||
if (!existsSync(DAEMON_PATHS.SOCK_FILE)) return null;
|
if (!(await daemonReachable())) return null;
|
||||||
try {
|
try {
|
||||||
const res = await ipc<{ skill?: unknown }>({
|
const res = await ipc<{ skill?: unknown }>({
|
||||||
path: `/v1/skills/${encodeURIComponent(name)}${meshQuery(mesh)}`,
|
path: `/v1/skills/${encodeURIComponent(name)}${meshQuery(mesh)}`,
|
||||||
@@ -70,7 +89,7 @@ export type StateEntry = {
|
|||||||
* - undefined when the daemon ran but the key is unset (404)
|
* - undefined when the daemon ran but the key is unset (404)
|
||||||
* - null when the daemon socket isn't present (caller falls back) */
|
* - null when the daemon socket isn't present (caller falls back) */
|
||||||
export async function tryGetStateViaDaemon(key: string, mesh?: string): Promise<StateEntry | undefined | null> {
|
export async function tryGetStateViaDaemon(key: string, mesh?: string): Promise<StateEntry | undefined | null> {
|
||||||
if (!existsSync(DAEMON_PATHS.SOCK_FILE)) return null;
|
if (!(await daemonReachable())) return null;
|
||||||
try {
|
try {
|
||||||
const path = `/v1/state?key=${encodeURIComponent(key)}${mesh ? `&mesh=${encodeURIComponent(mesh)}` : ""}`;
|
const path = `/v1/state?key=${encodeURIComponent(key)}${mesh ? `&mesh=${encodeURIComponent(mesh)}` : ""}`;
|
||||||
const res = await ipc<{ state?: StateEntry; error?: string }>({ path, timeoutMs: 3_000 });
|
const res = await ipc<{ state?: StateEntry; error?: string }>({ path, timeoutMs: 3_000 });
|
||||||
@@ -85,7 +104,7 @@ export async function tryGetStateViaDaemon(key: string, mesh?: string): Promise<
|
|||||||
}
|
}
|
||||||
|
|
||||||
export async function tryListStateViaDaemon(mesh?: string): Promise<StateEntry[] | null> {
|
export async function tryListStateViaDaemon(mesh?: string): Promise<StateEntry[] | null> {
|
||||||
if (!existsSync(DAEMON_PATHS.SOCK_FILE)) return null;
|
if (!(await daemonReachable())) return null;
|
||||||
try {
|
try {
|
||||||
const res = await ipc<{ entries?: StateEntry[] }>({ path: `/v1/state${meshQuery(mesh)}`, timeoutMs: 3_000 });
|
const res = await ipc<{ entries?: StateEntry[] }>({ path: `/v1/state${meshQuery(mesh)}`, timeoutMs: 3_000 });
|
||||||
if (res.status !== 200) return null;
|
if (res.status !== 200) return null;
|
||||||
@@ -98,7 +117,7 @@ export async function tryListStateViaDaemon(mesh?: string): Promise<StateEntry[]
|
|||||||
}
|
}
|
||||||
|
|
||||||
export async function trySetStateViaDaemon(key: string, value: unknown, mesh?: string): Promise<boolean> {
|
export async function trySetStateViaDaemon(key: string, value: unknown, mesh?: string): Promise<boolean> {
|
||||||
if (!existsSync(DAEMON_PATHS.SOCK_FILE)) return false;
|
if (!(await daemonReachable())) return false;
|
||||||
try {
|
try {
|
||||||
const res = await ipc<{ ok?: boolean; error?: string }>({
|
const res = await ipc<{ ok?: boolean; error?: string }>({
|
||||||
method: "POST",
|
method: "POST",
|
||||||
@@ -122,7 +141,7 @@ export type MemoryEntry = {
|
|||||||
};
|
};
|
||||||
|
|
||||||
export async function tryRememberViaDaemon(content: string, tags?: string[], mesh?: string): Promise<{ id: string; mesh?: string } | null> {
|
export async function tryRememberViaDaemon(content: string, tags?: string[], mesh?: string): Promise<{ id: string; mesh?: string } | null> {
|
||||||
if (!existsSync(DAEMON_PATHS.SOCK_FILE)) return null;
|
if (!(await daemonReachable())) return null;
|
||||||
try {
|
try {
|
||||||
const res = await ipc<{ id?: string; mesh?: string; error?: string }>({
|
const res = await ipc<{ id?: string; mesh?: string; error?: string }>({
|
||||||
method: "POST",
|
method: "POST",
|
||||||
@@ -136,7 +155,7 @@ export async function tryRememberViaDaemon(content: string, tags?: string[], mes
|
|||||||
}
|
}
|
||||||
|
|
||||||
export async function tryRecallViaDaemon(query: string, mesh?: string): Promise<MemoryEntry[] | null> {
|
export async function tryRecallViaDaemon(query: string, mesh?: string): Promise<MemoryEntry[] | null> {
|
||||||
if (!existsSync(DAEMON_PATHS.SOCK_FILE)) return null;
|
if (!(await daemonReachable())) return null;
|
||||||
try {
|
try {
|
||||||
const path = `/v1/memory?q=${encodeURIComponent(query)}${mesh ? `&mesh=${encodeURIComponent(mesh)}` : ""}`;
|
const path = `/v1/memory?q=${encodeURIComponent(query)}${mesh ? `&mesh=${encodeURIComponent(mesh)}` : ""}`;
|
||||||
const res = await ipc<{ matches?: MemoryEntry[] }>({ path, timeoutMs: 5_000 });
|
const res = await ipc<{ matches?: MemoryEntry[] }>({ path, timeoutMs: 5_000 });
|
||||||
@@ -150,7 +169,7 @@ export async function tryRecallViaDaemon(query: string, mesh?: string): Promise<
|
|||||||
}
|
}
|
||||||
|
|
||||||
export async function tryForgetViaDaemon(id: string, mesh?: string): Promise<boolean> {
|
export async function tryForgetViaDaemon(id: string, mesh?: string): Promise<boolean> {
|
||||||
if (!existsSync(DAEMON_PATHS.SOCK_FILE)) return false;
|
if (!(await daemonReachable())) return false;
|
||||||
try {
|
try {
|
||||||
const path = `/v1/memory/${encodeURIComponent(id)}${meshQuery(mesh)}`;
|
const path = `/v1/memory/${encodeURIComponent(id)}${meshQuery(mesh)}`;
|
||||||
const res = await ipc<{ ok?: boolean }>({ method: "DELETE", path, timeoutMs: 3_000 });
|
const res = await ipc<{ ok?: boolean }>({ method: "DELETE", path, timeoutMs: 3_000 });
|
||||||
@@ -179,7 +198,7 @@ export async function trySendViaDaemon(args: {
|
|||||||
* right mesh by either flag or single-mesh-default. */
|
* right mesh by either flag or single-mesh-default. */
|
||||||
expectedMesh?: string;
|
expectedMesh?: string;
|
||||||
}): Promise<DaemonSendResult | null> {
|
}): Promise<DaemonSendResult | null> {
|
||||||
if (!existsSync(DAEMON_PATHS.SOCK_FILE)) return null;
|
if (!(await daemonReachable())) return null;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const res = await ipc<{
|
const res = await ipc<{
|
||||||
|
|||||||
243
apps/cli/src/services/daemon/lifecycle.ts
Normal file
243
apps/cli/src/services/daemon/lifecycle.ts
Normal file
@@ -0,0 +1,243 @@
|
|||||||
|
/**
|
||||||
|
* Daemon lifecycle helper — probe, auto-spawn, retry, fall-through.
|
||||||
|
*
|
||||||
|
* Every daemon-routed CLI verb passes through `ensureDaemonReady()` before
|
||||||
|
* its IPC call. The helper:
|
||||||
|
*
|
||||||
|
* 1. Probes the socket via a fast `/v1/version` IPC (~5-10 ms).
|
||||||
|
* 2. If the socket is missing OR present-but-stale, attempts a detached
|
||||||
|
* `claudemesh daemon up` spawn under a file-lock.
|
||||||
|
* 3. Polls for the new socket up to a budget (default 3s).
|
||||||
|
* 4. Returns a state describing what happened, so the caller can either
|
||||||
|
* proceed warm or fall back to the cold path with a clear warning.
|
||||||
|
*
|
||||||
|
* State machine:
|
||||||
|
* - "up" daemon was already running
|
||||||
|
* - "started" daemon was down; we spawned it; it came up
|
||||||
|
* - "down" daemon was down; auto-spawn skipped (e.g., recursion guard)
|
||||||
|
* - "spawn-failed" spawn attempted but socket never appeared within budget
|
||||||
|
* - "spawn-suppressed" recently-failed marker is fresh; skipped retry
|
||||||
|
*
|
||||||
|
* Stale-socket handling: if the socket file exists but the IPC probe
|
||||||
|
* fails (ECONNREFUSED / timeout), we treat the file as stale, remove
|
||||||
|
* it, and proceed as if the daemon were down. This fixes the prior bug
|
||||||
|
* where `existsSync(SOCK_FILE)` was a false positive after a daemon
|
||||||
|
* crash.
|
||||||
|
*
|
||||||
|
* Recursion guard: when we spawn the daemon we set
|
||||||
|
* `CLAUDEMESH_INTERNAL_NO_AUTOSPAWN=1` in its env so any nested CLI
|
||||||
|
* calls inside the daemon skip the auto-spawn check and avoid a loop.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { existsSync, readFileSync, statSync, unlinkSync, writeFileSync } from "node:fs";
|
||||||
|
import { join } from "node:path";
|
||||||
|
|
||||||
|
import { ipc, IpcError } from "~/daemon/ipc/client.js";
|
||||||
|
import { DAEMON_PATHS } from "~/daemon/paths.js";
|
||||||
|
|
||||||
|
export type DaemonReadyState =
|
||||||
|
| "up"
|
||||||
|
| "started"
|
||||||
|
| "down"
|
||||||
|
| "spawn-failed"
|
||||||
|
| "spawn-suppressed";
|
||||||
|
|
||||||
|
export interface EnsureDaemonResult {
|
||||||
|
state: DaemonReadyState;
|
||||||
|
/** Total ms spent in this call (probe ± spawn ± poll). */
|
||||||
|
durationMs: number;
|
||||||
|
/** When state is `down`, `spawn-failed` or `spawn-suppressed`, a one-line reason. */
|
||||||
|
reason?: string;
|
||||||
|
}
|
||||||
|
|
||||||
|
export interface EnsureDaemonOpts {
|
||||||
|
/** Total budget for socket-appearance polling after spawn. Default 3000ms. */
|
||||||
|
budgetMs?: number;
|
||||||
|
/** Skip auto-spawn entirely. Forced on by the recursion guard env var; also intended for the planned `--no-daemon` flag. */
|
||||||
|
noAutoSpawn?: boolean;
|
||||||
|
/** When auto-spawning a legacy single-mesh daemon, pin a slug. Omit for multi-mesh (default). */
|
||||||
|
mesh?: string;
|
||||||
|
}
|
||||||
|
|
||||||
|
const SPAWN_LOCK_FILE = () => join(DAEMON_PATHS.DAEMON_DIR, ".spawn.lock");
|
||||||
|
const SPAWN_FAIL_FILE = () => join(DAEMON_PATHS.DAEMON_DIR, ".spawn-failure");
|
||||||
|
const SPAWN_FAIL_TTL_MS = 30_000;
|
||||||
|
const PROBE_TIMEOUT_MS = 800;
|
||||||
|
|
||||||
|
let lastResultThisProcess: EnsureDaemonResult | null = null;
|
||||||
|
|
||||||
|
/**
 * Make sure the daemon is reachable, auto-spawning it when permitted.
 *
 * A successful outcome ("up" or "started") is memoised for the lifetime of
 * this process, so a script issuing many daemon-routed calls pays the
 * probe/spawn cost at most once. Unsuccessful outcomes are recorded but never
 * served from the cache: the next call runs the full check again.
 *
 * When `CLAUDEMESH_INTERNAL_NO_AUTOSPAWN=1` is present in the environment,
 * auto-spawn is disabled regardless of `opts.noAutoSpawn`.
 *
 * @param opts Budget, auto-spawn and mesh options; all optional.
 * @returns The outcome of this (or the memoised earlier) check.
 */
export async function ensureDaemonReady(opts: EnsureDaemonOpts = {}): Promise<EnsureDaemonResult> {
  const remembered = lastResultThisProcess;
  const rememberedState = remembered?.state;
  if (remembered && (rememberedState === "up" || rememberedState === "started")) {
    return remembered;
  }

  const recursionGuarded = process.env.CLAUDEMESH_INTERNAL_NO_AUTOSPAWN === "1";
  const effectiveOpts: EnsureDaemonOpts = recursionGuarded ? { ...opts, noAutoSpawn: true } : opts;

  lastResultThisProcess = await runEnsureDaemon(effectiveOpts);
  return lastResultThisProcess;
}
|
||||||
|
|
||||||
|
/**
 * Test helper: drop the memoised result so the next `ensureDaemonReady()`
 * call performs a full check again.
 */
export function _resetDaemonReadyCache(): void {
  lastResultThisProcess = null;
}
|
||||||
|
|
||||||
|
/**
 * One uncached pass of the lifecycle check: probe, clean up stale files,
 * then spawn unless spawning is disabled or recently failed.
 *
 * @param opts Options already merged with the recursion-guard override.
 * @returns The resulting state plus the wall-clock time spent in this call.
 */
async function runEnsureDaemon(opts: EnsureDaemonOpts): Promise<EnsureDaemonResult> {
  const begunAt = Date.now();
  const elapsedMs = (): number => Date.now() - begunAt;

  // Probe first; a healthy daemon ends the check immediately.
  const liveness = await probeDaemon();
  if (liveness === "up") {
    return { state: "up", durationMs: elapsedMs() };
  }
  if (liveness === "stale") {
    cleanupStaleFiles();
  }

  // Callers (or the recursion guard) may forbid spawning altogether.
  if (opts.noAutoSpawn) {
    return { state: "down", durationMs: elapsedMs(), reason: "auto-spawn disabled" };
  }

  // Back off while the failure marker is fresh so a crash-looping daemon is
  // not respawned by every invocation.
  if (recentSpawnFailureFresh()) {
    const ttlSeconds = Math.round(SPAWN_FAIL_TTL_MS / 1000);
    return {
      state: "spawn-suppressed",
      durationMs: elapsedMs(),
      reason: `daemon failed to start within last ${ttlSeconds}s`,
    };
  }

  const { ok, reason } = await spawnDaemon(opts);
  if (!ok) {
    // Record the failure so later invocations take the suppressed path.
    markSpawnFailure();
    return { state: "spawn-failed", durationMs: elapsedMs(), reason };
  }
  return { state: "started", durationMs: elapsedMs() };
}
|
||||||
|
|
||||||
|
/**
 * Classify the daemon socket.
 *
 * @returns "absent" when the socket file does not exist, "up" when
 *   `/v1/version` answers with HTTP 200 within the probe timeout, and "stale"
 *   for every other outcome (non-200 reply or any thrown error).
 */
async function probeDaemon(): Promise<"up" | "absent" | "stale"> {
  const socketPresent = existsSync(DAEMON_PATHS.SOCK_FILE);
  if (!socketPresent) return "absent";

  try {
    const { status } = await ipc<{ version?: string }>({
      path: "/v1/version",
      timeoutMs: PROBE_TIMEOUT_MS,
    });
    return status === 200 ? "up" : "stale";
  } catch (err) {
    if (err instanceof IpcError || /ENOENT|ECONNREFUSED|ipc_timeout|EPIPE|ECONNRESET/.test(String(err))) {
      // The failure modes explicitly anticipated for a socket nobody serves.
      return "stale";
    }
    // Unrecognised errors are classified the same way: anything short of a
    // 200 from /v1/version means the daemon cannot be used.
    return "stale";
  }
}
|
||||||
|
|
||||||
|
/**
 * Remove the daemon's socket and pid files, ignoring any removal error
 * (for example when a file is already gone).
 *
 * NOTE(review): probeDaemon() reports "stale" for timeouts and non-200
 * replies too, so a live but slow daemon could have its files removed here —
 * confirm that is acceptable.
 */
function cleanupStaleFiles(): void {
  const leftovers = [DAEMON_PATHS.SOCK_FILE, DAEMON_PATHS.PID_FILE];
  leftovers.forEach((file) => {
    try {
      unlinkSync(file);
    } catch {
      // best-effort
    }
  });
}
|
||||||
|
|
||||||
|
/**
 * Report whether the spawn-failure marker file was modified less than
 * `SPAWN_FAIL_TTL_MS` ago. A missing or unreadable marker counts as "not fresh".
 */
function recentSpawnFailureFresh(): boolean {
  let markerMtimeMs: number;
  try {
    markerMtimeMs = statSync(SPAWN_FAIL_FILE()).mtimeMs;
  } catch {
    return false;
  }
  const ageMs = Date.now() - markerMtimeMs;
  return ageMs < SPAWN_FAIL_TTL_MS;
}
|
||||||
|
|
||||||
|
/**
 * Write (or refresh) the spawn-failure marker with the current timestamp,
 * readable by the owner only. Write errors are ignored.
 */
function markSpawnFailure(): void {
  try {
    const markerPath = SPAWN_FAIL_FILE();
    const stamp = String(Date.now());
    writeFileSync(markerPath, stamp, { mode: 0o600 });
  } catch {
    // best-effort
  }
}
|
||||||
|
|
||||||
|
/** Delete the spawn-failure marker; a missing marker or unlink error is ignored. */
function clearSpawnFailure(): void {
  try {
    const markerPath = SPAWN_FAIL_FILE();
    unlinkSync(markerPath);
  } catch {
    // best-effort
  }
}
|
||||||
|
|
||||||
|
interface SpawnResult { ok: boolean; reason?: string; }
|
||||||
|
|
||||||
|
/**
 * Launch a detached `claudemesh daemon up` (or wait on another process that
 * is already launching one) and poll until the daemon answers.
 *
 * @param opts `budgetMs` bounds the polling (default 3000 ms); `mesh`, when
 *   set, is forwarded as `--mesh <slug>`.
 * @returns `{ ok: true }` once the daemon answers the probe, otherwise
 *   `{ ok: false, reason }`. Never throws: launch errors are folded into
 *   `reason`.
 */
async function spawnDaemon(opts: EnsureDaemonOpts): Promise<SpawnResult> {
  const budgetMs = opts.budgetMs ?? 3_000;

  const lockResult = await acquireOrShareLock(opts);
  if (lockResult === "wait-existing") {
    // Another process is spawning; just wait for the socket to appear.
    return await pollForSocket(budgetMs);
  }

  try {
    const { spawn } = await import("node:child_process");
    const binary = await resolveCliBinary();
    const args = ["daemon", "up"];
    if (opts.mesh) args.push("--mesh", opts.mesh);

    const child = spawn(binary, args, {
      detached: true,
      stdio: "ignore",
      // Recursion guard: CLI calls made inside the daemon must not auto-spawn.
      env: { ...process.env, CLAUDEMESH_INTERNAL_NO_AUTOSPAWN: "1" },
    });

    // A failed launch (e.g. binary not on PATH) is reported asynchronously
    // through the 'error' event. Without a listener Node raises it as an
    // uncaught exception, which would kill the CLI instead of letting the
    // caller fall back to the cold path.
    const launch: { error?: Error } = {};
    child.on("error", (err: Error) => {
      launch.error = err;
    });
    child.unref();

    const polled = await pollForSocket(budgetMs);
    if (polled.ok) {
      clearSpawnFailure();
      return polled;
    }
    if (launch.error) {
      // Prefer the concrete launch error over the generic poll timeout.
      return { ok: false, reason: `could not launch ${binary}: ${launch.error.message}` };
    }
    return polled;
  } catch (err) {
    return { ok: false, reason: err instanceof Error ? err.message : String(err) };
  } finally {
    releaseLock();
  }
}
|
||||||
|
|
||||||
|
/**
 * Acquire the advisory spawn lock, or report that a live process holds it.
 *
 * The lock file is created with the exclusive `wx` flag, so when two
 * processes race only one create succeeds; the other sees `EEXIST` and checks
 * the recorded holder. A lock whose holder is dead (or whose content is not a
 * pid) is evicted and the create is retried once.
 *
 * The lock stays best-effort: if it cannot be written for any reason other
 * than already existing, the caller proceeds as if it had been acquired. Two
 * processes evicting the same dead lock simultaneously can still both end up
 * "acquired"; that window is accepted.
 *
 * @returns "acquired" when this process should perform the spawn,
 *   "wait-existing" when a live process already holds the lock.
 */
async function acquireOrShareLock(_opts: EnsureDaemonOpts): Promise<"acquired" | "wait-existing"> {
  const lockPath = SPAWN_LOCK_FILE();

  for (let attempt = 0; attempt < 2; attempt++) {
    try {
      writeFileSync(lockPath, String(process.pid), { flag: "wx", mode: 0o600 });
      return "acquired";
    } catch (err) {
      if (spawnLockErrorCode(err) !== "EEXIST") {
        // Missing directory, read-only fs, …: the lock is advisory, carry on.
        return "acquired";
      }
    }

    if (spawnLockHolderAlive(lockPath)) return "wait-existing";

    // Holder is dead or the lock is unreadable — evict it and retry the create.
    try {
      unlinkSync(lockPath);
    } catch {
      // best-effort
    }
  }

  return "acquired";
}

/** Extract the `code` property of a thrown value as a string, if it has one. */
function spawnLockErrorCode(err: unknown): string | undefined {
  if (typeof err !== "object" || err === null || !("code" in err)) return undefined;
  return String((err as { code: unknown }).code);
}

/** True when the lock file names a pid that currently exists. */
function spawnLockHolderAlive(lockPath: string): boolean {
  let pid: number;
  try {
    pid = Number.parseInt(readFileSync(lockPath, "utf8").trim(), 10);
  } catch {
    return false;
  }
  if (!Number.isFinite(pid) || pid <= 0) return false;

  try {
    process.kill(pid, 0); // signal 0 = existence check, nothing is delivered
    return true;
  } catch (err) {
    // EPERM: the process exists but belongs to another user — still alive.
    return spawnLockErrorCode(err) === "EPERM";
  }
}
|
||||||
|
|
||||||
|
/**
 * Release the spawn lock, but only when the lock file still records this
 * process's pid. Lock acquisition is advisory, so by the time we release the
 * file may belong to another process; deleting it then would let a further
 * process start a concurrent spawn. Read and unlink errors are ignored.
 */
function releaseLock(): void {
  try {
    const lockPath = SPAWN_LOCK_FILE();
    const holder = readFileSync(lockPath, "utf8").trim();
    if (holder !== String(process.pid)) return;
    unlinkSync(lockPath);
  } catch {
    // best-effort
  }
}
|
||||||
|
|
||||||
|
/**
 * Wait until the daemon answers the probe, checking every 150 ms.
 *
 * @param budgetMs Maximum time to keep starting new polling rounds.
 * @returns `{ ok: true }` as soon as a probe reports "up"; otherwise
 *   `{ ok: false, reason }` once the budget is exhausted.
 */
async function pollForSocket(budgetMs: number): Promise<SpawnResult> {
  const pause = (): Promise<unknown> => new Promise((wake) => setTimeout(wake, 150));
  const begunAt = Date.now();

  while (Date.now() - begunAt < budgetMs) {
    // File presence alone is not trusted — the daemon must actually answer.
    const socketPresent = existsSync(DAEMON_PATHS.SOCK_FILE);
    if (socketPresent && (await probeDaemon()) === "up") {
      return { ok: true };
    }
    await pause();
  }

  return { ok: false, reason: `socket did not appear within ${budgetMs}ms` };
}
|
||||||
|
|
||||||
|
/**
 * Pick the executable used to launch `claudemesh daemon up`.
 *
 * Normally this is `process.argv[1]`. When that path looks like a
 * development or package-internal entry point (ends in `.ts`, or contains
 * `node_modules` / `src/entrypoints`), the `claudemesh` found by `which` is
 * used instead, falling back to the bare command name `claudemesh`.
 *
 * @returns A path or command name suitable for `child_process.spawn`.
 */
async function resolveCliBinary(): Promise<string> {
  const argv1 = process.argv[1] ?? "claudemesh";
  const looksLikeSourceEntry = /\.ts$/.test(argv1) || /node_modules|src\/entrypoints/.test(argv1);
  if (!looksLikeSourceEntry) return argv1;

  try {
    const { execSync } = await import("node:child_process");
    const located = execSync("which claudemesh", {
      encoding: "utf8",
      // execSync sends the child's stderr to our stderr by default; silence it
      // so a failed lookup does not print shell noise on the user's terminal.
      stdio: ["ignore", "pipe", "ignore"],
    }).trim();
    return located || "claudemesh";
  } catch {
    // `which` is missing or found nothing — let spawn resolve via PATH.
    return "claudemesh";
  }
}
|
||||||
54
apps/cli/src/ui/warnings.ts
Normal file
54
apps/cli/src/ui/warnings.ts
Normal file
@@ -0,0 +1,54 @@
|
|||||||
|
/**
|
||||||
|
* Once-per-process daemon-state warnings, routed to stderr.
|
||||||
|
*
|
||||||
|
* Suppressed under --quiet (caller responsibility — we never inspect
|
||||||
|
* argv). JSON callers should consult the result's `state` field
|
||||||
|
* directly and skip calling this helper.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import type { EnsureDaemonResult } from "~/services/daemon/lifecycle.js";
|
||||||
|
import { dim } from "./styles.js";
|
||||||
|
|
||||||
|
let alreadyWarned = false;
|
||||||
|
|
||||||
|
export interface WarnDaemonOpts {
|
||||||
|
quiet?: boolean;
|
||||||
|
/** When true, emit nothing — the caller will surface the state in JSON. */
|
||||||
|
json?: boolean;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Write at most one line per process to stderr describing the outcome of
 * `ensureDaemonReady`.
 *
 * Nothing is written when a line was already emitted earlier in this process,
 * when `opts.quiet` or `opts.json` is set, or when the daemon was simply
 * "up". The once-per-process latch is only set when none of those apply.
 *
 * @param res The lifecycle result to describe.
 * @param opts Suppression flags supplied by the caller.
 * @returns `true` when a line was written, `false` otherwise.
 */
export function warnDaemonState(
  res: EnsureDaemonResult,
  opts: WarnDaemonOpts = {},
): boolean {
  const nothingToSay = alreadyWarned || opts.quiet || opts.json || res.state === "up";
  if (nothingToSay) return false;

  alreadyWarned = true;

  const info = "[claudemesh] info";
  const warn = "[claudemesh] warn";

  let line: string;
  if (res.state === "started") {
    line = `${info} daemon restarted automatically ${dim(`(took ${res.durationMs}ms)`)}`;
  } else if (res.state === "down") {
    line = `${info} daemon not running — using cold path ${dim("(slower; run `claudemesh daemon up` for warm path)")}`;
  } else if (res.state === "spawn-suppressed") {
    const why = res.reason ?? "daemon failed to start recently";
    line = `${warn} ${why} — using cold path ${dim("(run `claudemesh doctor`)")}`;
  } else if (res.state === "spawn-failed") {
    const detail = res.reason ? `: ${res.reason}` : "";
    line = `${warn} daemon spawn failed${detail} — using cold path ${dim("(check ~/.claudemesh/daemon/daemon.log)")}`;
  } else {
    // No message is defined for any other state.
    return false;
  }

  process.stderr.write(`${line}\n`);
  return true;
}
|
||||||
|
|
||||||
|
/**
 * Test helper: clear the once-per-process latch so `warnDaemonState` may
 * write again.
 */
export function _resetDaemonWarningLatch(): void {
  alreadyWarned = false;
}
|
||||||
Reference in New Issue
Block a user