feat(cli,broker): stable session identity — fix ghost peers + lost DMs (1.35.0)
Session identity is now anchored on Claude Code's session UUID instead of a fresh random keypair per launch. The ed25519 session keypair is generated once per (mesh, session UUID) and persisted under ~/.claudemesh/sessions/<mesh>/<uuid>.json, so relaunching or --resume-ing the same session reuses the same sessionPubkey. Why: a DM is sealed (crypto_box) to the recipient's sessionPubkey. With ephemeral per-launch keys, the pubkey rotated on every relaunch, so queued messages became undecryptable AND the old presence lingered as a same-name ghost that won queued-DM claim races. Reconnecting could not recover the peer because it minted yet another key. On --resume the CLI also registered a throwaway random id unrelated to the resumed session, so the broker never recognized the returning peer. CLI (launch.ts): - resolve the stable UUID for all paths: fresh mints + forces via --session-id; --resume V registers V; --continue resolves the most-recent session UUID from ~/.claude/projects/<cwd>. - use loadOrCreateSessionKeypair(mesh, uuid) instead of generateKeypair(). CLI (daemon/run.ts): - onRegister closes any prior SessionBrokerClient holding the same pubkey under a different token (the leaked-WS ghost). Broker (handleSessionHello): - reattach by sessionPubkey regardless of lease state (online or grace), closing the stale socket — enforces one live presence per session pubkey, killing the duplicate and draining queued DMs on return. Trade-off: session secret keys now persist on disk (the member key already does); SPEC.md updated to reflect the stable-identity model. Older CLIs remain compatible (they keep using ephemeral keys). New: keypair-store.ts + 7 unit tests. Full CLI suite: 114/114 green. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -2191,23 +2191,38 @@ async function handleSessionHello(
|
||||
// session leave.
|
||||
for (const [pid, oldConn] of connections) {
|
||||
if (oldConn.meshId !== hello.meshId) continue;
|
||||
if (oldConn.leaseState !== "offline") continue;
|
||||
if (oldConn.sessionPubkey !== hello.sessionPubkey) continue;
|
||||
|
||||
// Same sessionPubkey = same logical session. The CLI now anchors the
|
||||
// session keypair on Claude Code's session UUID and persists it, so a
|
||||
// matching pubkey is always the same peer relaunching/resuming — never
|
||||
// a coincidental collision. Reattach whether the old lease is in its
|
||||
// 90s grace window OR still nominally "online" (a duplicate/relaunch
|
||||
// that raced ahead of the old socket's close). The new WS is
|
||||
// authoritative: cancel any eviction timer, close the stale socket if
|
||||
// it differs, swap in the new WS, restore online. This is the "one
|
||||
// presence per session pubkey" invariant — it kills the same-name
|
||||
// ghost that used to win queued-DM claim races.
|
||||
const wasState = oldConn.leaseState;
|
||||
if (oldConn.evictionTimer) {
|
||||
clearTimeout(oldConn.evictionTimer);
|
||||
oldConn.evictionTimer = null;
|
||||
}
|
||||
if (oldConn.ws !== ws) {
|
||||
try { oldConn.ws.close(1000, "session_replaced"); } catch { /* already dead */ }
|
||||
}
|
||||
oldConn.ws = ws;
|
||||
oldConn.leaseState = "online";
|
||||
oldConn.leaseUntil = 0;
|
||||
oldConn.lastPongAt = Date.now();
|
||||
// Refresh mutable fields from the new hello.
|
||||
oldConn.sessionId = hello.sessionId;
|
||||
oldConn.cwd = hello.cwd;
|
||||
if (hello.displayName) oldConn.displayName = hello.displayName;
|
||||
log.info("session_hello reattach (lease)", {
|
||||
log.info("session_hello reattach", {
|
||||
presence_id: pid,
|
||||
session_pubkey: hello.sessionPubkey.slice(0, 12),
|
||||
was: wasState,
|
||||
});
|
||||
void restorePresence(pid);
|
||||
void maybePushQueuedMessages(pid);
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "claudemesh-cli",
|
||||
"version": "1.34.18",
|
||||
"version": "1.35.0",
|
||||
"description": "Peer mesh for Claude Code sessions — CLI + MCP server.",
|
||||
"keywords": [
|
||||
"claude-code",
|
||||
|
||||
@@ -42,6 +42,37 @@ export interface LaunchFlags {
|
||||
quiet?: boolean;
|
||||
}
|
||||
|
||||
/**
 * Resolve the most-recently-active Claude Code session UUID for a cwd by
 * inspecting `~/.claude/projects/<encoded-cwd>/<uuid>.jsonl` and picking
 * the transcript with the newest mtime.
 *
 * Project-dir encoding. The original lookup assumed the absolute path
 * with every `/` → `-`. NOTE(review): Claude Code appears to replace
 * every non-alphanumeric character with `-` (so `.`, `_`, spaces and
 * Windows `\` / `:` are rewritten too) — confirm against the installed
 * Claude Code version. To stay correct under either rule, both encodings
 * are probed and the newest transcript across them wins.
 *
 * Used by `--continue` (which otherwise gives us no UUID to anchor on) so
 * a continued session re-attaches to the same claudemesh peer it last
 * represented.
 *
 * @param cwd Absolute working directory the session was started in.
 * @returns The UUID (file stem) of the newest `<uuid>.jsonl`, or
 *   `undefined` when `cwd` is empty or no candidate project dir holds a
 *   readable transcript — the caller then falls back to an ephemeral
 *   identity. Never throws.
 */
function resolveLatestSessionUuid(cwd: string): string | undefined {
  // An empty cwd would encode to "" and make us scan the projects root itself.
  if (!cwd) return undefined;

  const uuidRe = /^([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})\.jsonl$/i;
  // Set dedupes the common case where both encodings yield the same slug.
  const slugs = new Set<string>([
    cwd.replace(/\//g, "-"),
    cwd.replace(/[^a-zA-Z0-9]/g, "-"),
  ]);

  let newest: { id: string; mtime: number } | null = null;
  for (const slug of slugs) {
    try {
      const dir = join(homedir(), ".claude", "projects", slug);
      if (!existsSync(dir)) continue;
      for (const entry of readdirSync(dir)) {
        const id = uuidRe.exec(entry)?.[1];
        if (!id) continue;
        try {
          const mtime = statSync(join(dir, entry)).mtimeMs;
          if (!newest || mtime > newest.mtime) newest = { id, mtime };
        } catch { /* file vanished mid-scan — skip */ }
      }
    } catch {
      // Unreadable candidate dir — ignore it; another candidate may still resolve.
    }
  }
  return newest?.id;
}
|
||||
|
||||
// --- Interactive mesh picker ---
|
||||
|
||||
/**
|
||||
@@ -754,8 +785,28 @@ export async function runLaunch(flags: LaunchFlags, rawArgs: string[]): Promise<
|
||||
// the TDZ → ReferenceError swallowed by the surrounding catch.
|
||||
// The IPC registration has been silently failing every launch
|
||||
// since 1.29.0. Hoist the declaration up so it actually runs.
|
||||
// Session identity is anchored on Claude Code's session UUID — the
|
||||
// stable thing `--resume` is built on — so the same logical peer keeps
|
||||
// one identity (and one persisted keypair) across relaunches:
|
||||
// - fresh launch: mint a UUID and force it on claude via --session-id.
|
||||
// - --resume V: register V (the returning peer), let claude resume it.
|
||||
// - --continue: resolve the most-recent session UUID in this cwd so
|
||||
// we re-attach to the same peer instead of minting a
|
||||
// throwaway id (the bug that orphaned queued DMs and
|
||||
// spawned same-name ghosts on every relaunch).
|
||||
const isResume = args.resume !== null || args.continueSession;
|
||||
const claudeSessionId = isResume ? undefined : randomUUID();
|
||||
let claudeSessionId: string | undefined;
|
||||
if (args.resume) {
|
||||
claudeSessionId = args.resume;
|
||||
} else if (args.continueSession) {
|
||||
claudeSessionId = resolveLatestSessionUuid(process.cwd());
|
||||
} else {
|
||||
claudeSessionId = randomUUID();
|
||||
}
|
||||
// Only fresh launches may dictate the UUID via --session-id; --resume
|
||||
// and --continue carry their own session selection and claude rejects
|
||||
// --session-id alongside them.
|
||||
const passSessionIdFlag = !isResume;
|
||||
let sessionTokenFilePath: string | null = null;
|
||||
let sessionTokenForCleanup: string | null = null;
|
||||
try {
|
||||
@@ -780,7 +831,13 @@ export async function runLaunch(flags: LaunchFlags, rawArgs: string[]): Promise<
|
||||
try {
|
||||
const { generateKeypair } = await import("~/services/crypto/facade.js");
|
||||
const { signParentAttestation } = await import("~/services/broker/session-hello-sig.js");
|
||||
const sessionKp = await generateKeypair();
|
||||
// Persisted, UUID-anchored keypair so relaunch/--resume reuse the
|
||||
// same sessionPubkey (queued DMs route AND decrypt). Falls back to
|
||||
// an ephemeral keypair when we couldn't resolve a stable UUID
|
||||
// (e.g. --continue with no prior session in this cwd).
|
||||
const sessionKp = claudeSessionId
|
||||
? await (await import("~/services/session/keypair-store.js")).loadOrCreateSessionKeypair(mesh.slug, claudeSessionId)
|
||||
: await generateKeypair();
|
||||
const att = await signParentAttestation({
|
||||
parentMemberPubkey: mesh.pubkey,
|
||||
parentSecretKey: mesh.secretKey,
|
||||
@@ -917,7 +974,7 @@ export async function runLaunch(flags: LaunchFlags, rawArgs: string[]): Promise<
|
||||
const claudeArgs = [
|
||||
"--dangerously-load-development-channels",
|
||||
"server:claudemesh",
|
||||
...(claudeSessionId ? ["--session-id", claudeSessionId] : []),
|
||||
...(passSessionIdFlag && claudeSessionId ? ["--session-id", claudeSessionId] : []),
|
||||
...(args.resume ? ["--resume", args.resume] : []),
|
||||
...(args.continueSession ? ["--continue"] : []),
|
||||
...(args.skipPermConfirm ? ["--dangerously-skip-permissions"] : []),
|
||||
|
||||
@@ -230,6 +230,20 @@ export async function runDaemon(opts: RunDaemonOptions = {}): Promise<number> {
|
||||
}
|
||||
prior.close().catch(() => { /* ignore */ });
|
||||
}
|
||||
// Also drop any stale WS holding this session pubkey under a
|
||||
// DIFFERENT token. With UUID-anchored persistent keypairs a relaunch
|
||||
// reuses the pubkey, so without this the old SessionBrokerClient
|
||||
// would linger connected (the broker then sees two presences for one
|
||||
// pubkey — the same-name ghost that stole queued DMs). Dedup by
|
||||
// pubkey closes it before the new WS opens.
|
||||
const priorByPubkey = sessionBrokersByPubkey.get(info.presence.sessionPubkey);
|
||||
if (priorByPubkey && priorByPubkey !== prior) {
|
||||
for (const [tok, c] of sessionBrokers) {
|
||||
if (c === priorByPubkey) { sessionBrokers.delete(tok); break; }
|
||||
}
|
||||
sessionBrokersByPubkey.delete(info.presence.sessionPubkey);
|
||||
priorByPubkey.close().catch(() => { /* ignore */ });
|
||||
}
|
||||
// 1.32.1 — wire push delivery. Messages targeted at the launched
|
||||
// session's pubkey land on THIS WS, not on the member-keyed one,
|
||||
// so without this forward they'd silently disappear (the bug that
|
||||
|
||||
147
apps/cli/src/services/session/keypair-store.ts
Normal file
147
apps/cli/src/services/session/keypair-store.ts
Normal file
@@ -0,0 +1,147 @@
|
||||
/**
|
||||
* Persistent per-session ed25519 keypairs, keyed by Claude Code's
|
||||
* session UUID.
|
||||
*
|
||||
* Background. Until this module landed, `claudemesh launch` minted a
|
||||
* FRESH ephemeral session keypair on every invocation (see
|
||||
* SPEC.md §"Session identity"). That made a peer's routing/crypto
|
||||
* identity unstable across relaunch and `--resume`: a DM is sealed to
|
||||
* the recipient's `sessionPubkey` (crypto_box; see services/crypto/box.ts),
|
||||
* so when the key rotated, any message queued for the old pubkey became
|
||||
* undecryptable AND the old presence lingered as a ghost on the broker.
|
||||
*
|
||||
* The fix anchors session identity on the stable thing Claude Code
|
||||
* itself uses for resume: the session UUID (scoped to the project/cwd).
|
||||
* The keypair for a given (mesh, sessionUuid) is generated once and
|
||||
* persisted, so:
|
||||
* - relaunching / `--resume`-ing the same session reuses the SAME
|
||||
* pubkey → the broker reattaches the existing presence and queued
|
||||
* DMs both route AND decrypt;
|
||||
* - a genuinely new session (fresh UUID) gets a fresh keypair → it is
|
||||
* correctly a distinct peer.
|
||||
*
|
||||
* Storage. `~/.claudemesh/sessions/<meshSlug>/<sessionUuid>.json`, the
|
||||
* file mode 0o600 inside a 0o700 dir — same secret-hygiene as the IPC
|
||||
* token store. The secret key lives on disk (like the member key
|
||||
* already does in the mesh config); the threat-model delta over the old
|
||||
* ephemeral scheme is small and was an accepted trade for reliable
|
||||
* delivery. `CLAUDEMESH_SESSIONS_DIR` overrides the root for tests.
|
||||
*/
|
||||
|
||||
import { randomBytes } from "node:crypto";
|
||||
import { existsSync, mkdirSync, readFileSync, renameSync, rmSync, writeFileSync } from "node:fs";
|
||||
import { homedir } from "node:os";
|
||||
import { join } from "node:path";
|
||||
|
||||
import { generateKeypair, type Ed25519Keypair } from "~/services/crypto/facade.js";
|
||||
|
||||
// Canonical 8-4-4-4-12 hex UUID, matched case-insensitively.
const UUID_RE = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;

// Mesh slug: letters, digits, dot, underscore and hyphen only (no path
// separators). NOTE: on its own this pattern also matches "." and "..",
// so it is not a complete path-safety check for a directory segment.
const SLUG_RE = /^[a-z0-9._-]+$/i;

/** JSON shape of one persisted session keypair file. */
type StoredKeypair = {
  /** Schema version of the on-disk record. */
  version: 1;
  /** Mesh slug the keypair was created for. */
  meshSlug: string;
  /** Session UUID the keypair was created for. */
  sessionId: string;
  /** Public key as a string (readers in this file expect 64 lowercase hex chars). */
  publicKey: string;
  /** Secret key as a string (readers in this file expect 128 lowercase hex chars). */
  secretKey: string;
  /** ISO-8601 timestamp written at creation time. */
  createdAt: string;
};
|
||||
|
||||
/**
 * Root directory under which session keypairs are persisted.
 *
 * A non-empty `CLAUDEMESH_SESSIONS_DIR` environment variable wins;
 * otherwise the location is `<home>/.claudemesh/sessions`. The per-launch
 * `CLAUDEMESH_CONFIG_DIR` is deliberately not consulted here.
 *
 * @returns The sessions root path (not created by this function).
 */
export function sessionsDir(): string {
  const override = process.env.CLAUDEMESH_SESSIONS_DIR;
  // Truthiness check on purpose: an empty override falls through to the default.
  if (override) return override;
  return join(homedir(), ".claudemesh", "sessions");
}
|
||||
|
||||
/**
 * Build the path of the keypair file for one (mesh, session) pair:
 * `<sessionsDir>/<meshSlug>/<sessionId>.json`. Performs no validation —
 * callers are expected to have vetted both identifiers.
 */
function keyFilePath(meshSlug: string, sessionId: string): string {
  const fileName = `${sessionId}.json`;
  const meshDir = join(sessionsDir(), meshSlug);
  return join(meshDir, fileName);
}
|
||||
|
||||
/**
 * Load a keypair from `file` if, and only if, it holds a well-formed one.
 *
 * "Well-formed" means the file parses as JSON and carries a `publicKey`
 * of exactly 64 lowercase hex chars and a `secretKey` of exactly 128
 * lowercase hex chars. Other fields in the file are ignored.
 *
 * @param file Path of the keypair JSON file.
 * @returns `{ publicKey, secretKey }`, or `null` when the file is
 *   missing, unreadable, not valid JSON, or fails the key-format checks.
 *   Never throws.
 */
function readValidKeypair(file: string): Ed25519Keypair | null {
  if (!existsSync(file)) return null;

  let record: Partial<StoredKeypair> | null;
  try {
    record = JSON.parse(readFileSync(file, "utf8")) as Partial<StoredKeypair> | null;
  } catch {
    // Unreadable / corrupt — caller treats as absent and rewrites.
    return null;
  }
  // JSON `null`, `0`, `""` and `false` are valid JSON but not a record.
  if (!record) return null;

  const { publicKey, secretKey } = record;
  const publicOk = typeof publicKey === "string" && /^[0-9a-f]{64}$/.test(publicKey);
  const secretOk = typeof secretKey === "string" && /^[0-9a-f]{128}$/.test(secretKey);
  if (!publicOk || !secretOk) return null;

  return { publicKey: publicKey as string, secretKey: secretKey as string };
}
|
||||
|
||||
/**
 * Return the persisted keypair for (meshSlug, sessionId), creating and
 * writing one on first use. The file is re-read from disk on every call
 * (nothing is cached in memory), so a launch that starts after another
 * launch has persisted a key picks up that same key.
 *
 * Falls back to an in-memory ephemeral keypair (the legacy behaviour)
 * when the identifiers are unusable or disk I/O fails — a launch must
 * never be blocked by a keystore problem.
 *
 * NOTE(review): the "did someone else already write it?" re-check
 * followed by `renameSync` is not a lock. Two launches that both pass
 * the re-check can each rename; the last writer's key ends up on disk
 * while the other keeps its own key for that launch.
 *
 * @param meshSlug Mesh slug; used as a directory name under the sessions
 *   root. Must match `SLUG_RE` and must not be `"."` or `".."`.
 * @param sessionId Claude Code session UUID; used as the file stem.
 * @returns The persisted keypair when one exists or could be written,
 *   otherwise a freshly generated, unpersisted keypair.
 */
export async function loadOrCreateSessionKeypair(
  meshSlug: string,
  sessionId: string,
): Promise<Ed25519Keypair> {
  // Defensive validation: these compose into a filesystem path, so a
  // malformed slug/uuid must never escape the sessions dir. SLUG_RE alone
  // admits "." and "..", which path.join would resolve to the sessions
  // root itself or its parent — reject them explicitly.
  const slugIsSafe = SLUG_RE.test(meshSlug) && meshSlug !== "." && meshSlug !== "..";
  if (!slugIsSafe || !UUID_RE.test(sessionId)) {
    return generateKeypair();
  }

  const file = keyFilePath(meshSlug, sessionId);
  const existing = readValidKeypair(file);
  if (existing) return existing;

  const kp = await generateKeypair();
  // Path of the temp file while it may exist on disk; reset to null once
  // it has been renamed into place so `finally` knows nothing is left over.
  let tmp: string | null = null;
  try {
    mkdirSync(join(sessionsDir(), meshSlug), { recursive: true, mode: 0o700 });
    const stored: StoredKeypair = {
      version: 1,
      meshSlug,
      sessionId,
      publicKey: kp.publicKey,
      secretKey: kp.secretKey,
      createdAt: new Date().toISOString(),
    };
    // Write to a temp sibling then rename, so a concurrent reader never
    // sees a half-written canonical file.
    tmp = `${file}.${randomBytes(6).toString("hex")}.tmp`;
    writeFileSync(tmp, JSON.stringify(stored), { mode: 0o600 });

    // Re-check: another launch may have created the canonical file with a
    // VALID keypair while we were generating — prefer it. A corrupt or
    // invalid existing file is not a winner; overwrite it via the rename.
    const won = readValidKeypair(file);
    if (won) return won;

    renameSync(tmp, file);
    tmp = null;
  } catch {
    // mkdir/write/rename failed — best effort: the in-memory keypair is
    // still valid for this launch (degrades to ephemeral, same as legacy).
  } finally {
    // Never leave a stray temp file behind: it contains the secret key.
    if (tmp !== null) {
      try { rmSync(tmp, { force: true }); } catch { /* ignore */ }
    }
  }
  return kp;
}
|
||||
96
apps/cli/tests/unit/keypair-store.test.ts
Normal file
96
apps/cli/tests/unit/keypair-store.test.ts
Normal file
@@ -0,0 +1,96 @@
|
||||
/**
|
||||
* Persisted, UUID-anchored session keypairs (delivery-reliability fix).
|
||||
*
|
||||
* The keystore is what makes a peer's sessionPubkey stable across
|
||||
* relaunch/--resume, so queued DMs (sealed to that pubkey) both route to
|
||||
* and decrypt on the returning session. Verifies:
|
||||
* - the same (mesh, uuid) returns the SAME keypair across calls and
|
||||
* across a fresh module read (persisted to disk);
|
||||
* - distinct uuids / meshes get distinct keypairs;
|
||||
* - malformed identifiers fall back to an ephemeral keypair and never
|
||||
* escape the sessions dir;
|
||||
* - a corrupt on-disk file is transparently rewritten.
|
||||
*/
|
||||
|
||||
import { mkdtempSync, rmSync, writeFileSync, existsSync, readdirSync } from "node:fs";
|
||||
import { tmpdir } from "node:os";
|
||||
import { join } from "node:path";
|
||||
import { afterEach, beforeEach, describe, expect, test } from "vitest";
|
||||
|
||||
import {
|
||||
loadOrCreateSessionKeypair,
|
||||
sessionsDir,
|
||||
} from "../../src/services/session/keypair-store.js";
|
||||
|
||||
// Two arbitrary, well-formed session UUIDs used throughout the suite.
const UUID_A = "11111111-2222-3333-4444-555555555555";
const UUID_B = "66666666-7777-8888-9999-aaaaaaaaaaaa";

// Scratch sessions root for the currently running test.
let dir: string;

/** Point the keystore at a brand-new temp directory. */
function useTempSessionsDir(): void {
  const scratch = mkdtempSync(join(tmpdir(), "cm-keystore-"));
  dir = scratch;
  process.env.CLAUDEMESH_SESSIONS_DIR = scratch;
}

/** Drop the override and delete the scratch directory with its contents. */
function discardTempSessionsDir(): void {
  delete process.env.CLAUDEMESH_SESSIONS_DIR;
  rmSync(dir, { recursive: true, force: true });
}

beforeEach(() => {
  useTempSessionsDir();
});

afterEach(() => {
  discardTempSessionsDir();
});
|
||||
|
||||
describe("loadOrCreateSessionKeypair", () => {
  // Mesh slug shared by most cases.
  const MESH = "flexicar";
  // Expected key encodings: 64 / 128 lowercase hex characters.
  const PUBLIC_HEX = /^[0-9a-f]{64}$/;
  const SECRET_HEX = /^[0-9a-f]{128}$/;

  /** Canonical on-disk location for a (mesh, uuid) pair. */
  const keyFileFor = (mesh: string, uuid: string): string =>
    join(sessionsDir(), mesh, `${uuid}.json`);

  test("same (mesh, uuid) is stable across calls", async () => {
    const first = await loadOrCreateSessionKeypair(MESH, UUID_A);
    const second = await loadOrCreateSessionKeypair(MESH, UUID_A);

    expect(first.publicKey).toBe(second.publicKey);
    expect(first.secretKey).toBe(second.secretKey);
    expect(first.publicKey).toMatch(PUBLIC_HEX);
    expect(first.secretKey).toMatch(SECRET_HEX);
  });

  test("persists to disk under sessionsDir/<mesh>/<uuid>.json", async () => {
    await loadOrCreateSessionKeypair(MESH, UUID_A);

    expect(existsSync(keyFileFor(MESH, UUID_A))).toBe(true);
  });

  test("distinct uuids get distinct keys", async () => {
    const forA = await loadOrCreateSessionKeypair(MESH, UUID_A);
    const forB = await loadOrCreateSessionKeypair(MESH, UUID_B);

    expect(forA.publicKey).not.toBe(forB.publicKey);
  });

  test("distinct meshes get distinct keys for the same uuid", async () => {
    const inFirstMesh = await loadOrCreateSessionKeypair(MESH, UUID_A);
    const inSecondMesh = await loadOrCreateSessionKeypair("other-mesh", UUID_A);

    expect(inFirstMesh.publicKey).not.toBe(inSecondMesh.publicKey);
  });

  test("malformed uuid falls back to ephemeral, writes nothing", async () => {
    const badId = "not-a-uuid";
    const first = await loadOrCreateSessionKeypair(MESH, badId);
    const second = await loadOrCreateSessionKeypair(MESH, badId);

    expect(first.publicKey).toMatch(PUBLIC_HEX);
    // Nothing was persisted, so every call mints a new key.
    expect(first.publicKey).not.toBe(second.publicKey);
    expect(existsSync(join(dir, MESH))).toBe(false);
  });

  test("path-traversal slug is rejected (ephemeral, no escape)", async () => {
    const kp = await loadOrCreateSessionKeypair("../../etc", UUID_A);

    expect(kp.publicKey).toMatch(PUBLIC_HEX);
    // The sessions root must still be empty after a rejected slug.
    expect(readdirSync(dir)).toHaveLength(0);
  });

  test("corrupt on-disk file is rewritten and yields a valid key", async () => {
    const original = await loadOrCreateSessionKeypair(MESH, UUID_A);

    // Clobber the persisted file with something that cannot be parsed.
    writeFileSync(keyFileFor(MESH, UUID_A), "{ this is not valid json", "utf8");

    const replacement = await loadOrCreateSessionKeypair(MESH, UUID_A);
    expect(replacement.publicKey).toMatch(PUBLIC_HEX);
    // A new keypair was minted; the clobbered original is gone for good.
    expect(replacement.publicKey).not.toBe(original.publicKey);

    // And the replacement itself is now the persisted identity.
    const reread = await loadOrCreateSessionKeypair(MESH, UUID_A);
    expect(reread.publicKey).toBe(replacement.publicKey);
  });
});
|
||||
Reference in New Issue
Block a user