Merge pull request #410 from RayCharlizard/fix/408-cowork-vm-daemon-recovery

fix: cowork-vm-service daemon recovery and crash diagnostics (#408)
This commit is contained in:
Travis
2026-04-18 22:11:12 -05:00
committed by GitHub
4 changed files with 330 additions and 56 deletions

View File

@@ -9,6 +9,7 @@ This project repackages Claude Desktop (Electron app) for Debian/Ubuntu Linux, a
The [`docs/learnings/`](docs/learnings/) directory contains hard-won technical knowledge from debugging and fixing issues — things that aren't obvious from reading the code or docs alone. Consult these before working on related areas. Add new entries when you discover something non-obvious that would save future contributors (human or AI) significant time.
- [`nix.md`](docs/learnings/nix.md) — NixOS packaging, Electron resource path resolution, testing without NixOS
- [`cowork-vm-daemon.md`](docs/learnings/cowork-vm-daemon.md) — Cowork VM daemon lifecycle, respawn logic, crash diagnosis
## Code Style

View File

@@ -1271,6 +1271,15 @@ if (!code.includes('"linux":{') && !code.includes("'linux':{") &&
//
// Fix: patch the ENOENT check to also match ECONNREFUSED on Linux,
// then inject auto-launch before the retry delay.
//
// The auto-launch uses a timestamp-based cooldown (_lastSpawn) instead
// of a one-shot boolean so the daemon can be re-spawned after it dies
// mid-session (issue #408). 10s cooldown prevents fork storms on hard
// failures while allowing recovery on the next retry iteration.
//
// stdout/stderr of the forked daemon are piped to
// ~/.config/Claude/logs/cowork_vm_daemon.log so crashes are no longer
// silent. Falls back to "ignore" if the log dir can't be opened.
// ============================================================
const serviceErrorStr = 'VM service not running. The service failed to start.';
const serviceErrorIdx = code.indexOf(serviceErrorStr);
@@ -1333,17 +1342,32 @@ if (serviceErrorIdx !== -1) {
while ((funcMatch = funcNameRe.exec(funcRegion)) !== null) {
retryFuncName = funcMatch[1];
}
const spawnGuard = retryFuncName
? retryFuncName + '._lastSpawn'
: '_globalLastSpawn';
// Cooldown in ms — long enough to avoid fork storms, short enough
// that the retry loop can re-spawn after a mid-session daemon death.
const autoLaunch =
'process.platform==="linux"&&' +
'(!' + spawnGuard + '||Date.now()-' + spawnGuard + '>1e4)' +
'&&(' + spawnGuard + '=Date.now(),' +
'(()=>{try{' +
'const _p=require("path"),_fs=require("fs");' +
'const _d=_p.join(process.resourcesPath,' +
'"app.asar.unpacked","' + svcPath + '");' +
'if(_fs.existsSync(_d)){' +
// Open daemon log for append; fall back to ignoring stdio.
'let _stdio="ignore";' +
'try{' +
'const _ld=_p.join(process.env.HOME||"/tmp",' +
'".config/Claude/logs");' +
'_fs.mkdirSync(_ld,{recursive:true});' +
'const _fd=_fs.openSync(' +
'_p.join(_ld,"cowork_vm_daemon.log"),"a");' +
'_stdio=["ignore",_fd,_fd,"ipc"]' +
'}catch(_){}' +
'const _c=require("child_process").fork(_d,[],' +
'{detached:true,stdio:_stdio,env:{...process.env,' +
'ELECTRON_RUN_AS_NODE:"1"}});' +
'global.__coworkDaemonPid=_c.pid;_c.unref()}' +
'}catch(_e){console.error("[cowork-autolaunch]",_e)}})()),';
@@ -1358,6 +1382,47 @@ if (serviceErrorIdx !== -1) {
console.log(' WARNING: Could not find VM service error string for auto-launch');
}
// ============================================================
// Patch 6b: Extend auto-reinstall delete list (issue #408)
// Anchor: const NAME=["rootfs.img",...] — the module-level array
// driving the reinstall-files cleanup in _ue()/deleteVMBundle().
//
// Upstream preserves sessiondata.img and rootfs.img.zst across
// auto-reinstall to avoid re-download. On 1.2773.0, preserving
// them puts the daemon into an unstartable state that persists
// across app restarts and OS reboots. Trade-off: next startup
// re-downloads/re-extracts these files. This only runs on the
// auto-reinstall path (already in a failed state), so biasing
// toward recovery over re-download avoidance is correct.
// ============================================================
{
const reinstallArrRe = /const (\w+)=\[("rootfs\.img"[^\]]*)\];/;
const arrMatch = code.match(reinstallArrRe);
if (arrMatch) {
const [whole, name, contents] = arrMatch;
const additions = [];
if (!contents.includes('"sessiondata.img"')) {
additions.push('"sessiondata.img"');
}
if (!contents.includes('"rootfs.img.zst"')) {
additions.push('"rootfs.img.zst"');
}
if (additions.length) {
const newContents = contents + ',' + additions.join(',');
code = code.replace(
whole,
'const ' + name + '=[' + newContents + '];'
);
console.log(' Added VM images to reinstall delete list');
patchCount++;
} else {
console.log(' Reinstall delete list already includes VM images');
}
} else {
console.log(' WARNING: Could not find reinstall file list array');
}
}
// ============================================================
// Patch 7: Skip Windows-specific smol-bin.vhdx copy on Linux
// The code already checks: if(process.platform==="win32")

View File

@@ -0,0 +1,174 @@
# Cowork VM Daemon — Learnings
## Architecture Overview
Cowork mode on Linux uses a custom Node.js daemon
([`scripts/cowork-vm-service.js`](../../scripts/cowork-vm-service.js))
that replaces the Windows cowork-vm-service. The Electron app talks to
it over a Unix domain socket at
`$XDG_RUNTIME_DIR/cowork-vm-service.sock` using length-prefixed JSON —
the same wire format as the Windows named pipe.
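For orientation, a minimal client sketch of that framing. The 4-byte
little-endian length header and the `{ type: 'ping' }` message shape are
assumptions for illustration, not the daemon's documented protocol; see
the daemon source for the real framing.
```javascript
// Hypothetical length-prefixed JSON client (header layout is assumed).
const net = require('net');
const path = require('path');

const sock = path.join(
  process.env.XDG_RUNTIME_DIR || '/tmp',
  'cowork-vm-service.sock'
);

function sendMessage(socket, obj) {
  const body = Buffer.from(JSON.stringify(obj), 'utf8');
  const header = Buffer.alloc(4);
  header.writeUInt32LE(body.length, 0); // assumed 4-byte LE length prefix
  socket.write(Buffer.concat([header, body]));
}

const client = net.createConnection(sock, () => {
  sendMessage(client, { type: 'ping' }); // message shape is hypothetical
});
client.on('error', (err) => {
  // ENOENT: no socket file; ECONNREFUSED: stale socket with no listener.
  console.error('connect failed:', err.code);
});
```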
The daemon is forked by **Patch 6** in `build.sh`'s
`patch_cowork_linux()` function, which injects auto-launch code into
the Electron app's retry loop for the VM-service connection.
## Daemon Lifecycle
1. First connect attempt: the app tries `$XDG_RUNTIME_DIR/cowork-vm-service.sock`.
2. `ENOENT` / `ECONNREFUSED`: retry loop catches the error (the
`ECONNREFUSED` branch is Linux-only, added by Patch 6 step 1 so
stale sockets don't bypass retry).
3. Auto-launch (Patch 6 step 2): the injected code forks the daemon
via `child_process.fork()` with `detached:true`, stdio redirected
to `~/.config/Claude/logs/cowork_vm_daemon.log`.
4. Spawn cooldown: `FUNC._lastSpawn = Date.now()` — subsequent
iterations only re-fork after 10 s have elapsed. This replaces the
old one-shot `_svcLaunched` boolean so the retry loop can recover
after mid-session daemon death (issue #408); the full injected
expression is sketched after this list.
5. Retry: the loop waits and reconnects, which now succeeds.
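De-minified, the expression Patch 6 injects (steps 3–4 above) is roughly
the following. `FUNC` stands for the retry function located by the
patcher and `svcPath` for the daemon's path under `app.asar.unpacked`;
both exist only in the surrounding app code.
```javascript
// Readable equivalent of the injected auto-launch expression.
// FUNC and svcPath are placeholders for context-supplied values.
if (process.platform === 'linux' &&
    (!FUNC._lastSpawn || Date.now() - FUNC._lastSpawn > 1e4)) {
  FUNC._lastSpawn = Date.now(); // start the 10 s cooldown window
  try {
    const path = require('path');
    const fs = require('fs');
    const daemon = path.join(
      process.resourcesPath, 'app.asar.unpacked', svcPath);
    if (fs.existsSync(daemon)) {
      // Pipe stdout/stderr to the daemon log; fall back to "ignore".
      let stdio = 'ignore';
      try {
        const logDir = path.join(
          process.env.HOME || '/tmp', '.config/Claude/logs');
        fs.mkdirSync(logDir, { recursive: true });
        const fd = fs.openSync(
          path.join(logDir, 'cowork_vm_daemon.log'), 'a');
        stdio = ['ignore', fd, fd, 'ipc']; // fork() requires the ipc slot
      } catch (_) {}
      const child = require('child_process').fork(daemon, [], {
        detached: true,
        stdio,
        env: { ...process.env, ELECTRON_RUN_AS_NODE: '1' },
      });
      global.__coworkDaemonPid = child.pid;
      child.unref(); // don't keep the app's event loop alive
    }
  } catch (e) {
    console.error('[cowork-autolaunch]', e);
  }
}
```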
## Issue #408 — Daemon Recovery
### Root cause (one-shot guard)
Before the fix, Patch 6 injected:
```javascript
process.platform==="linux" && !FUNC._svcLaunched && (
FUNC._svcLaunched = true,
/* fork daemon */
)
```
`FUNC._svcLaunched` was set on the first spawn attempt and never
cleared, so when the daemon died mid-session the retry loop saw the
guard already set and skipped the re-fork. The client looped forever
on `connect ENOENT`.
### Fix (rate-limited respawn)
Timestamp-based cooldown replaces the boolean:
```javascript
process.platform==="linux" &&
(!FUNC._lastSpawn || Date.now() - FUNC._lastSpawn > 1e4) &&
(FUNC._lastSpawn = Date.now(), /* fork daemon */)
```
10 s is short enough that the retry loop (which sleeps on the order of
seconds between iterations) recovers promptly after a crash, and long
enough that a crash-looping daemon can't turn into a fork bomb.
### Secondary cause (preserved images block recovery)
The app's `_ue()` / `deleteVMBundle()` function deletes a whitelist of
reinstall files on auto-reinstall. Upstream deliberately preserves
`sessiondata.img` and `rootfs.img.zst` to avoid re-download.
On 1.2773.0 those preserved files put the daemon into an unstartable
state that persists across app restart and OS reboot. The client's
symptom is `connect ENOENT` (daemon never got far enough to create the
socket) rather than `ECONNREFUSED` (daemon started, crashed, socket
stayed). RayCharlizard (2026-04-16) confirmed that manually wiping
`~/.config/Claude/vm_bundles/claudevm.bundle/` is required to recover,
even after rolling back the AppImage to a known-good version.
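A quick shell check to tell the two symptoms apart, using the same paths
as the diagnostic commands later in this doc:
```bash
SOCK="${XDG_RUNTIME_DIR:-/tmp}/cowork-vm-service.sock"
if [ ! -S "$SOCK" ]; then
  echo "no socket -> client sees connect ENOENT (daemon never created it)"
elif ! pgrep -f cowork-vm-service >/dev/null; then
  echo "stale socket, no daemon -> client sees ECONNREFUSED"
else
  echo "socket and daemon both present -> look elsewhere"
fi
```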
### Fix (extend delete list — Patch 6b)
`build.sh` now matches the `const NAME=["rootfs.img",...]` array at
module level and appends `"sessiondata.img"` and `"rootfs.img.zst"` if
they're not already present. The auto-reinstall path now wipes these
too. Trade-off: the next successful startup re-downloads/re-extracts
these files. Acceptable because auto-reinstall only runs after startup
has already failed — biasing toward recovery over re-download
avoidance is correct.
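A self-contained demo of the Patch 6b rewrite against a sample minified
line. The array name `Eo` and the `"rootfs.img.manifest"` entry are
made-up placeholders; at build time the real array is located by the
regex in the actual bundle.
```javascript
// Same regex and replace logic as Patch 6b, run against fake input.
const reinstallArrRe = /const (\w+)=\[("rootfs\.img"[^\]]*)\];/;
let code = 'const Eo=["rootfs.img","rootfs.img.manifest"];';
const m = code.match(reinstallArrRe);
if (m) {
  const [whole, name, contents] = m;
  const additions = ['"sessiondata.img"', '"rootfs.img.zst"']
    .filter((f) => !contents.includes(f)); // idempotent re-run safety
  if (additions.length) {
    code = code.replace(
      whole, `const ${name}=[${contents},${additions.join(',')}];`);
  }
}
console.log(code);
// const Eo=["rootfs.img","rootfs.img.manifest","sessiondata.img","rootfs.img.zst"];
```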
Not included in the delete list: `~/.config/Claude/claude-code-vm/`.
That's CLI-binary storage (`2.1.x/claude`), unrelated to the VM
daemon, and has its own version-check logic at `this.vmStorageDir`
inside the app. Wiping it would just force a slow re-download of the
CLI on every auto-reinstall.
## Silent Death — Now Logged
Before the fix the daemon was forked with `stdio:"ignore"`, and its
internal `log()` function was gated by `COWORK_VM_DEBUG=1`, so a crash
left no trace anywhere.
Two changes together make crashes visible:
1. **Patch 6 (client side)** redirects the forked daemon's stdout +
stderr to `~/.config/Claude/logs/cowork_vm_daemon.log`. Any
Node-level crash dump (uncaught exception pre-handler, native
assertion, etc.) now lands in that file.
2. **`cowork-vm-service.js` (daemon side)** adds `logLifecycle()`, an
always-on writer that bypasses the `DEBUG` gate for startup, SIGTERM,
SIGINT, `uncaughtException`, `unhandledRejection`, and `exit`
events. It also proactively `mkdirSync`'s the log directory so the
first write doesn't get swallowed if the daemon is the first thing
writing under `~/.config/Claude/logs/`.
Interpreting the log after a failure:
| Last line | Diagnosis |
|-----------|-----------|
| `lifecycle startup ...` + gap + no further entries | SIGKILL'd (OOM killer, `kill -9`, etc.) — no handler fires |
| `lifecycle startup` + `lifecycle listening` + nothing else | Daemon came up normally, then died by a signal with no handler (rare; check `dmesg`) |
| `lifecycle uncaughtException ...` | JS-level crash, stack is in the log entry |
| `lifecycle SIGTERM received` + `lifecycle exit code=0` | Clean app-initiated shutdown |
| No `startup` entry at all | `fork()` didn't complete; check launcher.log for `[cowork-autolaunch]` errors |
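For reference, what a clean, app-initiated shutdown might look like in
the log. Timestamps, pid, and socket path are invented, and `<prefix>`
stands in for the daemon's `LOG_PREFIX`; only the line shape follows
`writeLog()`.
```
2026-04-18T03:11:02.481Z [lifecycle] <prefix> startup pid=52113 sock=/run/user/1000/cowork-vm-service.sock
2026-04-18T03:11:02.513Z [lifecycle] <prefix> listening /run/user/1000/cowork-vm-service.sock
2026-04-18T03:42:19.902Z [lifecycle] <prefix> SIGTERM received
2026-04-18T03:42:19.944Z [lifecycle] <prefix> exit code=0
```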
## Key Files
- [`build.sh`](../../build.sh) lines ~1274-1390 — Patch 6 (auto-launch +
stdio pipe + rate limiter) and Patch 6b (reinstall array extension).
- [`scripts/cowork-vm-service.js`](../../scripts/cowork-vm-service.js)
lines ~49-86 — log infrastructure, including `logLifecycle()`.
- [`scripts/cowork-vm-service.js`](../../scripts/cowork-vm-service.js)
lines ~2399-2440 — signal handlers and entry point.
- [`scripts/launcher-common.sh`](../../scripts/launcher-common.sh) — `--doctor` checks.
- [`docs/cowork-linux-handover.md`](../cowork-linux-handover.md) — architecture reference.
## Diagnostic Commands
```bash
# Is the daemon running?
pgrep -af cowork-vm-service
# Socket present?
ls -la "${XDG_RUNTIME_DIR:-/tmp}/cowork-vm-service.sock"
# Watch lifecycle events as they happen
tail -f ~/.config/Claude/logs/cowork_vm_daemon.log
# Look for the last startup / exit pair
grep -E 'lifecycle (startup|exit|SIGTERM|SIGINT|uncaughtException|unhandledRejection)' \
~/.config/Claude/logs/cowork_vm_daemon.log | tail -20
# Find any orphan sockets
lsof -U 2>/dev/null | grep -iE 'cowork|claude'
# Force a respawn test: kill daemon, watch client log for reconnect
pkill -9 -f cowork-vm-service.js
tail -f ~/.cache/claude-desktop-debian/launcher.log
# Find the daemon script inside a mounted AppImage
find /tmp -path '*claude*cowork-vm-service*' 2>/dev/null
```
## Testing Notes
- **Host-direct** (`COWORK_VM_BACKEND=host`): no isolation, direct
execution. Matches the `--doctor` "host-direct (no isolation, via
override)" line. This is what issue #408 was reported against.
- **Bwrap** (`COWORK_VM_BACKEND=bwrap`): Bubblewrap sandbox; requires
`bwrap` installed.
- **KVM** (`COWORK_VM_BACKEND=kvm`): full VM; requires QEMU, KVM,
rootfs image.
- **Debug** (`COWORK_VM_DEBUG=1` or `CLAUDE_LINUX_DEBUG=1`): verbose
logging via the existing `log()` path. `logLifecycle()` is always
on regardless of this flag.
- **Force-cooldown test**: kill the daemon, relaunch a Cowork session
within 10 s — the guard should block that single retry. Wait 10 s
and retry: should succeed. Confirms the cooldown boundary; see the
sketch below.
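A sketch of that test. It assumes a Cowork session is open so the app's
retry loop is actually running, and note the cooldown is measured from
the last fork (`_lastSpawn`), not from the kill, so run it shortly after
the daemon was (re)spawned.
```bash
pkill -9 -f cowork-vm-service.js             # simulate mid-session death
sleep 3                                       # still inside the 10 s window
pgrep -af cowork-vm-service \
  || echo "still down — cooldown active (expected)"
sleep 12                                      # past the cooldown boundary
pgrep -af cowork-vm-service \
  && echo "respawned (expected)" \
  || echo "no respawn — check the daemon log and launcher.log"
```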

View File

@@ -57,6 +57,15 @@ function formatArgs(args) {
.join(' ');
}
// Ensure the log directory exists once at startup so writeLog() isn't
// silently discarded when the daemon is the first thing writing under
// ~/.config/Claude/logs/ (issue #408 — crashes otherwise leave no trace).
try {
fs.mkdirSync(path.dirname(LOG_FILE), { recursive: true });
} catch (_) {
// Best-effort — writeLog() still catches any later write errors.
}
function writeLog(level, args) {
const ts = new Date().toISOString();
const msg = `${ts} [${level}] ${LOG_PREFIX} ${formatArgs(args)}\n`;
@@ -67,6 +76,15 @@ function writeLog(level, args) {
}
}
// Always-on lifecycle logger for startup/shutdown/crash events so the
// death of the daemon is never silent regardless of COWORK_VM_DEBUG.
function logLifecycle(event, detail) {
const stack = detail && detail.stack
? detail.stack
: (detail !== undefined ? String(detail) : '');
writeLog('lifecycle', stack ? [event, stack] : [event]);
}
function log(...args) {
if (!DEBUG) return;
writeLog('debug', args);
@@ -176,24 +194,24 @@ function translateGuestPath(guestPath, mountMap) {
log(`translateGuestPath: ${guestPath} -> ${normalized}`);
return normalized;
}
/**
* Resolve a subpath that may be root-relative (e.g. "home/user/.config/...")
* or home-relative (e.g. ".config/..."). app.asar generates root-relative
* subpaths via path.relative('/', absolutePath), so path.join('/', subpath)
* recovers the original absolute path. Falls back to home-relative for
* legacy or genuinely relative subpaths.
*
* Fix for https://github.com/aaddrick/claude-desktop-debian/issues/373
*/
function resolveSubpath(subpath) {
if (!subpath) return os.homedir();
const asRoot = path.resolve(path.join('/', subpath));
if (asRoot.startsWith(os.homedir() + path.sep) || asRoot === os.homedir()) {
return asRoot;
}
return path.resolve(path.join(os.homedir(), subpath));
}
/**
* Build a mount-name -> host-path mapping from mountBinds (prior
@@ -213,7 +231,7 @@ function buildMountMap(additionalMounts, mountBinds) {
const homeDir = os.homedir();
for (const [name, info] of Object.entries(additionalMounts)) {
if (!info || !info.path) continue;
const resolved = resolveSubpath(info.path);
if (resolved !== homeDir &&
!resolved.startsWith(homeDir + path.sep)) {
log(`buildMountMap: rejecting "${name}" — resolves outside home: ${resolved}`);
@@ -257,31 +275,31 @@ function buildSpawnEnv(appEnv, mountMap) {
// Translate CLAUDE_CONFIG_DIR from guest path to host path, or
// remove it so Claude Code falls back to ~/.claude/.
if (mergedEnv.CLAUDE_CONFIG_DIR) {
if (mergedEnv.CLAUDE_CONFIG_DIR.startsWith('/sessions/')) {
// translate guest path to host path
const translated = translateGuestPath(
mergedEnv.CLAUDE_CONFIG_DIR, mountMap
);
if (translated !== mergedEnv.CLAUDE_CONFIG_DIR) {
log(`buildSpawnEnv: translated CLAUDE_CONFIG_DIR: ${mergedEnv.CLAUDE_CONFIG_DIR} -> ${translated}`);
mergedEnv.CLAUDE_CONFIG_DIR = translated;
}
} else {
// Host path — may be doubled by app.asar's own
// path.join(homedir, rootRelativeSubpath). Extract the
// relative part and resolve it properly.
const homeDir = os.homedir();
if (mergedEnv.CLAUDE_CONFIG_DIR.startsWith(homeDir + path.sep)) {
const relative = mergedEnv.CLAUDE_CONFIG_DIR.slice(homeDir.length + 1);
const fixed = resolveSubpath(relative);
if (fixed !== mergedEnv.CLAUDE_CONFIG_DIR) {
log(`buildSpawnEnv: fixed doubled CLAUDE_CONFIG_DIR: ${mergedEnv.CLAUDE_CONFIG_DIR} -> ${fixed}`);
mergedEnv.CLAUDE_CONFIG_DIR = fixed;
}
}
}
}
// Translate CLAUDE_COWORK_MEMORY_PATH_OVERRIDE from guest path to
@@ -2471,6 +2489,7 @@ function startServer() {
}
log(`Listening on ${SOCKET_PATH}`);
console.log(`${LOG_PREFIX} Service started on ${SOCKET_PATH}`);
logLifecycle('listening', SOCKET_PATH);
});
// Graceful shutdown
@@ -2483,12 +2502,26 @@ function startServer() {
});
};
process.on('SIGTERM', () => {
logLifecycle('SIGTERM received');
shutdown();
});
process.on('SIGINT', () => {
logLifecycle('SIGINT received');
shutdown();
});
process.on('uncaughtException', (err) => {
logLifecycle('uncaughtException', err);
logError('Uncaught exception:', err);
shutdown();
});
process.on('unhandledRejection', (reason) => {
logLifecycle('unhandledRejection', reason);
logError('Unhandled rejection:', reason);
});
process.on('exit', (code) => {
logLifecycle('exit', `code=${code}`);
});
}
// ============================================================
@@ -2496,10 +2529,11 @@ function startServer() {
// ============================================================
// Always clean up stale socket and start. The app's retry wrapper has a
// 10s spawn cooldown (_lastSpawn) preventing duplicate daemon launches,
// so a simple synchronous cleanup avoids the race condition where an
// async connection test delays startup while the app is already retrying.
if (require.main === module) {
logLifecycle('startup', `pid=${process.pid} sock=${SOCKET_PATH}`);
cleanupSocket();
startServer();
}