claude-desktop-debian/tools/test-harness/src/runners/T10_cowork_daemon_respawn.spec.ts
Aaddrick 3506c14918 test(harness): add Linux compatibility test harness (#579)
Build out a Playwright-based regression-detection harness covering
the compat-matrix surfaces (KDE-W, KDE-X, GNOME, Sway, i3, Niri,
packaging formats). Adds:

- Planning + decision docs under docs/testing/ — README, matrix,
  runbook, automation, cases/ (11 case files), quick-entry-closeout
- Playwright scaffolding (config, tsconfig)
- 78 spec runners under tools/test-harness/src/runners/ — T## case-
  doc runners and S## distribution/smoke runners
- Substrate primitives in tools/test-harness/src/lib/: AX-tree
  loader (snapshotAx + waitForAxNode + axTreeToSnapshot), focus-
  shifter, eipc-registry, niri-native bridge, drag-drop bridge,
  electron-mocks, claudeai page-objects, inspector client

S03 (DEB Depends declared) and S04 (RPM Requires declared) ship
marked test.fail() — they're regression detectors for the case-doc
gap (deb.sh emits no Depends:, rpm.sh sets AutoReqProv: no), and
the expected-failure shape lets them report green on every host
until upstream packaging starts declaring runtime deps.
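
The expected-failure shape, roughly. This is a hedged sketch of
the Playwright idiom only, not the S03/S04 source; readDebControl
is a hypothetical stand-in for the real control-file check:

    test('S03: DEB declares runtime Depends', async () => {
      test.fail(true, 'deb.sh emits no Depends: yet');
      const control = await readDebControl(); // hypothetical helper
      // Fails today, so the run reports it as an expected failure.
      expect(control).toMatch(/^Depends:/m);
    });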

127 files, no runtime changes; harness is opt-in via
'cd tools/test-harness && npx playwright test'.

Co-authored-by: Claude <claude@anthropic.com>
2026-05-04 23:17:37 -04:00


import { test, expect } from '@playwright/test';
import { execFile } from 'node:child_process';
import { promisify } from 'node:util';
import { launchClaude } from '../lib/electron.js';
import { skipUnlessRow } from '../lib/row.js';
import { sleep } from '../lib/retry.js';
import { captureSessionEnv } from '../lib/diagnostics.js';
const exec = promisify(execFile);
// T10 — cowork daemon respawn after kill.
//
// docs/testing/cases/platform-integration.md T10 covers two
// claims: the daemon spawns when Cowork needs it (asserted by
// H04), AND it respawns within the documented timeout if it
// crashes mid-session. This runner covers the second half.
//
// The respawn path is implemented by Patch 6 in
// scripts/patches/cowork.sh:244-362 (issue #408). The auto-launch
// gate uses a timestamp-based cooldown (`_lastSpawn`, 10s window)
// instead of a one-shot boolean specifically so the retry loop
// in kUe()/the renamed retry function can re-fork the daemon
// after it dies. If the cooldown regresses back to a one-shot
// boolean, or the cooldown window grows past the renderer's
// retry budget, kill-then-respawn silently breaks and the user
// sees "VM service not running" until they restart the app.
//
// Trigger model: post-1.5354.0 the cowork client opens a
// persistent pipe at boot (zI/E$i happy path) and uses it for
// every subsequent RPC. After SIGKILL the persistent socket goes
// dead but no client code is in steady-state RPC traffic, so
// nothing fires the retry loop on its own. T10 has to drive
// traffic itself: invoking the ClaudeVM.getRunningStatus() eipc
// handler (from the main process; see Phase 3) forces the client
// code to call zI() / kUe(), which sees the dead socket, hits the
// cooldown gate, and re-forks the daemon.
//
// Verification primitive: globalThis.__coworkDaemonPid is set
// by the patched fork code after each successful spawn (Patch 6
// in scripts/patches/cowork.sh). Polling that global is faster
// and less race-prone than pgrep, but pgrep is still captured on
// failure as a cross-check.
//
// Row gate matches H04 — daemon is Linux-only, gating mirrors the
// rest of the cowork lifecycle row set.
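//
// For reference, the cooldown-gate shape described above, as a
// minimal illustrative sketch: NOT the actual Patch 6 source (all
// names here are assumed), and nothing in this runner calls it.
// The property the runner depends on is that a timestamp window
// re-opens COOLDOWN_MS after each spawn, so a later fork is
// possible, whereas a one-shot boolean would never re-open and
// kill-then-respawn would silently regress.
const ILLUSTRATIVE_COOLDOWN_MS = 10_000;
let illustrativeLastSpawnTs = 0;
function illustrativeCooldownGateOpen(now: number): boolean {
  // Inside the window: suppress the fork (prevents spawn storms).
  if (now - illustrativeLastSpawnTs < ILLUSTRATIVE_COOLDOWN_MS) return false;
  // Outside the window: re-arm it and allow the fork.
  illustrativeLastSpawnTs = now;
  return true;
}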
const PGREP_PATTERN = 'cowork-vm-service\\.js';
async function pgrepPids(pattern: string): Promise<Set<number>> {
  try {
    const { stdout } = await exec('pgrep', ['-f', pattern], {
      timeout: 5_000,
    });
    return new Set(
      stdout
        .split('\n')
        .map((l) => parseInt(l.trim(), 10))
        .filter((n) => !Number.isNaN(n)),
    );
  } catch (err) {
    // pgrep exits 1 with empty stdout when nothing matches: treat
    // that as the empty set. Any other failure (e.g. a timeout
    // kill) falls back to parsing whatever stdout was produced.
    const e = err as { code?: number; stdout?: string };
    if (e.code === 1) return new Set();
    const out = e.stdout ?? '';
    return new Set(
      out
        .split('\n')
        .map((l) => parseInt(l.trim(), 10))
        .filter((n) => !Number.isNaN(n)),
    );
  }
}
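// Example (illustrative values): with one daemon up,
// `await pgrepPids(PGREP_PATTERN)` resolves to something like
// Set { 41231 }; with none running, pgrep exits 1 and the helper
// resolves to the empty set instead of throwing.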
test.setTimeout(90_000);
test('T10 — cowork daemon respawns after SIGKILL', async ({}, testInfo) => {
  testInfo.annotations.push({ type: 'severity', description: 'Should' });
  testInfo.annotations.push({
    type: 'surface',
    description: 'Cowork daemon respawn',
  });
  skipUnlessRow(testInfo, ['KDE-W', 'GNOME-W', 'Ubu-W', 'KDE-X', 'GNOME-X']);
  await testInfo.attach('session-env', {
    body: JSON.stringify(captureSessionEnv(), null, 2),
    contentType: 'application/json',
  });
  // Baseline — launchClaude's cleanupPreLaunch (lib/electron.ts:160-191)
  // pkills any leftover cowork daemon before spawning, so a stray
  // pid here would mean the cleanup itself is broken.
  const baselinePids = await pgrepPids(PGREP_PATTERN);
  await testInfo.attach('baseline-pids', {
    body: JSON.stringify(
      {
        pids: Array.from(baselinePids),
        note:
          'cleanupPreLaunch should leave this empty before launch. ' +
          'Non-empty here is a bug in lib/electron.ts:160-191.',
      },
      null,
      2,
    ),
    contentType: 'application/json',
  });
  const useHostConfig = process.env.CLAUDE_TEST_USE_HOST_CONFIG === '1';
  const app = await launchClaude({
    isolation: useHostConfig ? null : undefined,
  });
  let daemonPid: number | null = null;
  try {
    // userLoaded — main shell up AND the renderer has navigated
    // to a post-login URL. The boot-time daemon spawn happens
    // well before this (cowork.sh:262-362 gates on early renderer
    // activity), but Phase 3's `window['claude.web'].ClaudeVM`
    // invocation requires the renderer to be on a post-login URL
    // where the eipc wrapper is exposed. Pre-login pages don't
    // expose `claude.web`, so RPC attempts get "Cannot find
    // context with specified id" errors. Waiting for userLoaded
    // once at the top guarantees the wrapper is reachable.
    const { inspector } = await app.waitForReady('userLoaded');
    // Phase 1: capture the original daemon pid. Same 15s window
    // as H04 — if the daemon never spawned in the first place,
    // there's nothing to kill, so skip with the same reason.
    const spawnStart = Date.now();
    while (Date.now() - spawnStart < 15_000) {
      const pids = await pgrepPids(PGREP_PATTERN);
      const newPids = Array.from(pids).filter(
        (p) => !baselinePids.has(p),
      );
      if (newPids.length > 0) {
        daemonPid = newPids[0]!;
        break;
      }
      await sleep(500);
    }
    if (daemonPid === null) {
      await testInfo.attach('skip-reason', {
        body: JSON.stringify(
          {
            reason:
              'cowork daemon not spawned within 15s of userLoaded',
            note:
              'Auto-launch in cowork.sh:262-362 is gated on a VM ' +
              'service connection attempt from the renderer; on a ' +
              'passive launch with no Cowork-tab interaction it may ' +
              'legitimately not fire. Without an initial spawn there ' +
              'is no daemon to kill, so the respawn assertion is ' +
              'unreachable. Same skip path as H04.',
          },
          null,
          2,
        ),
        contentType: 'application/json',
      });
      testInfo.skip(
        true,
        'cowork daemon not spawned by this build — gating in ' +
          'cowork.sh:262-362 may have suppressed it on a passive launch',
      );
      return;
    }
    const originalSpawnElapsedMs = Date.now() - spawnStart;
    await testInfo.attach('original-spawn', {
      body: JSON.stringify(
        {
          pid: daemonPid,
          elapsedMs: originalSpawnElapsedMs,
        },
        null,
        2,
      ),
      contentType: 'application/json',
    });
    // Phase 2: SIGKILL the daemon. Try direct process.kill first;
    // the daemon is forked by the Electron main process under the
    // same uid as the test runner, so this should not need root.
    // Shell-out fallback covers the unlikely case where direct
    // kill fails (e.g. EPERM on a misconfigured runner).
    const killTs = Date.now();
    let killMethod = 'process.kill';
    try {
      process.kill(daemonPid, 'SIGKILL');
    } catch {
      killMethod = 'execFile-kill-9';
      await exec('kill', ['-9', String(daemonPid)], { timeout: 5_000 });
    }
    await testInfo.attach('kill', {
      body: JSON.stringify(
        {
          killedPid: daemonPid,
          killMethod,
          killedAt: new Date(killTs).toISOString(),
        },
        null,
        2,
      ),
      contentType: 'application/json',
    });
    // Phase 3: drive the retry loop and poll for a NEW pid. The
    // cooldown in cowork.sh:329-332 is 10s, so the new pid can't
    // arrive earlier than 10s past the original `_lastSpawn`. The
    // 30s budget gives 10s of cooldown headroom plus 20s for the
    // renderer context to recover from any post-kill navigation
    // (the dead VM service can trigger a re-render that throws
    // "Cannot find context with specified id" on RPCs in flight),
    // plus the fork + bind + exec round-trip for the new daemon.
    //
    // Each poll iteration: (1) fire ClaudeVM.getRunningStatus()
    // via the main-process eipc handler (see below) — best-effort,
    // expect throws on post-kill navigations and on the first
    // attempts before the cooldown gate opens — and (2) read
    // globalThis.__coworkDaemonPid (set by the patched fork code
    // after every successful spawn). pgrep is the cross-check.
    const respawnStart = Date.now();
    let respawnPid: number | null = null;
    let rpcAttempts = 0;
    let rpcFailures = 0;
    let lastRpcError: string | null = null;
    while (Date.now() - respawnStart < 30_000) {
      // Drive a daemon RPC by invoking the eipc handler from
      // MAIN directly. The renderer-wrapper path
      // (window['claude.web'].ClaudeVM.getRunningStatus) is
      // unreliable here because the dead VM service triggers
      // a renderer re-render that throws "Cannot find context
      // with specified id" on most calls. Calling the handler
      // from main bypasses the renderer entirely; the handler
      // internally goes through zI()/VsA()/kUe(), the latter
      // of which sees ECONNREFUSED/ENOENT and hits the
      // cooldown-gated fork. We forge a senderFrame.url to
      // satisfy any origin-gated handlers (claude.web scope).
      rpcAttempts++;
      try {
        await inspector.evalInMain(`
          const { webContents } = process.mainModule.require('electron');
          const wc = webContents.getAllWebContents().find(w => {
            try { return w.getURL().includes('claude.ai'); }
            catch { return false; }
          });
          if (!wc) return null;
          const handlers = wc.ipc && wc.ipc._invokeHandlers;
          if (!handlers || typeof handlers.keys !== 'function') return null;
          const channel = Array.from(handlers.keys())
            .find(k => k.endsWith('_$_ClaudeVM_$_getRunningStatus'));
          if (!channel) return null;
          const handler = handlers.get(channel);
          if (typeof handler !== 'function') return null;
          const fakeEvent = {
            senderFrame: { url: 'https://claude.ai/' },
            sender: wc,
          };
          try { await handler(fakeEvent); } catch (e) { /* expected */ }
          return null;
        `);
      } catch (err) {
        rpcFailures++;
        lastRpcError = err instanceof Error ? err.message : String(err);
      }
      // Primary signal: the global pid changed.
      let currentGlobalPid: number | null = null;
      try {
        currentGlobalPid = await inspector.evalInMain<number | null>(
          `return globalThis.__coworkDaemonPid ?? null;`,
        );
      } catch {
        // inspector momentarily unavailable — keep polling
      }
      if (
        currentGlobalPid !== null &&
        currentGlobalPid !== daemonPid &&
        !baselinePids.has(currentGlobalPid)
      ) {
        respawnPid = currentGlobalPid;
        break;
      }
      // Cross-check via pgrep (covers the corner where the global
      // is set but pgrep hasn't observed the new pid yet, or the
      // global never gets updated for some reason).
      const pids = await pgrepPids(PGREP_PATTERN);
      const candidates = Array.from(pids).filter(
        (p) => !baselinePids.has(p) && p !== daemonPid,
      );
      if (candidates.length > 0) {
        respawnPid = candidates[0]!;
        break;
      }
      await sleep(500);
    }
    const respawnElapsedMs = Date.now() - respawnStart;
    if (respawnPid === null) {
      const finalPids = await pgrepPids(PGREP_PATTERN);
      let finalGlobalPid: number | null = null;
      try {
        finalGlobalPid = await inspector.evalInMain<number | null>(
          `return globalThis.__coworkDaemonPid ?? null;`,
        );
      } catch {
        // best-effort
      }
      await testInfo.attach('respawn-failure', {
        body: JSON.stringify(
          {
            killedPid: daemonPid,
            pgrepFinal: Array.from(finalPids),
            globalDaemonPidFinal: finalGlobalPid,
            rpcAttempts,
            rpcFailures,
            lastRpcError,
            elapsedMs: respawnElapsedMs,
            note:
              'No new cowork-vm-service pid observed within 30s ' +
              'of SIGKILL despite firing ClaudeVM.getRunningStatus ' +
              'each iteration. Cooldown in cowork.sh:329-332 is 10s. ' +
              'Possible regressions: cooldown reverted to a one-shot ' +
              'boolean (issue #408), the retry loop no longer enters ' +
              'the auto-launch branch on ECONNREFUSED/ENOENT, the ' +
              'patched fork no longer assigns __coworkDaemonPid, or ' +
              'ClaudeVM eipc no longer routes through the daemon ' +
              'RPC (the trigger surface).',
          },
          null,
          2,
        ),
        contentType: 'application/json',
      });
    } else {
      await testInfo.attach('respawn', {
        body: JSON.stringify(
          {
            originalPid: daemonPid,
            respawnPid,
            rpcAttempts,
            rpcFailures,
            elapsedMs: respawnElapsedMs,
          },
          null,
          2,
        ),
        contentType: 'application/json',
      });
    }
    expect(
      respawnPid,
      'cowork-vm-service respawns within 30s of SIGKILL',
    ).not.toBeNull();
    expect(
      respawnPid,
      'respawn pid is distinct from the killed pid',
    ).not.toBe(daemonPid);
  } finally {
    await app.close();
    // Best-effort cleanup confirmation. If anything still matches
    // PGREP_PATTERN after close, attach it for diagnosis but don't
    // fail — H04 is the runner that asserts the cleanup contract.
    await sleep(2_000);
    const postExitPids = await pgrepPids(PGREP_PATTERN);
    const lingering = Array.from(postExitPids).filter(
      (p) => !baselinePids.has(p),
    );
    await testInfo.attach('post-exit-pgrep', {
      body: JSON.stringify(
        {
          baseline: Array.from(baselinePids),
          postExit: Array.from(postExitPids),
          lingering,
          note:
            'Informational. H04 owns the cleanup-after-close ' +
            'assertion; this attachment is for cross-referencing ' +
            'when respawn passes but cleanup regresses elsewhere.',
        },
        null,
        2,
      ),
      contentType: 'application/json',
    });
  }
});