grok-build-vscode/scripts/live-tests.cjs at main · phuryn/grok-build-vscode · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
#!/usr/bin/env node
/**
 * On-demand LIVE pre-release smoke suite — spawns the REAL `grok agent stdio`
 * binary and exercises the surfaces our 368 grok-free unit tests can't:
 * the actual ACP handshake, a prompt round-trip, session restore, plan-mode
 * enforcement, and the v1.4.0 features (image + video generation, subagents).
 *
 * Why this is NOT part of `npm test`:
 *   - CI has no `grok` binary, no login, and no SuperGrok subscription
 *     (`/imagine` is subscription-gated), so it literally can't run there.
 *   - Real LLM output is non-deterministic (whether grok delegates to a
 *     subagent, the exact tool sequence, the image content) — unassertable
 *     the way pure logic is, so it would flake the unit suite.
 *   - Spawning the binary + generating media is seconds-to-minutes and burns
 *     subscription credits.
 * So it's a manual gate: run `npm run test:live` before a release-to-main.
 *
 * It validates the REAL extension logic, not a re-implementation: it requires
 * the compiled `out/acp-dispatch.js` + `out/plan-gate.js` and the shipped
 * `media/webview-helpers.js`, and feeds genuine grok wire output through
 * `isMediaGenToolCall` / `extractGeneratedMediaPaths` / `isSubagentToolCall` /
 * `shouldBlockWrite` exactly as the extension does.
 *
 * Usage:
 *   npm run test:live                  # all tests
 *   npm run test:live -- --quick       # skip the slow generative tests (image/video/subagent)
 *   npm run test:live -- --only=plan-mode,session-restore
 *   npm run test:live -- --only=video-gen          # video-gen is opt-in (off by default)
 *   npm run test:live -- --video-timeout=120000    # give /imagine-video 2 min before SKIPping
 *   GROK_BIN=/path/to/grok npm run test:live
 *
 * Exit code 0 iff no test FAILED (SKIPs — e.g. no subscription, grok chose not
 * to delegate — do not fail the gate; they're reported honestly).
 */
const { spawn } = require("node:child_process");
const os = require("node:os");
const path = require("node:path");
const fs = require("node:fs");

// ── Real extension modules (compiled CJS + shipped webview helper) ───────────
const REPO = path.resolve(__dirname, "..");
let dispatch, planGate, helpers;
try {
  dispatch = require(path.join(REPO, "out", "acp-dispatch.js"));
  planGate = require(path.join(REPO, "out", "plan-gate.js"));
  helpers = require(path.join(REPO, "media", "webview-helpers.js"));
} catch (e) {
  console.error("Could not load compiled modules — run `npm run compile` (or `tsc -p .`) first.\n" + e.message);
  process.exit(2);
}
const { isMediaGenToolCall, extractGeneratedMediaPaths } = dispatch;
const { shouldBlockWrite } = planGate;
const { isSubagentToolCall, subagentLabel } = helpers;

// ── grok locator (cross-platform; mirrors cli-locator's resolution order) ────
function resolveGrok() {
  if (process.env.GROK_BIN && fs.existsSync(process.env.GROK_BIN)) return process.env.GROK_BIN;
  const home = process.env.USERPROFILE || process.env.HOME || os.homedir();
  const win = process.platform === "win32";
  const candidates = win
    ? [path.join(home, ".grok", "bin", "grok.exe"), path.join(home, ".grok", "bin", "grok.cmd")]
    : [path.join(home, ".grok", "bin", "grok")];
  for (const c of candidates) if (fs.existsSync(c)) return c;
  return win ? "grok.exe" : "grok"; // last resort: rely on PATH
}
const GROK = resolveGrok();
const GROK_HOME = path.join(process.env.USERPROFILE || process.env.HOME || os.homedir(), ".grok");

// ── CLI flags ────────────────────────────────────────────────────────────────
const argv = process.argv.slice(2);
const flag = (name) => argv.find((a) => a === `--${name}` || a.startsWith(`--${name}=`));
const flagVal = (name) => { const f = flag(name); return f && f.includes("=") ? f.split("=")[1] : undefined; };
const QUICK = !!flag("quick");
const ONLY = (flagVal("only") || "").split(",").map((s) => s.trim()).filter(Boolean);
const SKIP = (flagVal("skip") || "").split(",").map((s) => s.trim()).filter(Boolean);
// /imagine-video works interactively, but in this bare headless harness grok
// 0.2.x tends to spin (Glob/Grep + the video tool retrying with status:failed)
// instead of cleanly producing one clip, so it often never finishes regardless
// of how long we wait — a longer cap doesn't help (10 min timed out just like
// 5). So testVideo treats a timeout as an inconclusive SKIP, not a FAIL, and the
// wait is just "give it a fair chance before giving up." Override with
// --video-timeout=<ms> or GROK_VIDEO_TIMEOUT_MS.
const VIDEO_TIMEOUT_MS = Number(flagVal("video-timeout") || process.env.GROK_VIDEO_TIMEOUT_MS) || 300000;

// ── A minimal ACP client over one grok child process ─────────────────────────
// Spawns `grok agent stdio`, frames newline-delimited JSON-RPC, auto-answers
// the mandatory server→client requests, and records writes + media/subagent
// tool calls so each test can assert against real wire output.
class Acp {
  constructor(cwd, { extraArgs = [], onWrite } = {}) {
    this.cwd = cwd;
    this.nextId = 1;
    this.waiters = new Map();
    this.updates = [];
    this.writes = [];        // every fs/write_text_file path grok asked for
    this.mediaGenIds = new Set();
    this.media = [];         // MediaRef[] from the real extractor
    this.subagentCalls = []; // tool calls the real isSubagentToolCall matched (genuine spawn_subagent shape)
    this.bgTasks = [];       // background tasks grok spawned (its real subagent mechanism)
    this.taskOutputCalls = []; // get_command_or_subagent_output poller tool_calls
    this.onWrite = onWrite;  // optional per-test hook (path) => "write" | "ack"
    this.buf = "";
    const win = process.platform === "win32";
    const useShell = /\.(cmd|bat)$/i.test(GROK);
    this.proc = spawn(GROK, [...extraArgs, "agent", "stdio"], {
      cwd, env: process.env, shell: useShell && win,
    });
    this.proc.stdout.on("data", (d) => this._onData(d));
    this.proc.stderr.on("data", (d) => {
      const s = d.toString();
      if (/error|panic|unauthor|forbidden|subscription/i.test(s)) this.lastStderr = s.slice(0, 300);
    });
    this.proc.on("exit", (c) => { this.exitCode = c; });
  }

  _onData(d) {
    this.buf += d;
    let i;
    while ((i = this.buf.indexOf("\n")) >= 0) {
      const line = this.buf.slice(0, i);
      this.buf = this.buf.slice(i + 1);
      if (!line.trim()) continue;
      let m; try { m = JSON.parse(line); } catch { continue; }
      this._handle(m);
    }
  }

  _handle(m) {
    if (m.id != null && m.method == null) {            // response to one of our requests
      const w = this.waiters.get(m.id);
      if (w) { this.waiters.delete(m.id); w(m); }
      return;
    }
    if (m.method === "session/update") {
      const u = m.params && m.params.update;
      if (u) { this.updates.push(u); this._inspectUpdate(u); }
      return;
    }
    if (m.method && m.id != null) this._serverRequest(m);  // server→client request
  }

  // Mirror of AcpClient.emitToolMedia + the subagent classifier so the test
  // measures exactly what the extension would surface.
  _inspectUpdate(u) {
    const t = u.sessionUpdate;
    if (t !== "tool_call" && t !== "tool_call_update") return;
    const id = u.toolCallId;
    if (isMediaGenToolCall(u) && typeof id === "string") this.mediaGenIds.add(id);
    if (typeof id === "string" && this.mediaGenIds.has(id)) {
      this.media.push(...extractGeneratedMediaPaths(u));
    }
    if (t === "tool_call" && isSubagentToolCall(u)) this.subagentCalls.push(u);
    // grok's REAL subagent mechanism on the native build is a *background*
    // run_terminal_command + a get_command_or_subagent_output poller (there is
    // no spawn_subagent tool — see research/subagents.md). Track both so the
    // test can confirm a delegation happened AND that the poller is NOT carded.
    const ri = u.rawInput || {};
    const title = String(u.title || "");
    if (ri.is_background === true || ri.background === true || /^\[bg\]/.test(title)) this.bgTasks.push(u);
    if (t === "tool_call" && (ri.variant === "TaskOutput" || /subagent_output|task output/i.test(title))) this.taskOutputCalls.push(u);
  }

  _serverRequest(m) {
    const meth = m.method;
    if (meth === "fs/read_text_file") {
      let content = ""; try { content = fs.readFileSync(m.params.path, "utf8"); } catch {}
      return this._respond(m.id, { content });
    }
    if (meth === "fs/write_text_file") {
      this.writes.push(m.params.path);
      const action = this.onWrite ? this.onWrite(m.params.path, m.params.content) : "write";
      if (action === "write") {
        try { fs.mkdirSync(path.dirname(m.params.path), { recursive: true }); fs.writeFileSync(m.params.path, m.params.content || ""); } catch {}
      }
      return this._respond(m.id, {});
    }
    if (meth === "terminal/create") return this._respond(m.id, { terminalId: "t" + this.nextId });
    if (meth === "terminal/output") return this._respond(m.id, { output: "", exitStatus: { exitCode: 0 }, truncated: false });
    if (meth === "terminal/wait_for_exit") return this._respond(m.id, { exitCode: 0 });
    if (meth === "terminal/kill" || meth === "terminal/release") return this._respond(m.id, {});
    if (meth.includes("exit_plan_mode")) return this._respond(m.id, { outcome: "approved" });
    if (meth === "session/request_permission") {
      const opts = (m.params && m.params.options) || [];
      const allow = opts.find((o) => /allow/.test(o.kind)) || opts[0];
      return this._respond(m.id, { outcome: { outcome: "selected", optionId: allow && allow.optionId } });
    }
    if (/ask_user_question/.test(meth)) return this._respond(m.id, { outcome: "cancelled" });
    return this._respond(m.id, {});
  }

  send(method, params) {
    const id = this.nextId++;
    this.proc.stdin.write(JSON.stringify({ jsonrpc: "2.0", id, method, params }) + "\n");
    return new Promise((res) => this.waiters.set(id, res));
  }
  _respond(id, result) { this.proc.stdin.write(JSON.stringify({ jsonrpc: "2.0", id, result }) + "\n"); }

  agentText() {
    return this.updates
      .filter((u) => u.sessionUpdate === "agent_message_chunk" && u.content && u.content.type === "text")
      .map((u) => u.content.text).join("");
  }
  kill() { try { this.proc.kill(); } catch {} }
}

// ── helpers ──────────────────────────────────────────────────────────────────
const INIT = { protocolVersion: 1, clientCapabilities: { fs: { readTextFile: true, writeTextFile: true }, terminal: true } };
function mkTmp(tag) { return fs.mkdtempSync(path.join(os.tmpdir(), "grok-live-" + tag + "-")); }
function withTimeout(promise, ms, label) {
  return Promise.race([promise, new Promise((_, rej) => setTimeout(() => rej(new Error(`timeout after ${ms}ms: ${label}`)), ms))]);
}
class Skip extends Error {}        // throw to mark a test SKIPPED (not failed)
function assert(cond, msg) { if (!cond) throw new Error(msg); }

// ── tests ────────────────────────────────────────────────────────────────────
// Each returns a short detail string on success, throws Error to FAIL, or
// throws Skip to mark inconclusive (e.g. no subscription / grok didn't delegate).

async function testHandshake() {
  const cwd = mkTmp("hs");
  const acp = new Acp(cwd);
  try {
    const init = await withTimeout(acp.send("initialize", INIT), 30000, "initialize");
    assert(!init.error, "initialize errored: " + JSON.stringify(init.error));
    const r = init.result || {};
    assert(r.protocolVersion != null || r.agentCapabilities || r.promptCapabilities, "no capabilities in initialize result");
    const caps = r.agentCapabilities || r.promptCapabilities || {};
    return `protocolVersion=${r.protocolVersion}, caps=${Object.keys(caps).join("|") || "?"}`;
  } finally { acp.kill(); }
}

async function testPrompt() {
  const cwd = mkTmp("prompt");
  const acp = new Acp(cwd);
  try {
    let r = await withTimeout(acp.send("initialize", INIT), 30000, "init");
    assert(!r.error, "init errored");
    r = await withTimeout(acp.send("session/new", { cwd, mcpServers: [] }), 30000, "session/new");
    assert(!r.error && r.result && r.result.sessionId, "session/new failed: " + JSON.stringify(r.error));
    const sessionId = r.result.sessionId;
    const pr = await withTimeout(
      acp.send("session/prompt", { sessionId, prompt: [{ type: "text", text: "Reply with exactly one word: PONG. No tools, no explanation." }] }),
      120000, "session/prompt");
    assert(!pr.error, "prompt errored: " + JSON.stringify(pr.error));
    const text = acp.agentText();
    assert(text.trim().length > 0, "no agent_message_chunk text came back");
    const pong = /pong/i.test(text);
    // grok ≥0.2.33 echoes the live prompt back as a user_message_chunk — the very
    // behavior that doubled every sent message before the host gated forwarding to
    // replay-only. Surface the count so a future version dropping it is visible.
    const liveEchoes = acp.updates.filter((u) => u.sessionUpdate === "user_message_chunk").length;
    return `stopReason=${pr.result && pr.result.stopReason}, replied ${text.trim().length} chars${pong ? " (contains PONG)" : ""}, live-echo×${liveEchoes}`;
  } finally { acp.kill(); }
}

async function testRestore() {
  const cwd = mkTmp("restore");
  const MARK = "ZEBRA-RESTORE-CHECK";
  // 1) fresh process: make a session and put a recognizable exchange in it
  const a = new Acp(cwd);
  let sessionId;
  try {
    await withTimeout(a.send("initialize", INIT), 30000, "init");
    const ns = await withTimeout(a.send("session/new", { cwd, mcpServers: [] }), 30000, "new");
    assert(ns.result && ns.result.sessionId, "session/new failed");
    sessionId = ns.result.sessionId;
    await withTimeout(a.send("session/prompt", { sessionId, prompt: [{ type: "text", text: `Remember this codeword and reply with just the word: ${MARK}` }] }), 120000, "seed prompt");
  } finally { a.kill(); }
  await new Promise((r) => setTimeout(r, 800)); // let grok flush the session to disk

  // 2) brand-new process: load that session and assert the history replays
  const b = new Acp(cwd);
  try {
    await withTimeout(b.send("initialize", INIT), 30000, "init2");
    const load = await withTimeout(b.send("session/load", { sessionId, cwd, mcpServers: [] }), 60000, "session/load");
    assert(!load.error, "session/load errored: " + JSON.stringify(load.error));
    const replay = b.updates
      .filter((u) => /message_chunk/.test(u.sessionUpdate))
      .map((u) => (u.content && u.content.text) || "").join("\n");
    assert(b.updates.length > 0, "session/load produced no replay updates");
    assert(replay.includes(MARK), `replay did not contain the seeded codeword (got ${b.updates.length} updates)`);
    return `loaded ${sessionId.slice(0, 8)}…, replayed ${b.updates.length} updates incl. codeword`;
  } finally { b.kill(); }
}

async function testPlanMode() {
  const cwd = mkTmp("plan");
  fs.writeFileSync(path.join(cwd, "app.js"), "function add(a,b){return a+b}\nmodule.exports={add}\n");
  // In plan mode we must NOT let grok mutate the workspace, so refuse in-workspace
  // writes (ack without writing) — exactly the choke point the extension gates.
  const acp = new Acp(cwd, { onWrite: () => "ack" });
  try {
    await withTimeout(acp.send("initialize", INIT), 30000, "init");
    const ns = await withTimeout(acp.send("session/new", { cwd, mcpServers: [] }), 30000, "new");
    assert(ns.result && ns.result.sessionId, "session/new failed");
    const sessionId = ns.result.sessionId;
    const sm = await withTimeout(acp.send("session/set_mode", { sessionId, modeId: "plan" }), 30000, "set_mode");
    assert(!sm.error, "set_mode plan errored: " + JSON.stringify(sm.error));
    await withTimeout(
      acp.send("session/prompt", { sessionId, prompt: [{ type: "text", text: "Plan how to add a subtract(a,b) function to app.js and a test for it. Produce a detailed plan; do not implement yet." }] }),
      150000, "plan prompt");

    // The real client-side gate: with plan mode active, an in-workspace write is
    // blocked, while grok's own ~/.grok/sessions/.../plan.md write is allowed.
    const ctx = { active: true, workspaceRoot: cwd, grokHome: GROK_HOME };
    const inWorkspace = path.join(cwd, "app.js");
    assert(shouldBlockWrite(inWorkspace, ctx) === true, "plan-gate failed to block an in-workspace write");
    const planFile = path.join(GROK_HOME, "sessions", "enc", sessionId, "plan.md");
    assert(shouldBlockWrite(planFile, ctx) === false, "plan-gate wrongly blocked grok's own plan.md");

    // Behavioral check against real grok: it must not have mutated the workspace.
    const workspaceWrites = acp.writes.filter((w) => {
      const rel = path.relative(cwd, w);
      return rel && !rel.startsWith("..") && !path.isAbsolute(rel);
    });
    assert(workspaceWrites.length === 0, `grok wrote ${workspaceWrites.length} file(s) into the workspace in plan mode: ${workspaceWrites.join(", ")}`);
    const wrotePlan = acp.writes.some((w) => /plan\.md$/i.test(w));
    return `gate blocks workspace writes & allows plan.md; grok made 0 workspace writes${wrotePlan ? ", wrote its own plan.md" : ""}`;
  } finally { acp.kill(); }
}

async function testImage() {
  const cwd = mkTmp("img");
  const acp = new Acp(cwd);
  try {
    await withTimeout(acp.send("initialize", INIT), 30000, "init");
    const ns = await withTimeout(acp.send("session/new", { cwd, mcpServers: [] }), 30000, "new");
    assert(ns.result && ns.result.sessionId, "session/new failed");
    const pr = await withTimeout(
      acp.send("session/prompt", { sessionId: ns.result.sessionId, prompt: [{ type: "text", text: "/imagine a small red cube on a white background" }] }),
      180000, "/imagine");
    if (pr.error) throw new Skip("/imagine errored (likely no subscription): " + JSON.stringify(pr.error));
    const imgs = acp.media.filter((m) => m.media === "image");
    if (imgs.length === 0) {
      if (/subscription|unauthor|forbidden|upgrade/i.test(acp.lastStderr || acp.agentText())) throw new Skip("image generation unavailable (subscription/auth)");
      throw new Skip("grok produced no image (the model declined or the feature is gated) — agent said: " + acp.agentText().slice(0, 120));
    }
    const ref = imgs[0];
    assert(fs.existsSync(ref.path), "extractor returned an image path that doesn't exist on disk: " + ref.path);
    const bytes = fs.statSync(ref.path).size;
    assert(bytes > 1000, "generated image file is suspiciously small: " + bytes + " bytes");
    return `image at ${path.basename(ref.path)} (${(bytes / 1024).toFixed(0)} KB), classified media:"image"`;
  } finally { acp.kill(); }
}

// What was grok last seen doing? Used to make a video-gen timeout self-explain:
// "in_progress with a media-gen tool call" = slow-but-working; "no tool calls" =
// stuck/idle (a different problem worth chasing).
function videoProgress(acp) {
  const tc = acp.updates.filter((u) => u.sessionUpdate === "tool_call" || u.sessionUpdate === "tool_call_update");
  const titles = [...new Set(tc.map((u) => u.title).filter(Boolean))].slice(0, 4);
  const statuses = [...new Set(tc.map((u) => u.status).filter(Boolean))];
  return `grok emitted ${acp.updates.length} update(s), ${acp.mediaGenIds.size} media-gen tool call(s)`
    + (titles.length ? `, tools: [${titles.join(" / ")}]` : ", no tool calls")
    + (statuses.length ? `, status: ${statuses.join("/")}` : "")
    + (acp.media.length ? `, ${acp.media.length} media path(s) extracted` : "");
}

async function testVideo() {
  const cwd = mkTmp("vid");
  const acp = new Acp(cwd);
  try {
    await withTimeout(acp.send("initialize", INIT), 30000, "init");
    const ns = await withTimeout(acp.send("session/new", { cwd, mcpServers: [] }), 30000, "new");
    assert(ns.result && ns.result.sessionId, "session/new failed");
    let pr;
    try {
      pr = await withTimeout(
        acp.send("session/prompt", { sessionId: ns.result.sessionId, prompt: [{ type: "text", text: "/imagine-video a red cube slowly rotating on a white background" }] }),
        VIDEO_TIMEOUT_MS, "/imagine-video");
    } catch (e) {
      // xAI video gen is slow + variable; a headless gate run can exceed the wait
      // window even though /imagine-video works interactively. A timeout is
      // inconclusive, not a regression — SKIP (don't fail the gate), and report
      // what grok was last doing so a real hang (no tool calls) still stands out.
      if (/^timeout after/.test(e.message)) throw new Skip(`/imagine-video didn't finish within ${VIDEO_TIMEOUT_MS}ms — ${videoProgress(acp)}`);
      throw e;
    }
    if (pr.error) throw new Skip("/imagine-video errored (likely no subscription): " + JSON.stringify(pr.error));
    const vids = acp.media.filter((m) => m.media === "video");
    if (vids.length === 0) {
      if (/subscription|unauthor|forbidden|upgrade/i.test(acp.lastStderr || acp.agentText())) throw new Skip("video generation unavailable (subscription/auth)");
      throw new Skip("grok produced no video (model declined or feature gated) — agent said: " + acp.agentText().slice(0, 120));
    }
    const ref = vids[0];
    assert(fs.existsSync(ref.path), "extractor returned a video path that doesn't exist on disk: " + ref.path);
    const bytes = fs.statSync(ref.path).size;
    assert(bytes > 10000, "generated video file is suspiciously small: " + bytes + " bytes");
    return `video at ${path.basename(ref.path)} (${(bytes / 1024).toFixed(0)} KB), classified media:"video"`;
  } finally { acp.kill(); }
}

async function testSubagent() {
  const cwd = mkTmp("sub");
  // seed a couple of files so a "investigate the codebase" task is delegation-worthy
  fs.writeFileSync(path.join(cwd, "app.js"), "const {add}=require('./math');\nconsole.log(add(2,3));\n");
  fs.writeFileSync(path.join(cwd, "math.js"), "function add(a,b){return a+b}\nmodule.exports={add};\n");
  const acp = new Acp(cwd, { extraArgs: ["--always-approve"] });
  try {
    await withTimeout(acp.send("initialize", INIT), 30000, "init");
    const ns = await withTimeout(acp.send("session/new", { cwd, mcpServers: [] }), 30000, "new");
    assert(ns.result && ns.result.sessionId, "session/new failed");
    await withTimeout(
      acp.send("session/prompt", { sessionId: ns.result.sessionId, prompt: [{ type: "text", text: "Use a subagent to read math.js and report in one sentence what add() does. Delegate to a subagent." }] }),
      300000, "subagent prompt");

    // Regression guard: grok's get_command_or_subagent_output is an output READER,
    // not a delegation. Its tool name contains "subagent", which used to false-fire
    // a Subagent card. The classifier must never match it — assert on every real
    // poller call grok made this run.
    const misfired = acp.taskOutputCalls.filter((u) => isSubagentToolCall(u));
    assert(misfired.length === 0, `isSubagentToolCall wrongly matched ${misfired.length} get_command_or_subagent_output poller(s)`);

    // Did grok actually delegate? On the native build that's a background task +
    // task-output poll; some builds may instead emit a genuine spawn_subagent.
    const bgIds = new Set(acp.bgTasks.map((u) => u.toolCallId));
    const pollIds = new Set(acp.taskOutputCalls.map((u) => u.toolCallId));
    if (bgIds.size === 0 && pollIds.size === 0 && acp.subagentCalls.length === 0) {
      throw new Skip("grok did not delegate this run (non-deterministic) — saw " +
        acp.updates.filter((u) => u.sessionUpdate === "tool_call").length +
        " tool calls, none a subagent / background task");
    }
    if (acp.subagentCalls.length > 0) {
      const labels = [...new Set(acp.subagentCalls.map(subagentLabel))];
      return `genuine spawn_subagent card(s): ${labels.join(", ")}; poller correctly not carded`;
    }
    return `delegated via background task (${bgIds.size} bg spawn, ${pollIds.size} output-poll); ` +
      `poller correctly NOT carded — grok's real subagent = background shell, see research/subagents.md`;
  } finally { acp.kill(); }
}

// ── registry + runner ────────────────────────────────────────────────────────
const TESTS = [
  { name: "handshake", fn: testHandshake, slow: false },
  { name: "prompt-roundtrip", fn: testPrompt, slow: false },
  { name: "session-restore", fn: testRestore, slow: false },
  { name: "plan-mode", fn: testPlanMode, slow: false },
  { name: "image-gen", fn: testImage, slow: true },
  // video-gen is opt-in only (run with --only=video-gen). In this headless harness
  // grok 0.2.x spins on /imagine-video instead of producing a clip, so it never
  // completes and is excluded from the default release gate — the feature works
  // interactively. See the testVideo comment + the SKIP-on-timeout handling.
  { name: "video-gen", fn: testVideo, slow: true, optIn: true },
  { name: "subagent", fn: testSubagent, slow: true },
];

function selected() {
  let list = TESTS;
  if (ONLY.length) list = list.filter((t) => ONLY.includes(t.name));
  else list = list.filter((t) => !t.optIn); // opt-in tests only run when named in --only
  if (SKIP.length) list = list.filter((t) => !SKIP.includes(t.name));
  if (QUICK) list = list.filter((t) => !t.slow);
  return list;
}

(async () => {
  const list = selected();
  console.log(`\n grok live suite — binary: ${GROK}`);
  console.log(` running ${list.length} test(s)${QUICK ? " (quick: generative tests skipped)" : ""}\n`);
  const results = [];
  for (const t of list) {
    const started = process.hrtime.bigint();
    process.stdout.write(`  • ${t.name} … `);
    try {
      const detail = await t.fn();
      const ms = Number((process.hrtime.bigint() - started) / 1000000n);
      console.log(`PASS (${ms}ms)\n      ${detail}`);
      results.push({ name: t.name, status: "PASS", detail });
    } catch (e) {
      const ms = Number((process.hrtime.bigint() - started) / 1000000n);
      if (e instanceof Skip) {
        console.log(`SKIP (${ms}ms)\n      ${e.message}`);
        results.push({ name: t.name, status: "SKIP", detail: e.message });
      } else {
        console.log(`FAIL (${ms}ms)\n      ${e.message}`);
        results.push({ name: t.name, status: "FAIL", detail: e.message });
      }
    }
  }
  const pass = results.filter((r) => r.status === "PASS").length;
  const skip = results.filter((r) => r.status === "SKIP").length;
  const fail = results.filter((r) => r.status === "FAIL").length;
  console.log(`\n ── summary ──  ${pass} passed · ${skip} skipped · ${fail} failed`);
  for (const r of results) if (r.status !== "PASS") console.log(`   ${r.status}  ${r.name}`);
  console.log("");
  process.exit(fail > 0 ? 1 : 0);
})().catch((e) => { console.error("runner crashed:", e); process.exit(2); });