diff --git a/.sandcastle/issue-source.ts b/.sandcastle/issue-source.ts index 6378d22..e7d720e 100644 --- a/.sandcastle/issue-source.ts +++ b/.sandcastle/issue-source.ts @@ -6,9 +6,9 @@ * * Runs inside the outer devcontainer, where `gh` is already authenticated. */ -import { execFile } from "node:child_process"; +import { execFile, spawn } from "node:child_process"; import { promisify } from "node:util"; -import { READY_LABEL, type Issue } from "./reduce.ts"; +import { READY_LABEL, type Issue, type ReviewOutput } from "./reduce.ts"; const exec = promisify(execFile); @@ -39,6 +39,24 @@ export class IssueSource { return exec("gh", full, { maxBuffer: 16 * 1024 * 1024 }); } + private ghWithInput(args: string[], input: string): Promise<{ stdout: string }> { + return new Promise((resolve, reject) => { + const full = this.repo ? ["-R", this.repo, ...args] : args; + const child = spawn("gh", full, { stdio: ["pipe", "pipe", "pipe"] }); + let stdout = ""; + let stderr = ""; + child.stdout.on("data", (chunk: Buffer) => { stdout += chunk.toString(); }); + child.stderr.on("data", (chunk: Buffer) => { stderr += chunk.toString(); }); + child.on("close", (code: number | null) => { + if (code !== 0) reject(new Error(`gh exited ${code}: ${stderr}`)); + else resolve({ stdout }); + }); + child.on("error", reject); + child.stdin.write(input); + child.stdin.end(); + }); + } + /** Open issues carrying the ready label, with dependency refs parsed from body. */ async listReady(): Promise { const { stdout } = await this.gh([ @@ -139,6 +157,53 @@ export class IssueSource { await this.gh(["issue", "comment", String(n), "--body", body]); } + /** Return the unified diff for the PR on the given issue's branch. */ + async getPrDiff(issueId: number): Promise { + const { stdout } = await this.gh(["pr", "diff", `agent/issue-${issueId}`]); + return stdout; + } + + /** + * Post a REQUEST_CHANGES review on the PR for the given issue. + * Uses the GitHub API to attach the structured verdict + filtered inline comments. + * Best-effort: if inline-comment posting fails, falls back to a plain PR comment. + */ + async postPrReview(issueId: number, output: ReviewOutput): Promise { + const { stdout: prJson } = await this.gh([ + "pr", "view", `agent/issue-${issueId}`, + "--json", "number,headRefOid", + ]); + const prInfo = JSON.parse(prJson) as { number: number; headRefOid: string }; + + const reviewBody = { + commit_id: prInfo.headRefOid, + event: "REQUEST_CHANGES", + body: output.summary, + comments: output.comments.map((c) => ({ + path: c.path, + line: c.line, + side: "RIGHT", + body: c.body, + })), + }; + + try { + await this.ghWithInput( + ["api", `repos/{owner}/{repo}/pulls/${prInfo.number}/reviews`, "--method", "POST", "--input", "-"], + JSON.stringify(reviewBody), + ); + } catch (err) { + // Inline comments may fail when lines are no longer in the diff; fall back to a + // plain top-level review without inline comments so the verdict is still visible. + console.warn(`[issue-source] inline review post failed — falling back to body-only review:`, err); + const bodyOnly = { ...reviewBody, comments: [] }; + await this.ghWithInput( + ["api", `repos/{owner}/{repo}/pulls/${prInfo.number}/reviews`, "--method", "POST", "--input", "-"], + JSON.stringify(bodyOnly), + ); + } + } + /** Open a PR for an already-pushed head branch. Returns the PR URL. */ async openPr(opts: { head: string; diff --git a/.sandcastle/main.ts b/.sandcastle/main.ts index 4c4c4d8..67acf3a 100644 --- a/.sandcastle/main.ts +++ b/.sandcastle/main.ts @@ -30,9 +30,10 @@ import { existsSync, mkdirSync, openSync, closeSync, writeFileSync } from "node: import { promisify } from "node:util"; import { fileURLToPath } from "node:url"; import { join, dirname } from "node:path"; -import { reduce, READY_LABEL, type State, type Pr, type Policy, type Mode } from "./reduce.ts"; +import { reduce, READY_LABEL, reviewVerdict, type State, type Pr, type Policy, type Mode } from "./reduce.ts"; import { IssueSource } from "./issue-source.ts"; import { SandboxRunner, SANDBOX_LABEL, PROJECT_LABEL_KEY, deriveProject } from "./sandbox-runner.ts"; +import { ReviewerAdapter } from "./reviewer-adapter.ts"; export { SANDBOX_LABEL }; @@ -612,6 +613,12 @@ async function main(): Promise { network, }); + const reviewer = new ReviewerAdapter({ + model: process.env.AGENTIC_MODEL, + cwd: repoRoot, + baseBranch: base, + }); + const inFlight: number[] = []; // Issue ids whose PrMerged event has been dispatched to the reducer. // Synchronous check + add before the first await ensures concurrent smee @@ -771,14 +778,57 @@ async function main(): Promise { }; const finishActions = reduce(finishState, { type: "SandboxFinished", issue: n, pr: openedPr }); for (const action of finishActions) { - if (action.type === "EnableAutoMerge") { - console.log(`[${mode}] #${n} → enabling auto-merge`); + if (action.type === "StartReview") { + console.log(`[${mode}] #${n} → running AI review`); + let verdict: import("./reduce.ts").ReviewVerdict; + let reviewOutput: import("./reduce.ts").ReviewOutput | null = null; try { - await issues.enableAutoMerge(action.pr.issue); + const ghIssue = await issues.get(n); + const diff = await issues.getPrDiff(n); + const result = await reviewer.runReview( + { + issueNumber: n, + issueTitle: ghIssue.title, + issueBody: ghIssue.body, + branch: `agent/issue-${n}`, + prNumber: n, + }, + diff, + ); + verdict = result.verdict; + reviewOutput = result.output; } catch (err) { - // A config gap (auto-merge disabled, no required check) must not - // crash the orchestrator — the PR stays open for a human to merge. - console.error(`[${mode}] #${n} could not enable auto-merge:`, err); + // Fail-safe: any reviewer crash → changes-requested, never auto-merges. + console.warn(`[${mode}] #${n} reviewer failed — defaulting to changes-requested:`, err); + verdict = reviewVerdict(null); + } + console.log(`[${mode}] #${n} review verdict: ${verdict}`); + + const reviewActions = reduce(finishState, { + type: "ReviewFinished", + issue: n, + pr: action.pr, + verdict, + }); + for (const reviewAction of reviewActions) { + if (reviewAction.type === "EnableAutoMerge") { + console.log(`[${mode}] #${n} review passed → enabling auto-merge`); + try { + await issues.enableAutoMerge(reviewAction.pr.issue); + } catch (err) { + console.error(`[${mode}] #${n} could not enable auto-merge:`, err); + } + } + if (reviewAction.type === "WaitForHuman") { + console.log(`[${mode}] #${n} review requested changes → parked for human`); + if (reviewOutput) { + try { + await issues.postPrReview(n, reviewOutput); + } catch (err) { + console.error(`[${mode}] #${n} could not post PR review:`, err); + } + } + } } } if (action.type === "WaitForHuman") { diff --git a/.sandcastle/reduce.test.ts b/.sandcastle/reduce.test.ts index 7ffbdd2..3e8f519 100644 --- a/.sandcastle/reduce.test.ts +++ b/.sandcastle/reduce.test.ts @@ -8,7 +8,7 @@ */ import { test } from "node:test"; import assert from "node:assert/strict"; -import { reduce, READY_LABEL, type State, type CiStatus, type Pr } from "./reduce.ts"; +import { reduce, READY_LABEL, reviewVerdict, type State, type CiStatus, type Pr, type ReviewOutput } from "./reduce.ts"; import { parseBlockedBy } from "./issue-source.ts"; import { sweepOrphanedSandboxes, ensureSandboxNetwork, parseConcurrency, withRetry, resetAgentBranch, refreshBase, validateSignature, classifyDelivery, parseSmeeEvent, parseOrchEnv, resolveCredentials, resolveRunMode, resolveDockerHost } from "./main.ts"; import { createHmac } from "node:crypto"; @@ -210,11 +210,11 @@ test("PrMerged unblocks all fully-resolved dependents in one call", () => { // ─── SandboxFinished / EnableAutoMerge / WaitForHuman ─────────────────────── -test("afk mode + SandboxFinished → EnableAutoMerge", () => { +test("afk mode + SandboxFinished → StartReview (not EnableAutoMerge)", () => { const pr: Pr = { issue: 1, ciStatus: "pending", merged: false }; const state = base({ policy: { concurrency: 1, mode: "afk" } }); assert.deepEqual(reduce(state, { type: "SandboxFinished", issue: 1, pr }), [ - { type: "EnableAutoMerge", pr }, + { type: "StartReview", pr }, ]); }); @@ -226,6 +226,65 @@ test("hitl mode + SandboxFinished → WaitForHuman", () => { ]); }); +test("ReviewFinished pass → EnableAutoMerge", () => { + const pr: Pr = { issue: 1, ciStatus: "pending", merged: false }; + const state = base({ policy: { concurrency: 1, mode: "afk" } }); + assert.deepEqual(reduce(state, { type: "ReviewFinished", issue: 1, pr, verdict: "pass" }), [ + { type: "EnableAutoMerge", pr }, + ]); +}); + +test("ReviewFinished changes-requested → WaitForHuman", () => { + const pr: Pr = { issue: 1, ciStatus: "pending", merged: false }; + const state = base({ policy: { concurrency: 1, mode: "afk" } }); + assert.deepEqual( + reduce(state, { type: "ReviewFinished", issue: 1, pr, verdict: "changes-requested" }), + [{ type: "WaitForHuman", pr }], + ); +}); + +test("onTick does not Stop while a PR is awaiting/under review (open, non-conflicting)", () => { + const state = base({ + issues: [], + prs: [{ issue: 1, ciStatus: "pending", merged: false }], + policy: { concurrency: 1, mode: "afk" }, + }); + const actions = reduce(state, { type: "Tick" }); + assert.ok(!actions.some((a) => a.type === "Stop"), "open PR under review keeps loop alive"); +}); + +// ─── reviewVerdict ─────────────────────────────────────────────────────────── + +test("reviewVerdict: valid pass output → 'pass'", () => { + const output: ReviewOutput = { verdict: "pass", summary: "LGTM", comments: [] }; + assert.equal(reviewVerdict(output), "pass"); +}); + +test("reviewVerdict: valid changes-requested output → 'changes-requested'", () => { + const output: ReviewOutput = { + verdict: "changes-requested", + summary: "needs work", + comments: [{ path: "foo.ts", line: 1, body: "fix this" }], + }; + assert.equal(reviewVerdict(output), "changes-requested"); +}); + +test("reviewVerdict: Error → 'changes-requested' (fail-safe)", () => { + assert.equal(reviewVerdict(new Error("extraction failed")), "changes-requested"); +}); + +test("reviewVerdict: null → 'changes-requested' (fail-safe)", () => { + assert.equal(reviewVerdict(null), "changes-requested"); +}); + +test("reviewVerdict: undefined → 'changes-requested' (fail-safe)", () => { + assert.equal(reviewVerdict(undefined), "changes-requested"); +}); + +test("reviewVerdict: malformed object (no verdict field) → 'changes-requested' (fail-safe)", () => { + assert.equal(reviewVerdict({ summary: "hmm" } as unknown as ReviewOutput), "changes-requested"); +}); + test("afk mode + Tick with open PR and nothing else ready → no Stop (waiting for CI)", () => { const state = base({ issues: [{ id: 1, labels: [], blockedBy: [] }], diff --git a/.sandcastle/reduce.ts b/.sandcastle/reduce.ts index 3415d07..ee50baa 100644 --- a/.sandcastle/reduce.ts +++ b/.sandcastle/reduce.ts @@ -24,6 +24,20 @@ export const READY_LABEL = "ready-for-agent"; export type CiStatus = "none" | "pending" | "success" | "failure" | "conflicting"; +export type ReviewVerdict = "pass" | "changes-requested"; + +export interface ReviewComment { + readonly path: string; + readonly line: number; + readonly body: string; +} + +export interface ReviewOutput { + readonly verdict: ReviewVerdict; + readonly summary: string; + readonly comments: ReadonlyArray; +} + export interface Issue { readonly id: number; readonly labels: string[]; @@ -56,10 +70,12 @@ export type Event = | { type: "Tick" } | { type: "PrMerged"; pr: Pr } | { type: "SandboxFinished"; issue: number; pr: Pr } - | { type: "SandboxFailed"; issue: number }; + | { type: "SandboxFailed"; issue: number } + | { type: "ReviewFinished"; issue: number; pr: Pr; verdict: ReviewVerdict }; export type Action = | { type: "StartSandbox"; issueId: number } + | { type: "StartReview"; pr: Pr } | { type: "EnableAutoMerge"; pr: Pr } | { type: "Relabel"; issueId: number; label: string } | { type: "WaitForHuman"; pr: Pr } @@ -73,6 +89,8 @@ export function reduce(state: State, event: Event): Action[] { return onPrMerged(state, event.pr); case "SandboxFinished": return onSandboxFinished(state, event); + case "ReviewFinished": + return onReviewFinished(event); // SandboxFailed lands in a later slice. default: return []; @@ -113,11 +131,30 @@ function onPrMerged(state: State, mergedPr: Pr): Action[] { function onSandboxFinished(state: State, event: { issue: number; pr: Pr }): Action[] { if (state.policy.mode === "afk") { + return [{ type: "StartReview", pr: event.pr }]; + } + return [{ type: "WaitForHuman", pr: event.pr }]; +} + +function onReviewFinished(event: { issue: number; pr: Pr; verdict: ReviewVerdict }): Action[] { + if (event.verdict === "pass") { return [{ type: "EnableAutoMerge", pr: event.pr }]; } return [{ type: "WaitForHuman", pr: event.pr }]; } +/** + * Map a structured review output (or any extraction/validation error) to a + * merge verdict. Fail-safe: any missing or malformed result → "changes-requested" + * so a reviewer crash never silently auto-merges. + */ +export function reviewVerdict(result: ReviewOutput | Error | null | undefined): ReviewVerdict { + if (!result || result instanceof Error) return "changes-requested"; + const v = (result as ReviewOutput).verdict; + if (v === "pass" || v === "changes-requested") return v; + return "changes-requested"; +} + function onTick(state: State): Action[] { const ready = state.issues .filter((issue) => isReady(issue, state)) diff --git a/.sandcastle/review-extraction.md b/.sandcastle/review-extraction.md new file mode 100644 index 0000000..e63dd08 --- /dev/null +++ b/.sandcastle/review-extraction.md @@ -0,0 +1,27 @@ +Based on the review you just completed, emit a structured verdict as a JSON object inside an `` XML tag. + +The JSON must match this schema exactly: + +```json +{ + "verdict": "pass" | "changes-requested", + "summary": "", + "comments": [ + { + "path": "", + "line": , + "body": "" + } + ] +} +``` + +- `verdict`: `"pass"` if all acceptance criteria are met, `"changes-requested"` otherwise. +- `summary`: A concise paragraph explaining the overall verdict. +- `comments`: Empty array `[]` for a pass. For changes-requested, list only comments for lines that appear in the diff (path + line must correspond to an actual changed line). If unsure of the exact line, omit that comment rather than guessing. + +Emit **only** the `` block — no prose before or after it. + + +{"verdict": "...", "summary": "...", "comments": [...]} + diff --git a/.sandcastle/review-prompt.md b/.sandcastle/review-prompt.md new file mode 100644 index 0000000..ce96f14 --- /dev/null +++ b/.sandcastle/review-prompt.md @@ -0,0 +1,25 @@ +You are an AI code reviewer. Your task is to review the changes on the PR branch against the issue's acceptance criteria and the repository's ADR conventions. + +## Issue + +**#{{ISSUE_NUMBER}}: {{ISSUE_TITLE}}** + +{{ISSUE_BODY}} + +## What to review + +The branch `{{BRANCH}}` has been pushed and a PR is open. Your job: + +1. Run `git diff origin/{{BASE_BRANCH}}...{{BRANCH}}` to see the diff. +2. Read any ADR files in `docs/adr/` that are relevant to the changes. +3. Check each acceptance criterion from the issue body against the diff. +4. Form a verdict: **pass** if all acceptance criteria are met and no convention is violated; **changes-requested** otherwise. + +## Rules + +- Read-only: do NOT push, commit, or modify any files. +- Focus on correctness and spec compliance — not style nit-picks. +- A `pass` verdict means the implementation is ready to merge as-is. +- A `changes-requested` verdict must be accompanied by specific, actionable comments. + +When you have completed your review, summarize your findings clearly. You will be asked to emit a structured output afterwards. diff --git a/.sandcastle/reviewer-adapter.ts b/.sandcastle/reviewer-adapter.ts new file mode 100644 index 0000000..a609672 --- /dev/null +++ b/.sandcastle/reviewer-adapter.ts @@ -0,0 +1,215 @@ +/** + * ReviewerAdapter — runs a read-only reviewer sandbox on a PR branch and + * returns a structured verdict. Uses a two-pass produce→extract pattern: + * + * Pass 1 (prompt.md): the agent reads the diff + ADRs + acceptance criteria + * and forms a free-form review. + * Pass 2 (extraction.md): the same session is resumed; the agent's only job + * is to emit a structured block with the verdict JSON. + * + * The reviewer runs in noSandbox() — it reads the branch via `git diff` and + * `gh`, but never pushes or commits. + * + * Fail-safe: any exception (timeout, missing tag, schema failure) is caught and + * returns `null`, which the orchestrator maps to "changes-requested" via + * reviewVerdict() in reduce.ts. + * + * Prior art: mattpocock/sandcastle agent-workflows/review/ pattern (ADR-0020). + */ +import { run, claudeCode, Output, StructuredOutputError } from "@ai-hero/sandcastle"; +import { noSandbox } from "@ai-hero/sandcastle/sandboxes/no-sandbox"; +import { dirname, join } from "node:path"; +import { fileURLToPath } from "node:url"; +import { reviewVerdict, type ReviewOutput, type ReviewVerdict } from "./reduce.ts"; + +const _dir = dirname(fileURLToPath(import.meta.url)); + +/** Standard Schema-compatible validator for the reviewer's structured output. */ +const reviewOutputSchema = { + "~standard": { + validate(value: unknown): { value: ReviewOutput } | { issues: unknown[] } { + if (!value || typeof value !== "object") { + return { issues: [{ message: "expected an object" }] }; + } + const v = value as Record; + if (v.verdict !== "pass" && v.verdict !== "changes-requested") { + return { issues: [{ message: `verdict must be 'pass' or 'changes-requested', got ${JSON.stringify(v.verdict)}` }] }; + } + if (typeof v.summary !== "string") { + return { issues: [{ message: "summary must be a string" }] }; + } + if (!Array.isArray(v.comments)) { + return { issues: [{ message: "comments must be an array" }] }; + } + return { value: value as ReviewOutput }; + }, + version: 1, + }, +} as const; + +export interface ReviewerOptions { + /** Model to use for the reviewer agent (default: claude-sonnet-4-6). */ + readonly model?: string; + /** Host repo root — anchors sandcastle's cwd and git operations. */ + readonly cwd?: string; + /** Base branch the PR was cut from (used in the diff command). */ + readonly baseBranch?: string; +} + +export interface ReviewInput { + readonly issueNumber: number; + readonly issueTitle: string; + readonly issueBody: string; + readonly branch: string; + /** PR number for posting the review. */ + readonly prNumber: number; +} + +/** + * Parse a unified diff and return a Set of "path:line" pairs for all lines + * that appear in diff hunks (both added and context lines). Used by + * filterInlineComments to drop comments whose path/line aren't in the diff. + */ +export function parseDiffLines(diff: string): Set { + const result = new Set(); + let currentFile = ""; + let currentLine = 0; + + for (const line of diff.split("\n")) { + const fileMatch = line.match(/^\+\+\+ b\/(.+)$/); + if (fileMatch) { + currentFile = fileMatch[1]; + continue; + } + const hunkMatch = line.match(/^@@ -\d+(?:,\d+)? \+(\d+)(?:,\d+)? @@/); + if (hunkMatch) { + currentLine = parseInt(hunkMatch[1], 10) - 1; + continue; + } + if (!currentFile) continue; + if (line.startsWith("-")) continue; // removed lines have no new-file line number + if (line.startsWith("+") || line.startsWith(" ")) { + currentLine++; + result.add(`${currentFile}:${currentLine}`); + } + } + + return result; +} + +/** + * Filter inline review comments to only those whose path+line appear in the + * diff hunks. Mirrors the reference's filterInlineComments guard so off-diff + * line references (hallucinated or stale) are never posted to the PR. + */ +export function filterInlineComments( + comments: ReviewOutput["comments"], + diffLines: Set, +): ReviewOutput["comments"] { + return comments.filter((c) => diffLines.has(`${c.path}:${c.line}`)); +} + +export class ReviewerAdapter { + private readonly opts: ReviewerOptions; + + constructor(opts: ReviewerOptions = {}) { + this.opts = opts; + } + + /** + * Run the reviewer on the given PR branch and return the validated structured + * output, or null when extraction/validation fails (caller maps to + * "changes-requested" via reviewVerdict()). + */ + async review(input: ReviewInput): Promise { + const model = this.opts.model ?? "claude-sonnet-4-6"; + const baseBranch = this.opts.baseBranch ?? "main"; + const cwd = this.opts.cwd; + + const promptArgs = { + ISSUE_NUMBER: String(input.issueNumber), + ISSUE_TITLE: input.issueTitle, + ISSUE_BODY: input.issueBody || "(no body)", + BRANCH: input.branch, + BASE_BRANCH: baseBranch, + }; + + // Pass 1: produce — the reviewer reads the diff and forms a free-form review. + let reviewResult; + try { + reviewResult = await run({ + agent: claudeCode(model, { permissionMode: "auto" }), + sandbox: noSandbox(), + cwd, + promptFile: join(_dir, "review-prompt.md"), + promptArgs, + name: `review-issue-${input.issueNumber}`, + maxIterations: 8, + }); + } catch (err) { + console.warn(`[reviewer] pass-1 (produce) failed for #${input.issueNumber}:`, err); + return null; + } + + if (!reviewResult.resume) { + console.warn(`[reviewer] resume not available after pass-1 for #${input.issueNumber} — using pass-1 stdout for extraction`); + } + + // Pass 2: extract — resume the session and ask for the structured output block. + try { + const extractResult = reviewResult.resume + ? await reviewResult.resume( + await loadPromptFile(join(_dir, "review-extraction.md")), + { + output: Output.object({ tag: "output", schema: reviewOutputSchema }), + cwd, + }, + ) + : await run({ + agent: claudeCode(model, { permissionMode: "auto" }), + sandbox: noSandbox(), + cwd, + promptFile: join(_dir, "review-extraction.md"), + output: Output.object({ tag: "output", schema: reviewOutputSchema }), + name: `review-extract-${input.issueNumber}`, + }); + + return (extractResult as typeof extractResult & { output: ReviewOutput }).output; + } catch (err) { + if (err instanceof StructuredOutputError) { + console.warn( + `[reviewer] structured output extraction failed for #${input.issueNumber}: ${err.message}`, + ); + } else { + console.warn(`[reviewer] pass-2 (extract) failed for #${input.issueNumber}:`, err); + } + return null; + } + } + + /** + * Run a full review pass: run the agent, validate output, filter comments to + * diff-only lines, and return the verdict. On any failure, returns + * "changes-requested" (fail-safe). + */ + async runReview(input: ReviewInput, diff: string): Promise<{ verdict: ReviewVerdict; output: ReviewOutput | null }> { + const raw = await this.review(input); + const verdict = reviewVerdict(raw instanceof Error ? raw : raw); + + if (raw && raw.comments.length > 0) { + const diffLines = parseDiffLines(diff); + const filtered: ReviewOutput = { + ...raw, + comments: filterInlineComments(raw.comments, diffLines), + }; + return { verdict, output: filtered }; + } + + return { verdict, output: raw }; + } +} + +async function loadPromptFile(path: string): Promise { + const { readFile } = await import("node:fs/promises"); + return readFile(path, "utf8"); +} diff --git a/docs/adr/0020-review-gate-before-auto-merge.md b/docs/adr/0020-review-gate-before-auto-merge.md new file mode 100644 index 0000000..c282ac6 --- /dev/null +++ b/docs/adr/0020-review-gate-before-auto-merge.md @@ -0,0 +1,26 @@ +# AI review gate before auto-merge in /afk + +In `/afk` mode, after the implementing sandbox opens a PR, a **reviewer sandbox** runs before auto-merge is enabled. The reviewer reads the diff against the issue's acceptance criteria and ADR conventions and emits a structured `pass | changes-requested` verdict. Only a `pass` verdict proceeds to auto-merge; `changes-requested` parks the PR for a human, with the review posted directly to the PR. + +This amends ADR-0007 ("CI is the only safety net"): CI remains required but is no longer the *only* gate — an AI review step now precedes it. + +## Motivation + +CI catches regressions and type errors but cannot verify that the implementation actually satisfies the acceptance criteria stated in the issue. Off-spec-but-CI-green code was silently merging under `/afk`. The reviewer catches that gap without requiring a human in the loop on every issue. + +## Design + +- **Two-pass produce→extract pattern:** Pass 1 (`review-prompt.md`) — the agent reads the diff + ADRs + criteria and forms a free-form review. Pass 2 (`review-extraction.md`) resumes the same session and emits a single `{…}` block with the JSON verdict. Structured output is validated via `Output.object({ tag, schema })` (no hand-rolled regex). +- **Schema:** `{ verdict: "pass" | "changes-requested", summary, comments[] }`. Each comment carries `path`, `line` (new-file line number), and `body`. +- **Validate then filter:** inline comments are filtered through `parseDiffLines` / `filterInlineComments` before being posted, so off-diff line references (hallucinated or stale) are never sent to the API. +- **Reviewer runs in `noSandbox()`** — read-only; never pushes or commits. +- **Fail-safe:** any reviewer timeout, crash, or garbled verdict resolves to `changes-requested` via `reviewVerdict()`, so a reviewer failure never silently auto-merges. +- **Reducer gate:** `onSandboxFinished` in `reduce.ts` returns `StartReview { pr }` (not `EnableAutoMerge`) in afk mode. A new `ReviewFinished { issue, pr, verdict }` event maps `pass → EnableAutoMerge`, `changes-requested → WaitForHuman`. `/hitl` is unchanged — `onSandboxFinished` still returns `WaitForHuman` (the human is the reviewer, ADR-0009). + +## Consequences + +- `/afk` now requires a Claude seat for the reviewer sandbox on every issue, in addition to the implementing sandbox. This is bounded to one issue at a time (serial execution, ADR-0003). +- A reviewer crash or timeout is safe (changes-requested) but delays the issue; a human must re-label and re-run. +- `/hitl` is completely unaffected — the human is still the sole reviewer there (ADR-0009). +- Human-driven `/code-review ultra` remains available as an alternative but cannot run hands-off; it is not integrated into the autonomous loop. +- The downstream merge mechanism (GitHub-native auto-merge on green CI) is unchanged — this only inserts a gate before `EnableAutoMerge` is called.