diff --git a/.sandcastle/main.ts b/.sandcastle/main.ts index 67acf3a..f41a0b6 100644 --- a/.sandcastle/main.ts +++ b/.sandcastle/main.ts @@ -620,6 +620,9 @@ async function main(): Promise { }); const inFlight: number[] = []; + // Per-issue count of SandboxFailed events dispatched this run. Persists + // across ticks so the retry cap is enforced within a single orchestrator run. + const failedAttempts: Record = {}; // Issue ids whose PrMerged event has been dispatched to the reducer. // Synchronous check + add before the first await ensures concurrent smee // and reconciliation paths never double-dispatch the same merge. @@ -760,15 +763,48 @@ async function main(): Promise { void (async () => { let openedPr: Pr | null = null; + let sandboxFailed = false; try { openedPr = await processIssue(n, issues, runner, base, repoRoot, mode); + if (openedPr === null) { + sandboxFailed = true; + console.log(`[${mode}] #${n} sandbox produced no commits`); + } } catch (err) { console.error(`[${mode}] #${n} failed:`, err); - // Leave the issue claimed (label removed) for a human to inspect. + sandboxFailed = true; } finally { inFlight.splice(inFlight.indexOf(n), 1); } + if (sandboxFailed) { + const attempts = failedAttempts[n] ?? 0; + const failState: State = { issues: ready, prs: allPrs, inFlight, policy, failedAttempts }; + const failActions = reduce(failState, { type: "SandboxFailed", issue: n }); + failedAttempts[n] = attempts + 1; + for (const action of failActions) { + if (action.type === "Relabel") { + console.log( + `[${mode}] #${n} failed — re-queuing for retry ` + + `(${attempts + 1}/${policy.maxRetries ?? 2})`, + ); + try { + await withRetry(() => issues.addLabel(n, action.label), { label: `addLabel #${n}` }); + } catch (err) { + console.error(`[${mode}] #${n} could not re-queue:`, err); + } + } + if (action.type === "Comment") { + console.log(`[${mode}] #${n} retries exhausted after ${attempts + 1} failures`); + try { + await issues.comment(n, action.body); + } catch (err) { + console.error(`[${mode}] #${n} could not post exhaustion comment:`, err); + } + } + } + } + if (openedPr) { const finishState: State = { issues: ready, @@ -877,14 +913,6 @@ async function processIssue( ); if (outcome.commits.length === 0) { - // Leave the issue claimed (label stays off) so the poll loop does not - // re-pick a persistently-failing issue every tick. A human re-labels it - // after inspecting. (Automated backoff/retry is later work, not slice 1.) - await issues.comment( - n, - "AFK agent produced no commits — leaving this issue unlabelled for a " + - "human to inspect and re-label `ready-for-agent` if appropriate.", - ); return null; } diff --git a/.sandcastle/reduce.test.ts b/.sandcastle/reduce.test.ts index 3e8f519..eaf939b 100644 --- a/.sandcastle/reduce.test.ts +++ b/.sandcastle/reduce.test.ts @@ -1010,3 +1010,63 @@ test("resolveRunMode: any truthy value for AGENTIC_IN_CONTAINER → detached", ( assert.equal(resolveRunMode({ AGENTIC_IN_CONTAINER: "true" }), "detached"); assert.equal(resolveRunMode({ AGENTIC_IN_CONTAINER: "yes" }), "detached"); }); + +// ─── SandboxFailed retry gate ───────────────────────────────────────────────── + +test("SandboxFailed with attempts < maxRetries → Relabel(ready-for-agent)", () => { + const state = base({ + failedAttempts: { 1: 0 }, + policy: { concurrency: 1, mode: "afk", maxRetries: 2 }, + }); + assert.deepEqual(reduce(state, { type: "SandboxFailed", issue: 1 }), [ + { type: "Relabel", issueId: 1, label: READY_LABEL }, + ]); +}); + +test("SandboxFailed one attempt before cap also re-queues", () => { + const state = base({ + failedAttempts: { 1: 1 }, + policy: { concurrency: 1, mode: "afk", maxRetries: 2 }, + }); + assert.deepEqual(reduce(state, { type: "SandboxFailed", issue: 1 }), [ + { type: "Relabel", issueId: 1, label: READY_LABEL }, + ]); +}); + +test("SandboxFailed at the cap → Comment, no Relabel (no infinite loop)", () => { + const state = base({ + failedAttempts: { 1: 2 }, + policy: { concurrency: 1, mode: "afk", maxRetries: 2 }, + }); + const actions = reduce(state, { type: "SandboxFailed", issue: 1 }); + assert.ok(!actions.some((a) => a.type === "Relabel"), "must not re-queue at the cap — no infinite loop"); + assert.ok(actions.some((a) => a.type === "Comment"), "must emit exhaustion comment"); +}); + +test("SandboxFailed with absent failedAttempts defaults to 0 → Relabel", () => { + // Existing tests pass base() without failedAttempts — must still work + const state = base(); + const actions = reduce(state, { type: "SandboxFailed", issue: 99 }); + assert.deepEqual(actions, [{ type: "Relabel", issueId: 99, label: READY_LABEL }]); +}); + +test("after SandboxFailed Relabel, re-labelled issue is picked up by next Tick", () => { + // Simulate: SandboxFailed emitted Relabel, orchestrator applied it; next Tick starts the issue + const stateAfterRelabel = base({ + issues: [{ id: 1, labels: [READY_LABEL], blockedBy: [] }], + failedAttempts: { 1: 1 }, + }); + assert.deepEqual(reduce(stateAfterRelabel, { type: "Tick" }), [ + { type: "StartSandbox", issueId: 1 }, + ]); +}); + +test("SandboxFailed applies in hitl mode too (afk/hitl-agnostic)", () => { + const state = base({ + failedAttempts: { 5: 0 }, + policy: { concurrency: 1, mode: "hitl", maxRetries: 2 }, + }); + assert.deepEqual(reduce(state, { type: "SandboxFailed", issue: 5 }), [ + { type: "Relabel", issueId: 5, label: READY_LABEL }, + ]); +}); diff --git a/.sandcastle/reduce.ts b/.sandcastle/reduce.ts index ee50baa..805562d 100644 --- a/.sandcastle/reduce.ts +++ b/.sandcastle/reduce.ts @@ -56,6 +56,8 @@ export type Mode = "afk" | "hitl"; export interface Policy { readonly concurrency: number; readonly mode: Mode; + /** Maximum retries per issue within a single orchestrator run (default 2). */ + readonly maxRetries?: number; } export interface State { @@ -64,6 +66,8 @@ export interface State { readonly policy: Policy; /** Issue ids whose sandbox is currently running. */ readonly inFlight: number[]; + /** Per-issue count of SandboxFailed events dispatched this run (default {}). */ + readonly failedAttempts?: Record; } export type Event = @@ -79,6 +83,7 @@ export type Action = | { type: "EnableAutoMerge"; pr: Pr } | { type: "Relabel"; issueId: number; label: string } | { type: "WaitForHuman"; pr: Pr } + | { type: "Comment"; issueId: number; body: string } | { type: "Stop" }; export function reduce(state: State, event: Event): Action[] { @@ -91,9 +96,8 @@ export function reduce(state: State, event: Event): Action[] { return onSandboxFinished(state, event); case "ReviewFinished": return onReviewFinished(event); - // SandboxFailed lands in a later slice. - default: - return []; + case "SandboxFailed": + return onSandboxFailed(state, event); } } @@ -143,6 +147,22 @@ function onReviewFinished(event: { issue: number; pr: Pr; verdict: ReviewVerdict return [{ type: "WaitForHuman", pr: event.pr }]; } +function onSandboxFailed(state: State, event: { issue: number }): Action[] { + const maxRetries = state.policy.maxRetries ?? 2; + const attempts = (state.failedAttempts ?? {})[event.issue] ?? 0; + if (attempts < maxRetries) { + return [{ type: "Relabel", issueId: event.issue, label: READY_LABEL }]; + } + return [{ + type: "Comment", + issueId: event.issue, + body: + `AFK agent failed ${attempts + 1} times (retry limit: ${maxRetries}) — ` + + `leaving this issue unlabelled for a human to inspect and re-label ` + + `\`ready-for-agent\` if appropriate.`, + }]; +} + /** * Map a structured review output (or any extraction/validation error) to a * merge verdict. Fail-safe: any missing or malformed result → "changes-requested" diff --git a/CLAUDE.md b/CLAUDE.md index e857472..f78063f 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -103,6 +103,18 @@ fallback). This is what cockpit mode depends on. - After each close, dependents are checked and labelled if now unblocked. - GitHub issues are the durable state — nothing is stored locally. +### Sandbox retry semantics (#76) + +When a sandbox fails (throws or produces no commits) the orchestrator automatically +re-labels the issue `ready-for-agent` so the next tick picks it up again, up to +`Policy.maxRetries` times per run (default **2**). After the cap the issue is left +unlabelled and a comment explains the exhaustion — never an infinite retry loop. + +**Known limitation:** attempt counts live in the orchestrator's in-run state and +reset when the orchestrator restarts. A persistently-failing issue gets another +`maxRetries` attempts on each fresh orchestrator run. In-run bounding is sufficient +for v1; cross-run persistence is not implemented. + ### Implementation approach Each implementation sub-agent uses `/tdd` (red → green → refactor) per acceptance criterion,