Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 37 additions & 9 deletions .sandcastle/main.ts
Original file line number Diff line number Diff line change
Expand Up @@ -620,6 +620,9 @@ async function main(): Promise<void> {
});

const inFlight: number[] = [];
// Per-issue count of SandboxFailed events dispatched this run. Persists
// across ticks so the retry cap is enforced within a single orchestrator run.
const failedAttempts: Record<number, number> = {};
// Issue ids whose PrMerged event has been dispatched to the reducer.
// Synchronous check + add before the first await ensures concurrent smee
// and reconciliation paths never double-dispatch the same merge.
Expand Down Expand Up @@ -760,15 +763,48 @@ async function main(): Promise<void> {

void (async () => {
let openedPr: Pr | null = null;
let sandboxFailed = false;
try {
openedPr = await processIssue(n, issues, runner, base, repoRoot, mode);
if (openedPr === null) {
sandboxFailed = true;
console.log(`[${mode}] #${n} sandbox produced no commits`);
}
} catch (err) {
console.error(`[${mode}] #${n} failed:`, err);
// Leave the issue claimed (label removed) for a human to inspect.
sandboxFailed = true;
} finally {
inFlight.splice(inFlight.indexOf(n), 1);
}

if (sandboxFailed) {
const attempts = failedAttempts[n] ?? 0;
const failState: State = { issues: ready, prs: allPrs, inFlight, policy, failedAttempts };
const failActions = reduce(failState, { type: "SandboxFailed", issue: n });
failedAttempts[n] = attempts + 1;
for (const action of failActions) {
if (action.type === "Relabel") {
console.log(
`[${mode}] #${n} failed — re-queuing for retry ` +
`(${attempts + 1}/${policy.maxRetries ?? 2})`,
);
try {
await withRetry(() => issues.addLabel(n, action.label), { label: `addLabel #${n}` });
} catch (err) {
console.error(`[${mode}] #${n} could not re-queue:`, err);
}
}
if (action.type === "Comment") {
console.log(`[${mode}] #${n} retries exhausted after ${attempts + 1} failures`);
try {
await issues.comment(n, action.body);
} catch (err) {
console.error(`[${mode}] #${n} could not post exhaustion comment:`, err);
}
}
}
}

if (openedPr) {
const finishState: State = {
issues: ready,
Expand Down Expand Up @@ -877,14 +913,6 @@ async function processIssue(
);

if (outcome.commits.length === 0) {
// Leave the issue claimed (label stays off) so the poll loop does not
// re-pick a persistently-failing issue every tick. A human re-labels it
// after inspecting. (Automated backoff/retry is later work, not slice 1.)
await issues.comment(
n,
"AFK agent produced no commits — leaving this issue unlabelled for a " +
"human to inspect and re-label `ready-for-agent` if appropriate.",
);
return null;
}

Expand Down
60 changes: 60 additions & 0 deletions .sandcastle/reduce.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1010,3 +1010,63 @@ test("resolveRunMode: any truthy value for AGENTIC_IN_CONTAINER → detached", (
assert.equal(resolveRunMode({ AGENTIC_IN_CONTAINER: "true" }), "detached");
assert.equal(resolveRunMode({ AGENTIC_IN_CONTAINER: "yes" }), "detached");
});

// ─── SandboxFailed retry gate ─────────────────────────────────────────────────

test("SandboxFailed with attempts < maxRetries → Relabel(ready-for-agent)", () => {
const state = base({
failedAttempts: { 1: 0 },
policy: { concurrency: 1, mode: "afk", maxRetries: 2 },
});
assert.deepEqual(reduce(state, { type: "SandboxFailed", issue: 1 }), [
{ type: "Relabel", issueId: 1, label: READY_LABEL },
]);
});

test("SandboxFailed one attempt before cap also re-queues", () => {
const state = base({
failedAttempts: { 1: 1 },
policy: { concurrency: 1, mode: "afk", maxRetries: 2 },
});
assert.deepEqual(reduce(state, { type: "SandboxFailed", issue: 1 }), [
{ type: "Relabel", issueId: 1, label: READY_LABEL },
]);
});

test("SandboxFailed at the cap → Comment, no Relabel (no infinite loop)", () => {
const state = base({
failedAttempts: { 1: 2 },
policy: { concurrency: 1, mode: "afk", maxRetries: 2 },
});
const actions = reduce(state, { type: "SandboxFailed", issue: 1 });
assert.ok(!actions.some((a) => a.type === "Relabel"), "must not re-queue at the cap — no infinite loop");
assert.ok(actions.some((a) => a.type === "Comment"), "must emit exhaustion comment");
});

test("SandboxFailed with absent failedAttempts defaults to 0 → Relabel", () => {
// Existing tests pass base() without failedAttempts — must still work
const state = base();
const actions = reduce(state, { type: "SandboxFailed", issue: 99 });
assert.deepEqual(actions, [{ type: "Relabel", issueId: 99, label: READY_LABEL }]);
});

test("after SandboxFailed Relabel, re-labelled issue is picked up by next Tick", () => {
// Simulate: SandboxFailed emitted Relabel, orchestrator applied it; next Tick starts the issue
const stateAfterRelabel = base({
issues: [{ id: 1, labels: [READY_LABEL], blockedBy: [] }],
failedAttempts: { 1: 1 },
});
assert.deepEqual(reduce(stateAfterRelabel, { type: "Tick" }), [
{ type: "StartSandbox", issueId: 1 },
]);
});

test("SandboxFailed applies in hitl mode too (afk/hitl-agnostic)", () => {
const state = base({
failedAttempts: { 5: 0 },
policy: { concurrency: 1, mode: "hitl", maxRetries: 2 },
});
assert.deepEqual(reduce(state, { type: "SandboxFailed", issue: 5 }), [
{ type: "Relabel", issueId: 5, label: READY_LABEL },
]);
});
26 changes: 23 additions & 3 deletions .sandcastle/reduce.ts
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ export type Mode = "afk" | "hitl";
export interface Policy {
readonly concurrency: number;
readonly mode: Mode;
/** Maximum retries per issue within a single orchestrator run (default 2). */
readonly maxRetries?: number;
}

export interface State {
Expand All @@ -64,6 +66,8 @@ export interface State {
readonly policy: Policy;
/** Issue ids whose sandbox is currently running. */
readonly inFlight: number[];
/** Per-issue count of SandboxFailed events dispatched this run (default {}). */
readonly failedAttempts?: Record<number, number>;
}

export type Event =
Expand All @@ -79,6 +83,7 @@ export type Action =
| { type: "EnableAutoMerge"; pr: Pr }
| { type: "Relabel"; issueId: number; label: string }
| { type: "WaitForHuman"; pr: Pr }
| { type: "Comment"; issueId: number; body: string }
| { type: "Stop" };

export function reduce(state: State, event: Event): Action[] {
Expand All @@ -91,9 +96,8 @@ export function reduce(state: State, event: Event): Action[] {
return onSandboxFinished(state, event);
case "ReviewFinished":
return onReviewFinished(event);
// SandboxFailed lands in a later slice.
default:
return [];
case "SandboxFailed":
return onSandboxFailed(state, event);
}
}

Expand Down Expand Up @@ -143,6 +147,22 @@ function onReviewFinished(event: { issue: number; pr: Pr; verdict: ReviewVerdict
return [{ type: "WaitForHuman", pr: event.pr }];
}

function onSandboxFailed(state: State, event: { issue: number }): Action[] {
const maxRetries = state.policy.maxRetries ?? 2;
const attempts = (state.failedAttempts ?? {})[event.issue] ?? 0;
if (attempts < maxRetries) {
return [{ type: "Relabel", issueId: event.issue, label: READY_LABEL }];
}
return [{
type: "Comment",
issueId: event.issue,
body:
`AFK agent failed ${attempts + 1} times (retry limit: ${maxRetries}) — ` +
`leaving this issue unlabelled for a human to inspect and re-label ` +
`\`ready-for-agent\` if appropriate.`,
}];
}

/**
* Map a structured review output (or any extraction/validation error) to a
* merge verdict. Fail-safe: any missing or malformed result → "changes-requested"
Expand Down
12 changes: 12 additions & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,18 @@ fallback). This is what cockpit mode depends on.
- After each close, dependents are checked and labelled if now unblocked.
- GitHub issues are the durable state — nothing is stored locally.

### Sandbox retry semantics (#76)

When a sandbox fails (throws or produces no commits) the orchestrator automatically
re-labels the issue `ready-for-agent` so the next tick picks it up again, up to
`Policy.maxRetries` times per run (default **2**). After the cap the issue is left
unlabelled and a comment explains the exhaustion — never an infinite retry loop.

**Known limitation:** attempt counts live in the orchestrator's in-run state and
reset when the orchestrator restarts. A persistently-failing issue gets another
`maxRetries` attempts on each fresh orchestrator run. In-run bounding is sufficient
for v1; cross-run persistence is not implemented.

### Implementation approach

Each implementation sub-agent uses `/tdd` (red → green → refactor) per acceptance criterion,
Expand Down
Loading