From 45f412ffd15022fee512be21764bd7e5ebc2db3e Mon Sep 17 00:00:00 2001 From: Luca Giordano Date: Sun, 28 Jun 2026 15:58:14 +0000 Subject: [PATCH] feat: adopt high-level sandcastle lifecycle in sandbox-runner (#72) Switch SandboxRunner from run() to createSandbox + await using + hooks.onSandboxReady + promptFile/promptArgs so the library owns worktree creation and deterministic teardown instead of the adapter. - SandboxRunner.runIssue uses `await using sandbox = await createSandbox()` then `sandbox.run()` with promptFile/promptArgs; scope-bound disposal replaces bespoke try/finally teardown - buildAgentInput extended to accept IssueInput; returns the full config for both createSandbox() and sandbox.run() (agent, imageName, network, copyToWorktree, onSandboxReady, promptFile, promptArgs) - In-sandbox setup (opencode.json relocation) moves to hooks.onSandboxReady - Agent prompts move to prompt-claude.md / prompt-local.md templates with {{ISSUE_NUMBER}}/{{ISSUE_TITLE}}/{{ISSUE_BODY}} substitution via promptArgs - buildPrompt/buildLocalPrompt string builders removed - 6 new unit tests covering promptFile, promptArgs, and network forwarding - ADR-0019 records the two asymmetries that stay imperative (MTU creation, socat host override) and their relation to ADR-0006/0013 and #71 --- .sandcastle/prompt-claude.md | 23 +++ .sandcastle/prompt-local.md | 16 ++ .sandcastle/sandbox-runner.test.ts | 62 ++++++-- .sandcastle/sandbox-runner.ts | 149 ++++++++---------- ...n-layer-high-level-sandcastle-lifecycle.md | 105 ++++++++++++ 5 files changed, 262 insertions(+), 93 deletions(-) create mode 100644 .sandcastle/prompt-claude.md create mode 100644 .sandcastle/prompt-local.md create mode 100644 docs/adr/0019-lean-execution-layer-high-level-sandcastle-lifecycle.md diff --git a/.sandcastle/prompt-claude.md b/.sandcastle/prompt-claude.md new file mode 100644 index 0000000..8aaf319 --- /dev/null +++ b/.sandcastle/prompt-claude.md @@ -0,0 +1,23 @@ +You are implementing GitHub issue #{{ISSUE_NUMBER}}: "{{ISSUE_TITLE}}" + +## Issue +{{ISSUE_BODY}} + +## Scope guardrails +- Only modify files directly required by this issue. +- Do not modify, close, or reference other issues. +- Do not add dependencies beyond what the issue specifies. +- Do not push or open a pull request — the orchestrator does that after you finish. +- Make the changes yourself: use the edit/write tools to modify files and bash + to run tests and commit. Do not just plan or write a todo list, and do not + delegate to a subagent — apply the edits and commit them directly. + +## Steps +1. Implement the change. If the project has a test suite, work test-first: + write a failing test, make it pass, then refactor. +2. Run the test suite and make sure it is green. +3. Commit your work to the current branch with clear, imperative messages + referencing #{{ISSUE_NUMBER}}. +4. When the issue is fully implemented and committed, output exactly this + line, by itself, and then stop — produce no further output or commits: + ISSUE_COMPLETE diff --git a/.sandcastle/prompt-local.md b/.sandcastle/prompt-local.md new file mode 100644 index 0000000..cf431dd --- /dev/null +++ b/.sandcastle/prompt-local.md @@ -0,0 +1,16 @@ +Implement this change in the repository, then commit. Use your tools now — read files, edit files, run bash. Do not just describe a plan. + +TASK (issue #{{ISSUE_NUMBER}}): {{ISSUE_TITLE}} +{{ISSUE_BODY}} + +The code lives in the `.sandcastle/` directory (TypeScript). Steps: +1. Read the relevant `.sandcastle/*.ts` file(s) with your read tool. +2. Make the change with your edit tool. +3. Run the tests: `cd .sandcastle && npm test` — fix until they pass. +4. Commit: `cd .sandcastle && cd .. && git add -A && git commit -m " (#{{ISSUE_NUMBER}})"` +5. Do NOT push or open a PR. + +When committed, output this exact line alone and stop: +ISSUE_COMPLETE + +Begin now by reading the most relevant file. diff --git a/.sandcastle/sandbox-runner.test.ts b/.sandcastle/sandbox-runner.test.ts index 4f74926..6fa64ef 100644 --- a/.sandcastle/sandbox-runner.test.ts +++ b/.sandcastle/sandbox-runner.test.ts @@ -1,6 +1,7 @@ /** * Unit tests for buildAgentInput — the pure function that selects agent - * provider, image, and worktree files based on the runner's tier option. + * provider, image, network, prompt template, and prompt args based on the + * runner's tier option and the issue being worked. * No Docker, GitHub, or network required. * * Run: npm test (picks up all *.test.ts files in the test script) @@ -9,29 +10,31 @@ import { test } from "node:test"; import assert from "node:assert/strict"; import { buildAgentInput } from "./sandbox-runner.ts"; +const STUB_ISSUE = { number: 42, title: "Fix the bug", body: "Detailed description" }; + test("default tier uses claude-code agent with sandcastle:local image", () => { - const input = buildAgentInput({}); + const input = buildAgentInput({}, STUB_ISSUE); assert.equal(input.agent.name, "claude-code"); assert.equal(input.imageName, "sandcastle:local"); assert.equal(input.copyToWorktree, undefined); }); test("explicit claude tier uses claude-code agent", () => { - const input = buildAgentInput({ tier: "claude" }); + const input = buildAgentInput({ tier: "claude" }, STUB_ISSUE); assert.equal(input.agent.name, "claude-code"); assert.equal(input.imageName, "sandcastle:local"); assert.equal(input.copyToWorktree, undefined); }); test("local tier uses opencode agent with sandcastle-opencode:local image", () => { - const input = buildAgentInput({ tier: "local" }); + const input = buildAgentInput({ tier: "local" }, STUB_ISSUE); assert.equal(input.agent.name, "opencode"); assert.equal(input.imageName, "sandcastle-opencode:local"); assert.deepEqual(input.copyToWorktree, ["opencode.json"]); }); test("local tier delivers opencode.json via copyToWorktree", () => { - const input = buildAgentInput({ tier: "local" }); + const input = buildAgentInput({ tier: "local" }, STUB_ISSUE); assert.ok(Array.isArray(input.copyToWorktree), "copyToWorktree should be an array"); assert.ok(input.copyToWorktree!.includes("opencode.json"), "opencode.json must be in copyToWorktree"); }); @@ -40,7 +43,7 @@ test("local tier installs opencode config into the global config dir", () => { // opencode resolves its Ollama provider from ~/.config/opencode, not the // worktree cwd; without this hook the provider never resolves and every // iteration is an empty turn. Lock in that the config is relocated to HOME. - const input = buildAgentInput({ tier: "local" }); + const input = buildAgentInput({ tier: "local" }, STUB_ISSUE); assert.ok(Array.isArray(input.onSandboxReady), "onSandboxReady should be an array"); const cmds = input.onSandboxReady!.map((h) => h.command).join("\n"); assert.match(cmds, /\.config\/opencode/, "must target opencode's global config dir"); @@ -48,21 +51,60 @@ test("local tier installs opencode config into the global config dir", () => { }); test("claude tier has no onSandboxReady hook", () => { - const input = buildAgentInput({ tier: "claude" }); + const input = buildAgentInput({ tier: "claude" }, STUB_ISSUE); assert.equal(input.onSandboxReady, undefined); }); test("claude tier respects custom imageName", () => { - const input = buildAgentInput({ imageName: "my-sandcastle:v2" }); + const input = buildAgentInput({ imageName: "my-sandcastle:v2" }, STUB_ISSUE); assert.equal(input.imageName, "my-sandcastle:v2"); }); test("local tier respects custom localImageName", () => { - const input = buildAgentInput({ tier: "local", localImageName: "my-opencode:latest" }); + const input = buildAgentInput({ tier: "local", localImageName: "my-opencode:latest" }, STUB_ISSUE); assert.equal(input.imageName, "my-opencode:latest"); }); test("local tier defaults to ollama/qwen3-coder:30b model", () => { - const input = buildAgentInput({ tier: "local" }); + const input = buildAgentInput({ tier: "local" }, STUB_ISSUE); assert.equal(input.agent.name, "opencode"); }); + +test("claude tier uses the claude prompt template", () => { + const input = buildAgentInput({ tier: "claude" }, STUB_ISSUE); + assert.ok( + input.promptFile.endsWith("prompt-claude.md"), + `expected prompt-claude.md, got ${input.promptFile}`, + ); +}); + +test("local tier uses the local prompt template", () => { + const input = buildAgentInput({ tier: "local" }, STUB_ISSUE); + assert.ok( + input.promptFile.endsWith("prompt-local.md"), + `expected prompt-local.md, got ${input.promptFile}`, + ); +}); + +test("promptArgs include issue number, title, and body", () => { + const issue = { number: 99, title: "Test Issue", body: "Some body text" }; + const input = buildAgentInput({}, issue); + assert.equal(input.promptArgs["ISSUE_NUMBER"], "99"); + assert.equal(input.promptArgs["ISSUE_TITLE"], "Test Issue"); + assert.equal(input.promptArgs["ISSUE_BODY"], "Some body text"); +}); + +test("promptArgs ISSUE_BODY defaults to (no body) when body is empty", () => { + const input = buildAgentInput({}, { number: 1, title: "T", body: "" }); + assert.equal(input.promptArgs["ISSUE_BODY"], "(no body)"); +}); + +test("network is forwarded from RunnerOptions", () => { + const input = buildAgentInput({ network: "agentic-sandbox-net" }, STUB_ISSUE); + assert.equal(input.network, "agentic-sandbox-net"); +}); + +test("network is undefined when not provided", () => { + const input = buildAgentInput({}, STUB_ISSUE); + assert.equal(input.network, undefined); +}); diff --git a/.sandcastle/sandbox-runner.ts b/.sandcastle/sandbox-runner.ts index bb6e565..1cd6651 100644 --- a/.sandcastle/sandbox-runner.ts +++ b/.sandcastle/sandbox-runner.ts @@ -1,23 +1,34 @@ /** - * SandboxRunner — the sandcastle adapter. Wraps `run()` to work one issue in a - * disposable, git-isolated Docker sandbox: the agent gets its own checkout on - * `agent/issue-`, implements the issue, and commits. A `completionSignal` - * stops it as soon as it is done so it does not loop and produce duplicate + * SandboxRunner — the sandcastle adapter. Uses the high-level `createSandbox` + * lifecycle to work one issue in a disposable, git-isolated Docker sandbox: + * the agent gets its own checkout on `agent/issue-`, implements the issue, + * and commits. Scope-bound disposal (`await using`) ensures the worktree and + * container are torn down even if the agent throws. A `completionSignal` stops + * the agent as soon as it is done so it does not loop and produce duplicate * commits (observed in the spike with no signal). * * Auth: `claudeCode` uses CLAUDE_CODE_OAUTH_TOKEN, which sandcastle resolves * from `.sandcastle/.env` (no Anthropic API key). See ADR-0003. * - * Topology: `run()` is invoked with `cwd` = the path-matched host mount (the - * orchestrator's working directory), so the worktree sandcastle bind-mounts - * into the inner container resolves under docker-outside-of-docker (ADR-0011). + * Topology: `createSandbox` is invoked with `cwd` = the path-matched host + * mount (the orchestrator's working directory), so the worktree sandcastle + * bind-mounts into the inner container resolves under docker-outside-of-docker + * (ADR-0011). * * The agent does NOT push or open the PR — the orchestrator does that with the * devcontainer's existing gh + SSH auth (walking-skeleton choice; keeps tokens * out of inner sandboxes). + * + * Lifecycle idiom (ADR-0019): createSandbox owns worktree creation and + * deterministic teardown; in-sandbox setup is declared via hooks.onSandboxReady; + * agent prompts are .md templates resolved by promptFile + promptArgs. */ -import { run, claudeCode, opencode, type AgentProvider } from "@ai-hero/sandcastle"; +import { createSandbox, claudeCode, opencode, type AgentProvider, type PromptArgs } from "@ai-hero/sandcastle"; import { docker } from "@ai-hero/sandcastle/sandboxes/docker"; +import { dirname, join } from "node:path"; +import { fileURLToPath } from "node:url"; + +const _dir = dirname(fileURLToPath(import.meta.url)); const COMPLETION_SIGNAL = "ISSUE_COMPLETE"; @@ -87,22 +98,42 @@ interface SandboxReadyHook { readonly sudo?: boolean; } -/** Resolved agent-tier inputs: agent provider, docker image, any extra - * worktree files to copy, and any in-sandbox setup commands. */ +/** Resolved agent-tier inputs: agent provider, docker image, network, any + * extra worktree files to copy, any in-sandbox setup commands, and the + * prompt template path + args for this issue. */ export interface AgentInput { readonly agent: AgentProvider; readonly imageName: string; + readonly network?: string; readonly copyToWorktree?: string[]; /** Commands run inside the sandbox after it boots, before the agent runs. */ readonly onSandboxReady?: SandboxReadyHook[]; + /** Absolute path to the .md prompt template (resolved against import.meta.url). */ + readonly promptFile: string; + /** {{KEY}} substitution args for the prompt template. */ + readonly promptArgs: PromptArgs; +} + +export interface IssueInput { + readonly number: number; + readonly title: string; + readonly body: string; } -/** Pure function: resolve RunnerOptions to the per-tier agent config. */ -export function buildAgentInput(opts: RunnerOptions): AgentInput { +/** Pure function: resolve RunnerOptions + IssueInput to the per-tier agent + * config, including the prompt template path and substitution args. */ +export function buildAgentInput(opts: RunnerOptions, issue: IssueInput): AgentInput { + const promptArgs: PromptArgs = { + ISSUE_NUMBER: String(issue.number), + ISSUE_TITLE: issue.title, + ISSUE_BODY: issue.body || "(no body)", + }; + if (opts.tier === "local") { return { agent: opencode(opts.localModel ?? "ollama/qwen3-coder:30b"), imageName: opts.localImageName ?? "sandcastle-opencode:local", + network: opts.network, copyToWorktree: ["opencode.json"], // opencode resolves its Ollama provider from its *global* config // (~/.config/opencode/opencode.json), NOT from a config in the worktree @@ -117,106 +148,58 @@ export function buildAgentInput(opts: RunnerOptions): AgentInput { "cp opencode.json \"$HOME/.config/opencode/opencode.json\"", }, ], + promptFile: join(_dir, "prompt-local.md"), + promptArgs, }; } return { agent: claudeCode(opts.model ?? "claude-sonnet-4-6", { permissionMode: "auto" }), imageName: opts.imageName ?? "sandcastle:local", + network: opts.network, + promptFile: join(_dir, "prompt-claude.md"), + promptArgs, }; } -export interface IssueInput { - readonly number: number; - readonly title: string; - readonly body: string; -} - export class SandboxRunner { constructor(private readonly opts: RunnerOptions = {}) {} async runIssue(issue: IssueInput): Promise { const branch = `agent/issue-${issue.number}`; - const agentInput = buildAgentInput(this.opts); - const result = await run({ - agent: agentInput.agent, + const agentInput = buildAgentInput(this.opts, issue); + + // createSandbox owns worktree creation and teardown (ADR-0019); `await + // using` ensures close() fires even if sandbox.run() throws, replacing the + // old bespoke try/finally lifecycle. + await using sandbox = await createSandbox({ + branch, sandbox: docker({ imageName: agentInput.imageName, containerUid: this.opts.containerUid ?? 1000, containerGid: this.opts.containerGid ?? 1000, - ...(this.opts.network ? { network: this.opts.network } : {}), + ...(agentInput.network ? { network: agentInput.network } : {}), }), - branchStrategy: { type: "branch", branch }, cwd: this.opts.cwd, - name: `issue-${issue.number}`, - maxIterations: this.opts.maxIterations ?? 12, - completionSignal: COMPLETION_SIGNAL, copyToWorktree: agentInput.copyToWorktree, hooks: agentInput.onSandboxReady ? { sandbox: { onSandboxReady: agentInput.onSandboxReady } } : undefined, - prompt: buildPrompt(issue, this.opts.tier), + }); + + const result = await sandbox.run({ + agent: agentInput.agent, + name: `issue-${issue.number}`, + maxIterations: this.opts.maxIterations ?? 12, + completionSignal: COMPLETION_SIGNAL, + promptFile: agentInput.promptFile, + promptArgs: agentInput.promptArgs, }); return { - branch: result.branch, + branch: sandbox.branch, commits: result.commits, completed: result.completionSignal !== undefined, logFilePath: result.logFilePath, }; } } - -/** Per-tier prompt. Local models (opencode) get a short, action-first prompt: - * the verbose claude prompt produces empty turns on smaller models — they need - * a direct "edit the file now, then these exact commands" shape. */ -function buildPrompt(issue: IssueInput, tier?: "claude" | "local"): string { - if (tier === "local") return buildLocalPrompt(issue); - return [ - `You are implementing GitHub issue #${issue.number}: "${issue.title}"`, - "", - "## Issue", - issue.body || "(no body)", - "", - "## Scope guardrails", - "- Only modify files directly required by this issue.", - "- Do not modify, close, or reference other issues.", - "- Do not add dependencies beyond what the issue specifies.", - "- Do not push or open a pull request — the orchestrator does that after you finish.", - "- Make the changes yourself: use the edit/write tools to modify files and bash", - " to run tests and commit. Do not just plan or write a todo list, and do not", - " delegate to a subagent — apply the edits and commit them directly.", - "", - "## Steps", - "1. Implement the change. If the project has a test suite, work test-first:", - " write a failing test, make it pass, then refactor.", - "2. Run the test suite and make sure it is green.", - "3. Commit your work to the current branch with clear, imperative messages", - ` referencing #${issue.number}.`, - "4. When the issue is fully implemented and committed, output exactly this", - " line, by itself, and then stop — produce no further output or commits:", - ` ${COMPLETION_SIGNAL}`, - ].join("\n"); -} - -/** Short, directive prompt for local opencode models. Action-first, concrete - * commands, minimal prose — verbose instructions produce empty turns. */ -function buildLocalPrompt(issue: IssueInput): string { - return [ - `Implement this change in the repository, then commit. Use your tools now — read files, edit files, run bash. Do not just describe a plan.`, - "", - `TASK (issue #${issue.number}): ${issue.title}`, - issue.body || "(no body)", - "", - "The code lives in the `.sandcastle/` directory (TypeScript). Steps:", - "1. Read the relevant `.sandcastle/*.ts` file(s) with your read tool.", - "2. Make the change with your edit tool.", - "3. Run the tests: `cd .sandcastle && npm test` — fix until they pass.", - `4. Commit: \`cd .sandcastle && cd .. && git add -A && git commit -m " (#${issue.number})"\``, - "5. Do NOT push or open a PR.", - "", - `When committed, output this exact line alone and stop:`, - COMPLETION_SIGNAL, - "", - "Begin now by reading the most relevant file.", - ].join("\n"); -} diff --git a/docs/adr/0019-lean-execution-layer-high-level-sandcastle-lifecycle.md b/docs/adr/0019-lean-execution-layer-high-level-sandcastle-lifecycle.md new file mode 100644 index 0000000..2ce76d9 --- /dev/null +++ b/docs/adr/0019-lean-execution-layer-high-level-sandcastle-lifecycle.md @@ -0,0 +1,105 @@ +# Lean execution layer — high-level sandcastle lifecycle + +The original `sandbox-runner.ts` called `run()` directly and hand-rolled its +own lifecycle: building the full options object inline, embedding prompt text +in TypeScript string arrays, and relying on `run()`'s internal teardown. This +was functional but kept bespoke lifecycle logic in the adapter rather than +delegating it to the library. + +`@ai-hero/sandcastle@0.10.0` ships a higher-level API — `createSandbox` + +`await using` + `hooks.onSandboxReady` + `promptFile`/`promptArgs` — that owns +worktree creation, scope-bound disposal, declarative in-sandbox setup, and +prompt template substitution. Adopting it shrinks the adapter to its essential +role: translating `RunnerOptions` + `IssueInput` to a plain config object, then +handing off to the library. + +## Decision + +`SandboxRunner.runIssue` uses the high-level lifecycle: + +```typescript +await using sandbox = await createSandbox({ + branch, + sandbox: docker({ imageName, containerUid, containerGid, network }), + cwd, + copyToWorktree, + hooks: { sandbox: { onSandboxReady: [...] } }, +}); + +const result = await sandbox.run({ + agent, + name, + maxIterations, + completionSignal, + promptFile, + promptArgs, +}); +``` + +**`await using` replaces bespoke teardown.** `createSandbox` returns a `Sandbox` +handle that implements `Symbol.asyncDispose`; the scope-bound disposal calls +`sandbox.close()` when the block exits, normally or via exception. The library +owns worktree cleanup — the adapter no longer needs a `finally` block. + +**`hooks.onSandboxReady` replaces prompt-embedded setup.** The local tier's +`opencode.json` relocation command was previously embedded in prompt text (making +it part of the agent's instructions, not the sandbox lifecycle). It now lives in +`hooks.sandbox.onSandboxReady`, where sandcastle runs it once after boot and +before the agent starts. The agent prompt no longer describes bootstrapping. + +**`promptFile` + `promptArgs` replace inline string builders.** Agent prompts +move to `.md` templates (`prompt-claude.md`, `prompt-local.md`) alongside the +runner source. `{{KEY}}` placeholders are resolved by sandcastle at run time from +`promptArgs`; only the template selection (one file per tier) stays conditional in +`buildAgentInput`. The `buildPrompt`/`buildLocalPrompt` string-builder functions +are removed. + +**`buildAgentInput` is the pure builder seam.** The function is extended to take +both `RunnerOptions` and `IssueInput` and returns the complete config for both +`createSandbox()` and `sandbox.run()` (agent, imageName, network, copyToWorktree, +onSandboxReady, promptFile, promptArgs). It remains pure and exported so +`sandbox-runner.test.ts` can assert on all config fields without spawning Docker. + +## Two asymmetries that stay imperative + +### MTU network — declarative attachment, imperative creation + +`docker({ network: SANDBOX_NETWORK })` attaches the inner container to the +MTU-1400 network declaratively (ADR-0013). `ensureSandboxNetwork` (in `main.ts`) +creates it imperatively with `--opt com.docker.network.driver.mtu=1400` before +the first sandbox starts. This split is intentional: `docker()` has no option +to create a custom-MTU network, only to attach to an existing one. Do not move +network creation into `docker()`. + +### Socat proxy — imperative host override + +`resolveDockerHost` sets `process.env.DOCKER_HOST` to the direct socket path +(`unix:///var/run/docker-host.sock`) when the DooD socat proxy is detected (#71). +This bypasses socat because socat tears down `docker exec`'s hijacked bidirectional +stream after the first burst, causing every sandbox iteration to be an empty +"started → stopped" turn with zero commits. `docker()` has no `host` or `socket` +option, so this cannot be expressed declaratively. `resolveDockerHost` and its +`process.env` mutation in `main.ts` are unchanged. Do not "tidy" this into the +docker options — the bug returns immediately. + +## Consequences + +- `SandboxRunner` is thinner: worktree creation, teardown, in-sandbox setup, and + prompt rendering are all delegated to the library. +- `/afk` and `/hitl` behavior is unchanged — they still get one PR per ready + issue with auto-merge on green CI. +- The pure builder seam (`buildAgentInput`) remains testable without Docker; the + test suite covers image selection, network forwarding, copyToWorktree, hook + commands, template paths, and promptArgs in isolation. +- The live `createSandbox`/`await using` round-trip is covered by the existing + gated `integration.test.ts` (`SANDCASTLE_INTEGRATION=1`); the CI `sandcastle` + job is unaffected. + +## Relations + +- ADR-0006: `/afk` still wraps sandcastle via exec — `createSandbox` execs under + the hood just as `run()` did. +- ADR-0013: MTU network creation stays in `ensureSandboxNetwork`; `docker(network)` + only attaches. The split is recorded here. +- #71 (socat fix): `resolveDockerHost` stays imperative. See the asymmetry note + above.