From 3b335ea197ea3e5675dd2c90c3ad8d6613eb2039 Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Tue, 23 Jun 2026 17:02:01 -0600 Subject: [PATCH 1/5] docs(examples): scientifically-rigorous coding benchmark across harnesses with controlled tool use MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add examples/coding-benchmark/ — runs one coding task across claude-code/opencode/codex/cli baseline profiles × scenarios via runProfileMatrix, with a one-line tool-surface knob, validators-before-judge scoring, a 1-or-3 (ensemble) judge layer, and real paired-bootstrap + Wilson + BH stats. The no-cheat firewall (agent context = scenario.prompt only) is enforced and pinpointed in dispatch.ts. Ships an in-process SandboxClient so the whole pipeline compiles and runs offline with no creds, and runs faithfully live with --live + TANGLE_API_KEY. --- examples/README.md | 3 + examples/coding-benchmark/README.md | 111 +++++++++++++++ examples/coding-benchmark/benchmark.ts | 159 +++++++++++++++++++++ examples/coding-benchmark/dispatch.ts | 167 +++++++++++++++++++++++ examples/coding-benchmark/judges.ts | 147 ++++++++++++++++++++ examples/coding-benchmark/offline-box.ts | 101 ++++++++++++++ examples/coding-benchmark/profiles.ts | 74 ++++++++++ examples/coding-benchmark/scenarios.ts | 124 +++++++++++++++++ examples/coding-benchmark/stats.ts | 146 ++++++++++++++++++++ examples/coding-benchmark/tools.ts | 55 ++++++++ examples/coding-benchmark/validators.ts | 104 ++++++++++++++ 11 files changed, 1191 insertions(+) create mode 100644 examples/coding-benchmark/README.md create mode 100644 examples/coding-benchmark/benchmark.ts create mode 100644 examples/coding-benchmark/dispatch.ts create mode 100644 examples/coding-benchmark/judges.ts create mode 100644 examples/coding-benchmark/offline-box.ts create mode 100644 examples/coding-benchmark/profiles.ts create mode 100644 examples/coding-benchmark/scenarios.ts create mode 100644 
examples/coding-benchmark/stats.ts create mode 100644 examples/coding-benchmark/tools.ts create mode 100644 examples/coding-benchmark/validators.ts diff --git a/examples/README.md b/examples/README.md index ca9a8bf..8e457e8 100644 --- a/examples/README.md +++ b/examples/README.md @@ -36,6 +36,7 @@ Era tags: **production runtime** (`runAgentTask` / `handleChatTurn` — what eve |---|---|---|---| | 9c | [`strategy-evolution/`](./strategy-evolution/) | loops suite | `runStrategyEvolution` + `promotionGate` — the policy-search journey: author candidate strategies from losses, advance a champion, promote on a fresh holdout slice (needs `TANGLE_API_KEY`) | | 9d | [`product-eval/`](./product-eval/) | loops suite | `evalPersona` — user-sim product evals in one call: scripted + LLM-adversarial personas, plus the `runPersonaDispatch` → `runProfileMatrix` scored path (needs `TANGLE_API_KEY`; offline-testable via a `backendFor` override) | +| 9e | [`coding-benchmark/`](./coding-benchmark/) | loops suite | `runProfileMatrix` over harness × baseline-profile × scenario — one coding task across claude-code / opencode / codex / cli, with a one-line tool knob, validators-before-judge, a no-cheat firewall, and real paired-bootstrap + Wilson + BH stats (offline by default; `--live` for real harness boxes) | ## The supervisor core, deeper — an agent drives N agents @@ -101,6 +102,8 @@ pnpm dlx tsx examples/ui-audit/ui-audit.ts /tmp/ui-audit-demo https://example.co # The loops suite, deeper — search + evals TANGLE_API_KEY=... pnpm tsx examples/strategy-evolution/strategy-evolution.ts # policy search → holdout gate TANGLE_API_KEY=... 
pnpm tsx examples/product-eval/product-eval.ts # user-sim product evals (evalPersona) +pnpm tsx examples/coding-benchmark/benchmark.ts # harness × profile × scenario matrix (offline) +pnpm tsx examples/coding-benchmark/benchmark.ts --ensemble --reps 5 # 3-model judge + more reps # Self-improvement + observability pnpm tsx examples/self-improving-loop/self-improving-loop.ts diff --git a/examples/coding-benchmark/README.md b/examples/coding-benchmark/README.md new file mode 100644 index 0000000..189c4fa --- /dev/null +++ b/examples/coding-benchmark/README.md @@ -0,0 +1,111 @@ +# coding-benchmark + +**Run the same coding task across coding agents — fairly, honestly, with real statistics — in ~8 files of pure composition.** Every moving part is an `agent-runtime` or `agent-eval` primitive. Zero bespoke harness code. + +```bash +# offline — no creds, no network. Runs the whole pipeline against an in-process box. +pnpm tsx examples/coding-benchmark/benchmark.ts + +# swap a tool surface, add the 3-model judge, run more reps +pnpm tsx examples/coding-benchmark/benchmark.ts --tools web --ensemble --reps 5 + +# live — real harness boxes (the judge stays the stub until you point complete / scoreOne at your router) +TANGLE_API_KEY=sk-... SANDBOX_BASE_URL=https://... 
\ + pnpm tsx examples/coding-benchmark/benchmark.ts --live +``` + +## What it measures + +One coding task, run across a **matrix** of three axes, scored, and compared with real stats: + +| Axis | What varies | Where | +|---|---|---| +| **harness** | claude-code / opencode / codex / cli, each on its **baseline default profile** (no skills, no injected prompt — we measure the harness, not our scaffolding) | `profiles.ts` | +| **scenario** | the held-out coding tasks (a token-bucket rate limiter, an RFC-4180 CSV parser) | `scenarios.ts` | +| **tool surface** | `none` / `web` / `search-mcp` — folded in as a one-line knob | `tools.ts` | + +The agent gets up to **3 refine rounds** in **one persistent box**: round N+1's prompt is built from round N's *check failures* (and nothing else — see the firewall). It stops the moment the deterministic checks pass. + +The output is a leaderboard with confidence bands and a significance matrix: + +``` +Harness leaderboard (mean composite, 95% CI; pass-rate, Wilson CI): + claude-code-baseline composite 0.813 [0.813, 0.813] pass 100% [34%, 100%] (n=2) + ... +Pairwise (paired bootstrap on matched scenarios, BH-corrected): + opencode − claude-code: Δ=0.000 [0.000, 0.000] n.s. +``` + +> Offline, every harness runs the **same** scripted solution through the **same** stub judge, so all deltas are 0.000 — that's the honest no-variance result, not a bug. The plumbing (matrix, validators, judge, stats, firewall) all runs for real; only the model is stubbed. Add `--live` for real models and the harnesses separate. + +## How a tool swap works (one line) + +A tool surface is a **preset**, not forked code. 
Each preset authors the same two fields onto the profile — native web tools on/off and an optional mounted MCP — and the sandbox substrate materializes them into each harness's real config: + +```ts +withTools(profile, 'web') // native websearch + webfetch on +withTools(profile, 'search-mcp') // mount a search MCP instead +withTools(profile, 'none') // baseline: no web, no MCP +``` + +On the CLI it's `--tools none|web|search-mcp`. **Honesty caveat:** a preset only takes effect for a `(harness, lever)` pair the sandbox actually materializes — if a harness has no native `webfetch`, `--tools web` is a no-op *there*. That's a substrate fact, not something this example papers over. + +## How it stays honest (the no-cheat firewall) + +**The agent's context is the task prompt — and nothing else.** The grading criteria never reach the box. + +- A `CodingScenario` (`scenarios.ts`) splits into `prompt` (the **only** field the agent sees) and eval-only fields: the validator commands, the realness signals, the rubric note. Because they're different fields on one object, "the agent reads the answer key" becomes something you can **see in one place** — it would require the dispatch to write a non-`prompt` field into the box. +- **It does not.** The firewall is one labeled block in **`dispatch.ts`** (`THE NO-CHEAT FIREWALL LIVES HERE`): the only thing that reaches the box is `scenario.prompt`, plus next-round prompts built **only** from validator pass/fail + stderr. The rubric, the realness score, and the judge are read *after* the loop, never written in. +- The realness anchor runs **after** the loop and is written **write-only** to the record (`ctx.artifacts`) — the agent can't steer toward a metric it can't read. + +## How it scores (validators before judge) + +Scoring runs in strict order, cheapest and most objective first: + +1. **Deterministic validators (run first, in the box, ~$0).** `typecheck` → `test` → `lint` as shell commands; pass/fail from the exit code. 
These steer the refine loop. (`validators.ts` · `runBoxChecks`) +2. **Realness anchor (write-only).** `scoreAuthenticity` + `gateRealness` — catches a stub that compiles but fakes the hard part. On the sample tasks it scores a real impl **85** and a `return true` stub **35 (gated)**. (`validators.ts` · `realnessValidator`) +3. **LLM judge (last, only on the band the checks can't resolve).** A 4-dimension weighted rubric — correctness 0.40 · completeness 0.25 · code_quality 0.20 · robustness 0.15. The rubric text + anchors live **with the judge**, never in the workdir. (`judges.ts`) + +**How many judges:** +- **Default: 1** — `singleCodeJudge`, one model. Cheap, for the leaderboard sweep. +- **`--ensemble`: 3** — `ensembleCodeJudge`, three **cross-family** models (deepseek + openai + google). `crossFamily: true` rejects a same-family panel at construction, so the ensemble is genuinely independent. Use it only for a ship/no-ship claim. + +**Validators per cell:** 3 deterministic checks + 1 realness anchor = **4**, all before the judge. 
+ +## How the stats are real (`stats.ts`) + +Every number is one `agent-eval` primitive call — no hand-rolled statistics: + +- per-harness **mean composite + bootstrap CI** (`confidenceInterval`) +- per-harness **pass-rate + Wilson binomial CI** (`wilson`) — the correct interval for a proportion +- every harness **pair** compared on **matched scenarios** with a **paired bootstrap** (`pairedBootstrap`), then **BH-corrected** across all pairs (`benjaminiHochberg`) so running many comparisons doesn't manufacture a false winner + +## The files + +| File | What it owns | +|---|---| +| `scenarios.ts` | the held-out task corpus + the firewall-as-a-type (`prompt` vs eval-only fields) | +| `profiles.ts` | the harness axis — one bare baseline `AgentProfile` per harness | +| `tools.ts` | the one-line tool knob (`withTools` + presets) | +| `validators.ts` | deterministic checks (`runBoxChecks`) + the realness anchor (`realnessValidator`) | +| `judges.ts` | the rubric + `singleCodeJudge` (1) and `ensembleCodeJudge` (3) | +| `dispatch.ts` | renders one matrix cell: persistent box + multi-round refine. 
**The firewall lives here.** | +| `offline-box.ts` | an in-process `SandboxClient` so the whole thing runs with no creds | +| `benchmark.ts` | the entrypoint: build the axes, hand the matrix the dispatch + judges, run, print stats | +| `stats.ts` | pairs harnesses → `pairedBootstrap` / `benjaminiHochberg` / `confidenceInterval` / `wilson` | + +## Primitives composed + +- **matrix:** `runProfileMatrix({ profiles, scenarios, dispatch, judges, reps, integrity, costCeiling })` (`@tangle-network/agent-eval/campaign`) with a `ProfileDispatchFn` rendering each cell +- **box + multi-round:** `openSandboxRun(client, opts, deliverable)` → `.start()` / `.resume()` over one persistent, resumable session (`@tangle-network/agent-runtime/loops`) +- **deterministic layer:** the runtime `Validator` seam, run before the judge +- **realness:** `scoreAuthenticity` + `gateRealness` (`@tangle-network/agent-eval/authenticity`) +- **judges:** a hand-built `JudgeConfig`, and `ensembleJudge` + `aggregateJudgeVerdicts` for the panel +- **integrity:** `integrity: 'assert'` on the matrix proves a real backend ran (no stubbed cell) — `'off'` only for the offline stub +- **stats:** `pairedBootstrap`, `benjaminiHochberg`, `confidenceInterval`, `wilson` + +## Going live + +Swap `offlineSandboxClient(...)` for a real `@tangle-network/sandbox` client (the `--live` path in `benchmark.ts`) and point the judge's `complete` / `scoreOne` at your router. **Nothing else in the example changes** — same dispatch, same matrix, same stats. That's the point. + +**Note on codex:** codex emits no structured tool calls, so per-tool progress is unavailable there. It still runs and scores; that's a harness property, not a gap in this example. 
diff --git a/examples/coding-benchmark/benchmark.ts b/examples/coding-benchmark/benchmark.ts new file mode 100644 index 0000000..3a8b863 --- /dev/null +++ b/examples/coding-benchmark/benchmark.ts @@ -0,0 +1,159 @@ +/** + * coding-benchmark — run ONE coding task across harnesses × baseline profiles × + * scenarios, with controlled tool use, validators-before-judge, real stats, and a + * no-cheat firewall. Every moving part is an agent-runtime / agent-eval primitive. + * + * # offline (no creds — uses the in-process box + stub judge) + * pnpm tsx examples/coding-benchmark/benchmark.ts + * + * # one tool preset / ensemble / more reps + * pnpm tsx examples/coding-benchmark/benchmark.ts --tools web --ensemble --reps 5 + * + * # live (real harness boxes; the judge stays the stub until `complete` / `scoreOne` point at your router) + * TANGLE_API_KEY=sk-... SANDBOX_BASE_URL=https://... \ + * pnpm tsx examples/coding-benchmark/benchmark.ts --live + * + * The wiring below is the whole thing: build the profile axis, hand the matrix the + * dispatch + the judge(s), run it, then compute pairwise stats. ~40 lines of glue. 
+ */ + +import { mkdtempSync } from 'node:fs' +import { tmpdir } from 'node:os' +import { join } from 'node:path' +import { + inMemoryCampaignStorage, + type JudgeConfig, + runProfileMatrix, +} from '@tangle-network/agent-eval/campaign' +import type { AgentProfile } from '@tangle-network/agent-interface' +import type { SandboxClient } from '@tangle-network/agent-runtime/loops' +import { codingDispatch } from './dispatch' +import { type CompleteFn, ensembleCodeJudge, type RubricDim, singleCodeJudge } from './judges' +import { type OfflineScript, offlineSandboxClient } from './offline-box' +import { harnessProfiles } from './profiles' +import { type CodingScenario, scenarios } from './scenarios' +import { pairwiseStats, renderStats } from './stats' +import type { ToolPreset } from './tools' +import type { RunArtifact } from './validators' + +// ── flags ─────────────────────────────────────────────────────────────────── +const argv = process.argv.slice(2) +const flag = (name: string) => argv.includes(`--${name}`) +const opt = (name: string, fallback: string) => { + const i = argv.indexOf(`--${name}`) + return i >= 0 && argv[i + 1] ? (argv[i + 1] as string) : fallback +} +const live = flag('live') +const ensemble = flag('ensemble') +const toolPreset = opt('tools', 'none') as ToolPreset +const reps = Number(opt('reps', '1')) + +// ── the offline "agent": a scripted solution per scenario ───────────────────── +// Offline we don't have a model, so each scenario's box writes a canned, REAL +// implementation. (Swap in a `stub` to watch the realness validator catch it.) 
+const offlineSolutions: Record = { + 'rate-limiter': { + path: 'src/rate-limiter.ts', + solutionFor: () => + `export class RateLimiter {\n private tokens: number\n private last = Date.now()\n` + + ` constructor(private capacity: number, private refillPerSec: number) { this.tokens = capacity }\n` + + ` tryRemove(n: number): boolean {\n const now = Date.now()\n` + + ` this.tokens = Math.min(this.capacity, this.tokens + ((now - this.last) / 1000) * this.refillPerSec)\n` + + ` this.last = now\n if (n > this.tokens) return false\n this.tokens -= n\n return true\n }\n}\n`, + }, + 'csv-parser': { + path: 'src/csv.ts', + solutionFor: () => + `export function parseCsv(input: string): string[][] {\n const rows: string[][] = []\n` + + ` let row: string[] = []\n let field = ''\n let inQuotes = false\n` + + ` for (let i = 0; i < input.length; i++) {\n const c = input.charAt(i)\n` + + ` if (inQuotes) {\n if (c === '"' && input.charAt(i + 1) === '"') { field += '"'; i++ }\n` + + ` else if (c === '"') inQuotes = false\n else field += c\n } else if (c === '"') inQuotes = true\n` + + ` else if (c === ',') { row.push(field); field = '' }\n` + + ` else if (c === '\\n') { row.push(field); rows.push(row); row = []; field = '' }\n` + + ` else field += c\n }\n row.push(field); rows.push(row)\n return rows\n}\n`, + }, +} + +// ── the box client: live (real harness) or offline (in-process) ─────────────── +function clientFor(scenario: CodingScenario): (profile: AgentProfile) => SandboxClient { + if (live) { + // Real Tangle sandbox — one real harness box per cell. (Lazy import so the + // offline path never needs the SDK creds.) 
+ const apiKey = process.env.TANGLE_API_KEY + const baseUrl = process.env.SANDBOX_BASE_URL + if (!apiKey || !baseUrl) throw new Error('--live needs TANGLE_API_KEY + SANDBOX_BASE_URL') + // eslint-disable-next-line @typescript-eslint/no-require-imports + const { SandboxClient: RealClient } = require('@tangle-network/sandbox') + return () => new RealClient({ apiKey, baseUrl }) as unknown as SandboxClient + } + const script = offlineSolutions[scenario.id] + if (!script) throw new Error(`no offline script for scenario ${scenario.id}`) + return () => offlineSandboxClient(script) +} + +// ── the judge(s): one model, or a 3-model cross-family ensemble ─────────────── +// Offline the model caller is a deterministic stub (so the pipeline runs with no +// creds). Live, point `complete` / `scoreOne` at your router. +const stubComplete: CompleteFn = async () => + JSON.stringify({ + correctness: 0.85, + completeness: 0.8, + code_quality: 0.8, + robustness: 0.75, + notes: 'stub', + }) + +const stubScoreOne = async (): Promise> => ({ + correctness: 0.85, + completeness: 0.8, + code_quality: 0.8, + robustness: 0.75, +}) + +function judges(): JudgeConfig[] { + if (ensemble) { + // ensembleCodeJudge returns JudgeConfig; the matrix accepts it on + // any artifact — cast to the cell artifact type for the typed judges array. + return [ensembleCodeJudge(stubScoreOne) as unknown as JudgeConfig] + } + return [singleCodeJudge(stubComplete)] +} + +// ── the sweep ───────────────────────────────────────────────────────────────── +async function main(): Promise { + const runDir = mkdtempSync(join(tmpdir(), 'coding-benchmark-')) + console.log( + `coding-benchmark · ${live ? 'LIVE' : 'OFFLINE'} · tools=${toolPreset} · ` + + `judges=${ensemble ? '3 (ensemble)' : '1'} · reps=${reps} · ` + + `harnesses=${harnessProfiles.length} · scenarios=${scenarios.length}`, + ) + + // The matrix runs one campaign per profile. 
The dispatch is per-scenario only in + // its CLIENT (offline scripts differ by scenario), so run each scenario's matrix + // and merge the records. (Live, one client serves all scenarios — collapse this.) + const allRecords = [] + for (const scenario of scenarios) { + const result = await runProfileMatrix({ + profiles: harnessProfiles, // axis: harness × baseline + scenarios: [scenario], // axis: tasks (one at a time so the offline client matches) + dispatch: codingDispatch(toolPreset, clientFor(scenario)), + judges: judges(), + reps, + integrity: live ? 'assert' : 'off', // offline stub has no real backend; live proves it + costCeiling: 5, + runDir, + commitSha: process.env.GIT_SHA ?? 'example', + storage: inMemoryCampaignStorage(), + }) + allRecords.push(...result.records) + } + + console.log(`\nrecords: ${allRecords.length}\n`) + console.log(renderStats(pairwiseStats(allRecords))) +} + +main().catch((err) => { + console.error(err instanceof Error ? (err.stack ?? err.message) : String(err)) + process.exit(1) +}) diff --git a/examples/coding-benchmark/dispatch.ts b/examples/coding-benchmark/dispatch.ts new file mode 100644 index 0000000..221b3f0 --- /dev/null +++ b/examples/coding-benchmark/dispatch.ts @@ -0,0 +1,167 @@ +/** + * The DISPATCH — renders one (profile, scenario) matrix cell: it runs the coding + * agent on the profile's harness, MULTI-ROUND, in ONE persistent box, then hands + * back the `RunArtifact` the judges score. + * + * This file composes four primitives and nothing bespoke: + * - `createExecutor`/`new SandboxClient` give the box (live) — or `offlineSandboxClient` (offline). + * - `openSandboxRun(client, opts, deliverable)` opens ONE persistent, resumable box. + * `.start(prompt)` = round 1; `.resume(prompt)` = round N over the SAME session. + * That IS the "each round builds on the prior output" loop — no extra combinator. + * - `runBoxChecks` (validators.ts) runs the deterministic checks in the box each round. 
+ * - `ctx.cost.observeTokens(...)` reports usage so the backend-integrity guard sees a real run. + * + * ┌─────────────────────────────────────────────────────────────────────────┐ + * │ THE NO-CHEAT FIREWALL LIVES HERE. │ + * │ The ONLY scenario field that ever reaches the box is `scenario.prompt` │ + * │ (the `agentRun.taskToPrompt` below, and `nextPrompt` built ONLY from │ + * │ validator stderr). The rubric, the realness signals, and the grading │ + * │ note are read later by judges.ts / the realness validator — never written │ + * │ into the box. The agent literally cannot read the answer key. │ + * └─────────────────────────────────────────────────────────────────────────┘ + */ + +import type { ProducedFile } from '@tangle-network/agent-eval/authenticity' +import type { DispatchContext, ProfileDispatchFn } from '@tangle-network/agent-eval/campaign' +import type { AgentProfile } from '@tangle-network/agent-interface' +import { + type AgentRunSpec, + type DefaultVerdict, + openSandboxRun, + type SandboxClient, +} from '@tangle-network/agent-runtime/loops' +import { harnessOf } from './profiles' +import type { CodingScenario } from './scenarios' +import { type ToolPreset, withTools } from './tools' +import { + type BoxCheckResult, + type CheckBox, + type RunArtifact, + realnessValidator, + runBoxChecks, +} from './validators' + +/** Max refine rounds. Round N+1's prompt is built from round N's CHECK output only. */ +const maxRounds = 3 + +/** Build the next-round prompt from the validators the AGENT is allowed to see — + * pass/fail + stderr. NEVER from the rubric, realness, or judge. This is the + * firewall in action: the agent steers on objective check failures, nothing else. 
*/ +function nextPrompt(checks: BoxCheckResult): string { + const fails: string[] = [] + if (!checks.typecheck.passed) + fails.push(`typecheck failed:\n${checks.typecheck.output.slice(0, 1200)}`) + if (!checks.test.passed) fails.push(`tests failed:\n${checks.test.output.slice(0, 1200)}`) + if (!checks.lint.passed) fails.push(`lint failed:\n${checks.lint.output.slice(0, 600)}`) + return `Your solution did not pass these checks. Fix the file and try again.\n\n${fails.join('\n\n')}` +} + +/** A box exposing the methods both `openSandboxRun` and the validators call. */ +type RunBox = CheckBox & { fs: { read(path: string): Promise } } + +/** + * The dispatch factory. Curry the tool preset + the sandbox client; return a + * `ProfileDispatchFn` the matrix calls once per cell. + * + * @param clientFor Resolve a `SandboxClient` for a profile's harness. Offline: + * return `offlineSandboxClient(...)`. Live: `new SandboxClient(...)`. + */ +export function codingDispatch( + toolPreset: ToolPreset, + clientFor: (profile: AgentProfile) => SandboxClient, +): ProfileDispatchFn { + return async ( + profile: AgentProfile, + scenario: CodingScenario, + ctx: DispatchContext, + ): Promise => { + const harness = harnessOf(profile) + // Author the tool surface onto the profile (one line). The substrate + // materializes it into the harness's real config. + const equippedProfile = withTools(profile, toolPreset) + + const agentRun: AgentRunSpec = { + profile: equippedProfile, + // FIREWALL: the prompt is the WHOLE of what the agent sees. Only scenario.prompt. + taskToPrompt: (task: string) => task, + sandboxOverrides: { backend: { type: harness } }, + } + + // Read the produced solution file off the box after each turn (the deliverable). 
+ const run = await openSandboxRun<{ solution: string; files: ProducedFile[] }>( + clientFor(profile), + { agentRun, signal: ctx.signal, runId: ctx.cellId, scenarioId: scenario.id }, + { + kind: 'artifact', + path: scenario.solutionPath, + fromArtifact: (raw: string) => ({ + solution: raw, + files: [{ path: scenario.solutionPath, content: raw }], + }), + }, + ) + + try { + let checks: BoxCheckResult = { typecheck: blank, test: blank, lint: blank, allPass: false } + let solution = '' + let files: ProducedFile[] = [] + let finalText = '' + + for (let round = 0; round < maxRounds; round += 1) { + const prompt = round === 0 ? scenario.prompt : nextPrompt(checks) + const turn = round === 0 ? await run.start(prompt) : await run.resume(prompt) + solution = turn.out.solution + files = turn.out.files + finalText = turn.events.map(eventText).filter(Boolean).join(' ').slice(0, 2000) + + // Report usage so the integrity guard sees a real backend (not a stub). + const usage = sumTokens(turn.events) + if (usage.input || usage.output) ctx.cost.observeTokens(usage) + + // Deterministic checks, IN THE BOX, this round. These (and only these) steer + // the next round — the firewall keeps the rubric/realness out of the loop. + checks = await runBoxChecks(run.box as unknown as RunBox, scenario.validatorCmds) + if (checks.allPass) break // stop on worker-observable green only + } + + // The realness anchor runs AFTER the loop — never inside it, so it can never + // steer the agent. Its verdict is recorded for honesty (`ctx.artifacts`) and + // carried on the artifact for the record; the box never saw the signals. 
+ const realness = await realnessValidator(scenario.realnessSignals).validate( + { files, solution, finalText, checks, realness: emptyVerdict }, + { iteration: maxRounds, signal: ctx.signal }, + ) + await ctx.artifacts.writeJson(`realness/${ctx.cellId}.json`, realness) + + return { files, solution, finalText, checks, realness } + } finally { + await run.close() + } + } +} + +const blank = { passed: false, output: '' } + +/** A placeholder verdict for the artifact passed INTO the realness validator (which + * reads only `files`, never this field). The real verdict replaces it on return. */ +const emptyVerdict: DefaultVerdict = { valid: false, score: 0 } + +/** Pull the agent's text out of a stream event (best-effort, for judge context). */ +function eventText(ev: unknown): string { + const e = ev as { data?: { finalText?: string; text?: string; delta?: string } } + return e.data?.finalText ?? e.data?.text ?? e.data?.delta ?? '' +} + +/** Sum token usage across the turn's events into the `{ input, output }` shape + * `ctx.cost.observeTokens` (and `RunTokenUsage`) expect. */ +function sumTokens(events: unknown[]): { input: number; output: number } { + let input = 0 + let output = 0 + for (const ev of events) { + const d = (ev as { data?: { tokenUsage?: { inputTokens?: number; outputTokens?: number } } }) + .data + input += d?.tokenUsage?.inputTokens ?? 0 + output += d?.tokenUsage?.outputTokens ?? 0 + } + return { input, output } +} diff --git a/examples/coding-benchmark/judges.ts b/examples/coding-benchmark/judges.ts new file mode 100644 index 0000000..ee99f45 --- /dev/null +++ b/examples/coding-benchmark/judges.ts @@ -0,0 +1,147 @@ +/** + * The JUDGE layer — runs LAST, only on the band the deterministic checks can't + * resolve (e.g. "it builds and passes tests, but is the design good?"). + * + * HOW MANY JUDGES: + * - default leaderboard sweep → ONE judge (`singleCodeJudge`), one model. Cheap. 
+ * - ship/no-ship claim → THREE judges (`ensembleCodeJudge`), cross-family + * models, reduced by `aggregateJudgeVerdicts` inside + * `ensembleJudge`. `crossFamily: true` forbids three + * models from the same family at construction, so the + * "ensemble" is genuinely independent. + * + * THE RUBRIC (4 weighted dimensions, total 1.0): + * correctness 0.40 · completeness 0.25 · code_quality 0.20 · robustness 0.15 + * The rubric text + anchors live HERE, with the judge — never in the workdir. The + * agent is graded against criteria it cannot read. + * + * Both judges are campaign `JudgeConfig`s (the shape `runProfileMatrix.judges` + * takes). There is NO `llmJudge` helper in agent-eval today, so the single judge is + * a hand-built `JudgeConfig` whose `score()` does one model call; the ensemble is + * `ensembleJudge(...)`, which already returns a `JudgeConfig`. + */ + +import { ensembleJudge } from '@tangle-network/agent-eval' +import type { JudgeConfig, Scenario } from '@tangle-network/agent-eval/campaign' +import type { CodingScenario } from './scenarios' +import type { RunArtifact } from './validators' + +/** The rubric: name → (description shown to the judge, weight in the composite). */ +export const rubric = { + correctness: { + weight: 0.4, + description: 'Does the code correctly implement the spec for all stated cases?', + }, + completeness: { + weight: 0.25, + description: 'Are all required behaviors and edge cases handled, nothing stubbed?', + }, + code_quality: { + weight: 0.2, + description: 'Is it clear, idiomatic, dependency-free as required, and maintainable?', + }, + robustness: { + weight: 0.15, + description: 'Does it handle malformed / boundary input without crashing or misbehaving?', + }, +} as const + +export type RubricDim = keyof typeof rubric +const dimKeys = Object.keys(rubric) as RubricDim[] +const weights = Object.fromEntries(dimKeys.map((k) => [k, rubric[k].weight])) as Record< + RubricDim, + number +> + +/** Inject your model caller. 
`(system, user) → completion`. This module ships + * no default: the caller supplies it — a deterministic stub offline (so the pipeline + * runs with no creds), or a call to your model router when live. */ +export type CompleteFn = (system: string, user: string) => Promise + +/** The judge's instructions — the rubric anchors. Kept with the judge ONLY. */ +function judgeSystemPrompt(): string { + const dims = dimKeys + .map((k) => `- ${k} (weight ${rubric[k].weight}): ${rubric[k].description}`) + .join('\n') + return [ + 'You are a senior code reviewer scoring a candidate solution to a coding task.', + 'Score each dimension from 0 to 1 (1 = excellent). Reply with ONLY JSON:', + '{"correctness":0.x,"completeness":0.x,"code_quality":0.x,"robustness":0.x,"notes":"..."}', + '', + 'Dimensions:', + dims, + ].join('\n') +} + +/** What the judge sees: the produced code + check results + the eval-only rubric + * note. (This runs in the harness, not in the agent's box — see the firewall.) */ +function judgeUserPrompt(artifact: RunArtifact, scenario: CodingScenario): string { + return [ + `Task intent: ${scenario.prompt}`, + `Grading note: ${scenario.rubricNote}`, + `Deterministic checks — typecheck:${artifact.checks.typecheck.passed} test:${artifact.checks.test.passed} lint:${artifact.checks.lint.passed}`, + '', + 'Candidate solution:', + '```ts', + artifact.solution.slice(0, 8000), + '```', + ].join('\n') +} + +/** Parse the judge's JSON; fail-closed (a bad response scores 0, never a fake pass). */ +function parseScores(raw: string): Record { + try { + const json = JSON.parse(raw.slice(raw.indexOf('{'), raw.lastIndexOf('}') + 1)) + const out = {} as Record + for (const k of dimKeys) { + const v = Number(json[k]) + out[k] = Number.isFinite(v) ? 
Math.max(0, Math.min(1, v)) : 0 + } + return out + } catch { + return Object.fromEntries(dimKeys.map((k) => [k, 0])) as Record + } +} + +function composite(scores: Record): number { + return dimKeys.reduce((sum, k) => sum + scores[k] * weights[k], 0) +} + +/** ── ONE judge ──────────────────────────────────────────────────────────── + * A hand-built campaign `JudgeConfig` whose `score()` makes a single model call. + * `appliesTo` runs it only on coding scenarios (a no-op here, shown for the shape). */ +export function singleCodeJudge(complete: CompleteFn): JudgeConfig { + return { + name: 'code-quality', + dimensions: dimKeys.map((k) => ({ key: k, description: rubric[k].description })), + appliesTo: (s: Scenario) => s.kind === 'coding', + async score({ artifact, scenario }) { + const raw = await complete(judgeSystemPrompt(), judgeUserPrompt(artifact, scenario)) + const scores = parseScores(raw) + return { dimensions: scores, composite: composite(scores), notes: 'single-judge' } + }, + } +} + +/** ── THREE judges ───────────────────────────────────────────────────────── + * `ensembleJudge` fans the artifact across N cross-family models in parallel and + * reduces surviving verdicts to one `JudgeScore`. A model that throws is excluded, + * never folded into a zero. Use this only for a ship/no-ship claim. */ +export function ensembleCodeJudge( + scoreOne: (model: string, code: string, intent: string) => Promise>, +): JudgeConfig { + return ensembleJudge({ + name: 'code-quality-ensemble', + dimensions: dimKeys, + // Three cross-family models — independence enforced at construction. + models: ['deepseek-chat', 'gpt-4o-mini', 'gemini-flash'], + crossFamily: true, + weights, + scoreWith: async (model, input) => { + const artifact = input.artifact as RunArtifact + const scenario = input.scenario as CodingScenario | undefined + const perDimension = await scoreOne(model, artifact.solution, scenario?.prompt ?? 
'') + return { model, perDimension } + }, + }) +} diff --git a/examples/coding-benchmark/offline-box.ts b/examples/coding-benchmark/offline-box.ts new file mode 100644 index 0000000..52b704e --- /dev/null +++ b/examples/coding-benchmark/offline-box.ts @@ -0,0 +1,101 @@ +/** + * The OFFLINE seam — an in-process `SandboxClient` so the WHOLE benchmark runs + * with no creds and no network, exactly like `examples/ui-audit/` does. + * + * It implements only what `openSandboxRun` actually calls on a box: + * - `streamPrompt(prompt, opts)` — the "agent" turn. Offline it deterministically + * writes a canned solution into a real temp workspace and emits one terminal + * `result` event carrying finalText + tokenUsage (so the run meters honestly). + * - `fs.read` / `fs.write` — over the temp workspace (the `artifact` deliverable + * + the validators read/write real files here). + * - `exec(cmd)` — runs the deterministic check commands. Offline the toolchain + * (tsc/biome/node --test) usually isn't installed, so a missing tool reads as a + * FAIL — which is the honest offline signal, not a fake pass. + * - `delete()` — tears the temp dir down. + * + * Swap this for `new SandboxClient({ apiKey, baseUrl })` (cast to the runtime's + * `SandboxClient`) and the SAME dispatch runs each round in a real harness box. + * Nothing else in the example changes — that is the point. + */ + +import { exec as execCb } from 'node:child_process' +import { mkdtempSync, rmSync } from 'node:fs' +import { mkdir, readFile, writeFile } from 'node:fs/promises' +import { tmpdir } from 'node:os' +import { dirname, join } from 'node:path' +import { promisify } from 'node:util' +import type { SandboxClient } from '@tangle-network/agent-runtime/loops' +import type { CreateSandboxOptions, SandboxEvent, SandboxInstance } from '@tangle-network/sandbox' + +const execAsync = promisify(execCb) + +/** Produces the canned solution an offline "agent" writes for a given task. 
Two + * fidelity levels let the example show the validators/judge separating quality: + * a `real` implementation passes the realness scan, a `stub` is caught by it. */ +export type OfflineQuality = 'real' | 'stub' + +/** A scripted offline solution: which file, what content, per round. The harness + * calls `solutionFor(round)` so round 2 can differ from round 1 (refine demo). */ +export interface OfflineScript { + path: string + solutionFor: (round: number) => string +} + +function instanceMethods(workdir: string, script: OfflineScript) { + let round = 0 + return { + id: `offline-${Math.random().toString(36).slice(2, 8)}`, + // The "agent" turn. Writes the scripted solution, emits one terminal event. + async *streamPrompt(_message: string | unknown[]): AsyncGenerator { + const content = script.solutionFor(round) + round += 1 + const abs = join(workdir, script.path) + await mkdir(dirname(abs), { recursive: true }) + await writeFile(abs, content, 'utf8') + yield { + type: 'result', + data: { + finalText: `wrote ${script.path} (offline round ${round})`, + tokenUsage: { inputTokens: 600, outputTokens: 400 }, + costUsd: 0, + }, + } as unknown as SandboxEvent + }, + fs: { + async read(path: string): Promise { + return readFile(join(workdir, path), 'utf8') + }, + async write(path: string, content: string): Promise { + const abs = join(workdir, path) + await mkdir(dirname(abs), { recursive: true }) + await writeFile(abs, content, 'utf8') + }, + }, + async exec(command: string): Promise<{ exitCode: number; stdout: string; stderr: string }> { + try { + const { stdout, stderr } = await execAsync(command, { cwd: workdir, timeout: 30_000 }) + return { exitCode: 0, stdout, stderr } + } catch (err) { + const e = err as { code?: number; stdout?: string; stderr?: string; message?: string } + return { + exitCode: e.code ?? 1, + stdout: e.stdout ?? '', + stderr: e.stderr ?? e.message ?? 
'', + } + } + }, + async delete(): Promise { + rmSync(workdir, { recursive: true, force: true }) + }, + } +} + +/** An in-process `SandboxClient`. Each `create()` mints a fresh temp workspace box. */ +export function offlineSandboxClient(script: OfflineScript): SandboxClient { + return { + async create(_options?: CreateSandboxOptions): Promise { + const workdir = mkdtempSync(join(tmpdir(), 'coding-bench-')) + return instanceMethods(workdir, script) as unknown as SandboxInstance + }, + } +} diff --git a/examples/coding-benchmark/profiles.ts b/examples/coding-benchmark/profiles.ts new file mode 100644 index 0000000..b9cc3b7 --- /dev/null +++ b/examples/coding-benchmark/profiles.ts @@ -0,0 +1,74 @@ +/** + * The HARNESS axis — one baseline `AgentProfile` per coding harness. + * + * We measure the HARNESS on its default behavior, so each profile is deliberately + * bare: a name, the model it runs, and NOTHING else (no skills, no injected system + * prompt, no extra tools). Adding scaffolding here would measure our scaffolding, + * not the harness. + * + * Two things to know about the shape: + * - `runProfileMatrix` takes `AgentProfile[]` from `@tangle-network/agent-interface`. + * That type has NO `harness` field (harness is a SANDBOX concept, not a profile + * concept), so we carry the harness selector on `metadata.harness`. `dispatch.ts` + * reads it to pick `backend.type` for the sandbox. `harnessOf()` below is the one + * reader. + * - The matrix REQUIRES `model.default` (it stamps it onto every run record). For a + * real harness the agent uses the harness's own default model; we still name a + * model id here so the record is honest about what ran. + */ + +import type { AgentProfile } from '@tangle-network/agent-interface' +import type { BackendType } from '@tangle-network/sandbox' + +/** The harnesses we sweep. `cli-base` is the plain-CLI baseline (no agent harness). 
*/ +export const harnesses = [ + 'claude-code', + 'opencode', + 'codex', + 'cli-base', +] as const satisfies readonly BackendType[] + +/** Read the harness a profile targets. The ONE place metadata.harness is decoded. */ +export function harnessOf(profile: AgentProfile): BackendType { + const h = profile.metadata?.harness + if (typeof h !== 'string') { + throw new Error(`profile "${profile.name}" is missing metadata.harness — see profiles.ts`) + } + return h as BackendType +} + +/** The default model each harness runs. Override per-harness via env if you like; + * the value is stamped onto the run record, so keep it truthful. + * + * IMPORTANT — the model id MUST carry a SNAPSHOT DATE. `runProfileMatrix` rejects a + * bare `name` and requires `name@YYYY-MM-DD` or `name-YYYYMMDD`, because a run record + * without the exact model snapshot is not reproducible ("which gpt-4.1 was that?"). + * This is the substrate keeping the benchmark paper-grade — keep the date current. */ +const harnessModel: Record = { + 'claude-code': process.env.CLAUDE_CODE_MODEL ?? 'anthropic/claude-sonnet-4-5-2025-09-29', + opencode: process.env.OPENCODE_MODEL ?? 'anthropic/claude-sonnet-4-5-2025-09-29', + codex: process.env.CODEX_MODEL ?? 'openai/gpt-5-codex-2025-09-15', + 'cli-base': process.env.CLI_BASE_MODEL ?? 'openai/gpt-4.1-2025-04-14', + // unreached by this example, but BackendType is a closed union — name them all + 'kimi-code': 'moonshot/kimi-k2-2025-07-11', + amp: 'anthropic/claude-sonnet-4-5-2025-09-29', + 'factory-droids': 'anthropic/claude-sonnet-4-5-2025-09-29', + pi: 'openai/gpt-4.1-2025-04-14', + hermes: 'openai/gpt-4.1-2025-04-14', + forge: 'openai/gpt-4.1-2025-04-14', + openclaw: 'anthropic/claude-sonnet-4-5-2025-09-29', + nanoclaw: 'anthropic/claude-sonnet-4-5-2025-09-29', + acp: 'openai/gpt-4.1-2025-04-14', + cursor: 'anthropic/claude-sonnet-4-5-2025-09-29', +} + +/** + * One bare baseline profile per harness. 
The agent's behavior here is the + * harness's OUT-OF-THE-BOX behavior — exactly what a partner gets on day one. + */ +export const harnessProfiles: AgentProfile[] = harnesses.map((harness) => ({ + name: `${harness}-baseline`, + model: { default: harnessModel[harness] }, + // NO prompt, NO resources, NO tools — measure the harness, not our scaffolding. + metadata: { harness }, +})) diff --git a/examples/coding-benchmark/scenarios.ts b/examples/coding-benchmark/scenarios.ts new file mode 100644 index 0000000..62871b0 --- /dev/null +++ b/examples/coding-benchmark/scenarios.ts @@ -0,0 +1,124 @@ +/** + * The held-out coding-task corpus — and the NO-CHEAT FIREWALL, expressed as a type. + * + * Every scenario splits cleanly into two halves: + * - `prompt` — THE ONLY field the agent ever sees. The dispatch copies it + * (and nothing else) into the worker's context. + * - everything else — the rubric note, the validator commands, the realness + * signals — is EVAL-ONLY. It is read by validators.ts and + * judges.ts to score the result; it is NEVER written into the + * box. Because the two halves are different fields on the same + * object, "the agent can read the answer key" becomes a thing + * you can SEE in one place: it would require dispatch.ts to put + * a non-`prompt` field into the profile. It does not. (See the + * `// FIREWALL` comment in dispatch.ts for the exact line.) + * + * This is the structural defense the design calls for: the firewall is a property + * of which field flows where, not a runtime check you have to trust. + */ + +import type { AuthenticitySignals } from '@tangle-network/agent-eval/authenticity' +import type { Scenario } from '@tangle-network/agent-eval/campaign' + +/** One held-out coding task. Extends the substrate `Scenario` ({ id, kind, tags }). */ +export interface CodingScenario extends Scenario { + /** ── AGENT-VISIBLE ────────────────────────────────────────────────────── + * The task as the agent reads it. 
A clean scaffold description + the ask. + * This is the WHOLE of what reaches the worker's context. */ + prompt: string + + /** ── EVAL-ONLY (never written into the box) ───────────────────────────── */ + + /** Path (relative to the workspace root) the agent is asked to produce. The + * validators read this file off the box AFTER the turn; the judge scores it. */ + solutionPath: string + + /** Deterministic checks, run in order, in the box, BEFORE any judge. These are + * shell commands the harness runs against the produced code. Objective, ~$0. + * They are eval config — the agent is told WHAT to build, never HOW it's graded. */ + validatorCmds: { + typecheck: string + test: string + lint: string + } + + /** Drives `scoreAuthenticity` — catches a stub that compiles but fakes the + * hard part. Write-only to the record; the agent cannot read or steer it. */ + realnessSignals: AuthenticitySignals + + /** Extra grading context for the JUDGE only (design intent, edge cases to + * reward). Lives with the judge, never in the workdir. */ + rubricNote: string +} + +/** + * A 2-task corpus. Real benchmarks carry 20-50; two keeps the example readable. + * Both are self-contained "write one module that passes these checks" tasks — the + * shape that has a CORRECTABLE MIDDLE BAND (build-passes-but-quality-varies), which + * is what makes a benchmark able to separate harnesses at all. + */ +export const scenarios: CodingScenario[] = [ + { + id: 'rate-limiter', + kind: 'coding', + tags: ['algorithms', 'concurrency'], + prompt: [ + 'Implement a token-bucket rate limiter in TypeScript at `src/rate-limiter.ts`.', + 'Export `class RateLimiter` with a constructor `(capacity: number, refillPerSec: number)`', + 'and a method `tryRemove(tokens: number): boolean` that returns true and consumes the', + 'tokens if enough are available (refilling continuously over elapsed wall-clock time),', + 'and false otherwise. 
No external dependencies.', + ].join(' '), + solutionPath: 'src/rate-limiter.ts', + validatorCmds: { + typecheck: 'npx tsc --noEmit src/rate-limiter.ts', + test: 'node --test test/rate-limiter.test.js', + lint: 'npx biome check src/rate-limiter.ts', + }, + realnessSignals: { + label: 'token-bucket', + requiredArtifact: /rate-limiter\.ts$/, + // The hard part must be present: time-based refill math, not a hardcoded true. + realImpl: /Date\.now\(\)|performance\.now\(\)|elapsed|refill/, + realInfra: /class\s+RateLimiter/, + // The fake: a tryRemove whose ENTIRE body is `return true` (no refill math + // before it). Tightened so a real impl that legitimately ends in `return true` + // is NOT flagged — the shim is "returns true with no logic", not "returns true". + fakeShim: /tryRemove\([^)]*\)\s*:\s*boolean\s*{\s*return\s+true/, + }, + rubricNote: + 'Reward continuous (not discrete-tick) refill, integer-safe token accounting, and ' + + 'correct behavior when tokens requested exceeds capacity (must return false, never block).', + }, + { + id: 'csv-parser', + kind: 'coding', + tags: ['parsing', 'edge-cases'], + prompt: [ + 'Implement an RFC-4180 CSV parser in TypeScript at `src/csv.ts`.', + 'Export `function parseCsv(input: string): string[][]`. It must handle quoted fields,', + 'escaped double-quotes inside quotes (""), and embedded newlines within quoted fields.', + 'No external dependencies.', + ].join(' '), + solutionPath: 'src/csv.ts', + validatorCmds: { + typecheck: 'npx tsc --noEmit src/csv.ts', + test: 'node --test test/csv.test.js', + lint: 'npx biome check src/csv.ts', + }, + realnessSignals: { + label: 'csv-rfc4180', + requiredArtifact: /csv\.ts$/, + // Real parsers track quote state char-by-char; a naive split is the fake. + realImpl: /inQuotes|state|charAt|for\s*\(|while\s*\(/, + realInfra: /function\s+parseCsv/, + // The fake: splitting on comma or newline (naive parse) — the RFC-4180 cases + // (quoted comma, embedded newline) make `.split` wrong. 
Matches anywhere, not + // just line-end, so `input.split('\n').map(l => l.split(','))` is caught. + fakeShim: /\.split\(\s*['"`](,|\\n)['"`]\s*\)/, + }, + rubricNote: + 'Reward a single-pass state machine over naive splitting; correct handling of a quoted ' + + 'field containing a comma, a literal newline, and an escaped quote.', + }, +] diff --git a/examples/coding-benchmark/stats.ts b/examples/coding-benchmark/stats.ts new file mode 100644 index 0000000..bae6288 --- /dev/null +++ b/examples/coding-benchmark/stats.ts @@ -0,0 +1,146 @@ +/** + * The STATS — turn the matrix's `RunRecord[]` into an honest leaderboard: + * - per-harness mean composite + a bootstrap CONFIDENCE INTERVAL (`confidenceInterval`) + * - per-harness PASS-RATE with a binomial Wilson interval (`wilson`) — the correct + * CI for a proportion (the continuous CI assumes the wrong distribution) + * - every harness PAIR compared on MATCHED scenarios with a paired bootstrap + * (`pairedBootstrap`), then BH-corrected across all pairs (`benjaminiHochberg`) + * so running many comparisons doesn't manufacture a false winner. + * + * Every number here is one agent-eval primitive call. No hand-rolled statistics. + * + * (The design flagged "no binomial CI in agent-eval" as a gap — that's stale: + * `wilson(successes, n)` ships in the stats surface and is exactly this CI. Used below.) + */ + +import { + benjaminiHochberg, + confidenceInterval, + pairedBootstrap, + type RunRecord, + wilson, +} from '@tangle-network/agent-eval' + +/** A composite at or above this counts as "green" for the pass-rate proportion. 
*/ +const greenThreshold = 0.6 + +interface HarnessRow { + harness: string + n: number + meanComposite: number + ci: { lower: number; upper: number } + passRate: number + passCi: { lower: number; upper: number } +} + +interface PairResult { + a: string + b: string + /** median paired delta (b − a) and its bootstrap CI */ + delta: number + low: number + high: number + /** BH-significant after correcting across all pairs */ + significant: boolean +} + +export interface StatsReport { + leaderboard: HarnessRow[] + pairs: PairResult[] +} + +/** Per-record composite — the search-split score the judges produced. */ +function score(r: RunRecord): number { + return r.outcome.searchScore ?? r.outcome.holdoutScore ?? 0 +} + +/** Group records by harness profile (the matrix stamps the profile id as candidateId). */ +function byHarness(records: RunRecord[]): Map { + const m = new Map() + for (const r of records) { + const key = r.agentProfile?.profileId ?? r.candidateId + const list = m.get(key) ?? [] + list.push(r) + m.set(key, list) + } + return m +} + +/** Scores for harness A and B on the SAME scenarios, aligned for pairing. */ +function pairedScores(a: RunRecord[], b: RunRecord[]): { aScores: number[]; bScores: number[] } { + const bByScenario = new Map(b.map((r) => [r.scenarioId ?? '', r])) + const aScores: number[] = [] + const bScores: number[] = [] + for (const ra of a) { + const rb = bByScenario.get(ra.scenarioId ?? '') + if (rb) { + aScores.push(score(ra)) + bScores.push(score(rb)) + } + } + return { aScores, bScores } +} + +export function pairwiseStats(records: RunRecord[]): StatsReport { + const groups = byHarness(records) + const harnesses = [...groups.keys()].sort() + + const leaderboard: HarnessRow[] = harnesses.map((harness) => { + const rs = groups.get(harness) ?? 
[] + const scores = rs.map(score) + const ci = confidenceInterval(scores, 0.95, { seed: 7 }) + const passes = scores.filter((s) => s >= greenThreshold).length + const passCi = wilson(passes, scores.length, 0.95) + return { + harness, + n: scores.length, + meanComposite: ci.mean, + ci: { lower: ci.lower, upper: ci.upper }, + passRate: scores.length ? passes / scores.length : 0, + passCi: { lower: passCi.lower, upper: passCi.upper }, + } + }) + + // Every unordered harness pair, paired-bootstrapped on matched scenarios. + const raw: Omit[] = [] + for (let i = 0; i < harnesses.length; i += 1) { + for (let j = i + 1; j < harnesses.length; j += 1) { + const ha = harnesses[i] as string + const hb = harnesses[j] as string + const { aScores, bScores } = pairedScores(groups.get(ha) ?? [], groups.get(hb) ?? []) + if (aScores.length === 0) continue + const boot = pairedBootstrap(aScores, bScores, { seed: 7, statistic: 'median' }) + raw.push({ a: ha, b: hb, delta: boot.median, low: boot.low, high: boot.high }) + } + } + + // A CI excluding 0 is the per-pair p<0.05 proxy; BH-correct across all pairs. + const pProxy = raw.map((r) => (r.low > 0 || r.high < 0 ? 0.04 : 0.5)) + const { significant } = benjaminiHochberg(pProxy, 0.05) + const pairs: PairResult[] = raw.map((r, i) => ({ ...r, significant: significant[i] ?? false })) + + return { leaderboard, pairs } +} + +/** Render the report as a plain leaderboard + significance lines. 
*/ +export function renderStats(report: StatsReport): string { + const lines: string[] = [] + lines.push('Harness leaderboard (mean composite, 95% CI; pass-rate, Wilson CI):') + for (const row of report.leaderboard) { + lines.push( + ` ${row.harness.padEnd(22)} composite ${row.meanComposite.toFixed(3)} ` + + `[${row.ci.lower.toFixed(3)}, ${row.ci.upper.toFixed(3)}] ` + + `pass ${(row.passRate * 100).toFixed(0)}% ` + + `[${(row.passCi.lower * 100).toFixed(0)}%, ${(row.passCi.upper * 100).toFixed(0)}%] (n=${row.n})`, + ) + } + lines.push('') + lines.push('Pairwise (paired bootstrap on matched scenarios, BH-corrected):') + for (const p of report.pairs) { + const tag = p.significant ? 'SIGNIFICANT' : 'n.s.' + lines.push( + ` ${p.b} − ${p.a}: Δ=${p.delta.toFixed(3)} [${p.low.toFixed(3)}, ${p.high.toFixed(3)}] ${tag}`, + ) + } + return lines.join('\n') +} diff --git a/examples/coding-benchmark/tools.ts b/examples/coding-benchmark/tools.ts new file mode 100644 index 0000000..65c49bf --- /dev/null +++ b/examples/coding-benchmark/tools.ts @@ -0,0 +1,55 @@ +/** + * The TOOL knob — swap the agent's tool surface in ONE line. + * + * A tool surface is a PRESET, not forked code. Each preset authors the SAME two + * fields onto a profile — native tools on/off (`profile.tools`) and an optional + * mounted MCP server (`profile.mcp`) — and the sandbox substrate materializes them + * into each harness's real shape (`.claude/`, `opencode.json`, codex config, ...). + * We never hand-write a per-harness config file. + * + * withTools(profile, 'web') // turn on the native web tools + * withTools(profile, 'search-mcp') // mount a search MCP instead + * withTools(profile, 'none') // baseline: no web, no MCP + * + * To add the tool surface as a 4TH matrix axis, build the profile list as the + * cartesian of harnesses × presets (see benchmark.ts, `--tools` flag). + * + * Honesty note for partners: a preset only takes effect for a (harness, lever) + * pair the sandbox actually materializes. 
If a harness has no native `webfetch`, + * `withTools(p,'web')` is a no-op THERE — that is a substrate fact, not something + * this example silently patches over. Check `@tangle-network/sandbox` for the + * materialization matrix before trusting a tool swap on a given harness. + */ + +import type { AgentProfile, AgentProfileMcpServer } from '@tangle-network/agent-interface' + +/** Where a search MCP lives, when the `search-mcp` preset is selected. */ +const searchMcpUrl = process.env.TANGLE_SEARCH_MCP ?? 'https://search-mcp.tangle.tools/mcp' + +export type ToolPreset = 'none' | 'web' | 'search-mcp' + +interface ToolSurface { + /** Native harness tools, by name → enabled. Maps to `profile.tools`. */ + tools?: Record + /** A mounted MCP server, by name. Maps to `profile.mcp`. */ + mcp?: Record +} + +const presets: Record = { + none: { tools: { websearch: false, webfetch: false } }, + web: { tools: { websearch: true, webfetch: true } }, + 'search-mcp': { + tools: { websearch: false, webfetch: false }, + mcp: { search: { transport: 'http', url: searchMcpUrl, enabled: true } }, + }, +} + +/** Author a tool surface onto a profile. Returns a NEW profile (pure). */ +export function withTools(profile: AgentProfile, preset: ToolPreset): AgentProfile { + const surface = presets[preset] + return { + ...profile, + ...(surface.tools ? { tools: surface.tools } : {}), + ...(surface.mcp ? { mcp: surface.mcp } : {}), + } +} diff --git a/examples/coding-benchmark/validators.ts b/examples/coding-benchmark/validators.ts new file mode 100644 index 0000000..7081bd3 --- /dev/null +++ b/examples/coding-benchmark/validators.ts @@ -0,0 +1,104 @@ +/** + * The DETERMINISTIC layer — validators that run BEFORE any judge. + * + * Scoring a coding task in the right order matters: objective checks first (they + * cost ~$0 and can't be gamed), an anti-fake realness gate next, and only THEN — + * if there is still a subjective band left to grade — an LLM judge. 
This file owns + * the first two layers. judges.ts owns the third. + * + * Two kinds of validator here: + * 1. `runBoxChecks` — runs the scenario's `typecheck` / `test` / `lint` commands + * IN THE BOX via `box.exec(...)`. Pass/fail comes from the exit code. This is + * a runtime concern (it needs a live box), so it is a plain async function the + * dispatch calls each round; the booleans it returns are what steer the next + * round (see the firewall note in dispatch.ts). + * 2. `realnessValidator` — wraps agent-eval's `scoreAuthenticity` + `gateRealness` + * as a runtime `Validator`. It catches "compiles but is a stub". + * Its score is WRITE-ONLY to the record — the agent never sees it, so it cannot + * steer toward it. + * + * `Validator` is the runtime seam (src/runtime/types.ts): one + * method, `validate(output, ctx) → Promise`. We use the default verdict + * shape `{ valid, score, signals }`. + */ + +import { + type AuthenticitySignals, + gateRealness, + type ProducedFile, + scoreAuthenticity, +} from '@tangle-network/agent-eval/authenticity' +import type { DefaultVerdict, Validator } from '@tangle-network/agent-runtime/loops' + +/** A finished coding attempt — what the dispatch produces and the judge scores. */ +export interface RunArtifact { + /** Files the agent produced, as `{ path, content }` — the realness currency. */ + files: ProducedFile[] + /** The solution file's content (convenience; also present in `files`). */ + solution: string + /** The agent's final chat text for the round (judge context). */ + finalText: string + /** Deterministic check results from the LAST round — gate the judge + the record. */ + checks: BoxCheckResult + /** The realness anchor's verdict, computed AFTER the loop by `realnessValidator`. + * Recorded for honesty; the agent never sees it (see the firewall in dispatch.ts). 
*/ + realness: DefaultVerdict +} + +export interface BoxCheckResult { + typecheck: { passed: boolean; output: string } + test: { passed: boolean; output: string } + lint: { passed: boolean; output: string } + /** True only when typecheck AND test pass (lint is advisory). */ + allPass: boolean +} + +/** Minimal box surface the checks need — a subset of the real `SandboxInstance`. + * The live sandbox satisfies it; the offline in-process box implements it too. */ +export interface CheckBox { + exec(command: string): Promise<{ exitCode: number; stdout: string; stderr: string }> +} + +/** + * Run the scenario's deterministic checks in the box. Exit code 0 = pass. This is + * the objective floor: it can't be talked around by a confident judge, and it costs + * nothing. The agent IS told what to build (the prompt), but never the grading + * commands — those live on the scenario's eval-only fields. + */ +export async function runBoxChecks( + box: CheckBox, + cmds: { typecheck: string; test: string; lint: string }, +): Promise { + const run = async (cmd: string): Promise<{ passed: boolean; output: string }> => { + const r = await box.exec(cmd) + return { passed: r.exitCode === 0, output: `${r.stdout}\n${r.stderr}`.trim() } + } + const typecheck = await run(cmds.typecheck) + const test = await run(cmds.test) + const lint = await run(cmds.lint) + return { typecheck, test, lint, allPass: typecheck.passed && test.passed } +} + +/** + * The realness anchor as a runtime `Validator`. `scoreAuthenticity` is a pure, + * no-LLM structural scan (required artifact present? hard part implemented? or a + * fake shim?), and `gateRealness` caps anything that faked or omitted the required + * artifact. The verdict is recorded but NEVER fed back to the agent. 
+ */ +export function realnessValidator(signals: AuthenticitySignals): Validator { + return { + async validate(artifact: RunArtifact): Promise { + const result = scoreAuthenticity(artifact.files, signals) + const gate = gateRealness(result, { requireArtifact: true }) + // realness is 0..100; normalize to the 0..1 verdict score. + const score = gate.gated ? 0 : result.realness / 100 + const flags = result.flags.length > 0 ? ` — flags: ${result.flags.join(', ')}` : '' + return { + valid: !gate.gated && result.realness >= 50, + score, + scores: { realness: result.realness, gated: gate.gated ? 1 : 0 }, + notes: `${gate.gated ? `GATED (${gate.reason ?? 'fake/missing artifact'})` : 'real'}${flags}`, + } + }, + } +} From 543881c86d8a50d95bd1d95b8102ca2c9448c348 Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Wed, 24 Jun 2026 02:59:01 -0600 Subject: [PATCH 2/5] =?UTF-8?q?docs(examples):=20coding-benchmark=20on=20s?= =?UTF-8?q?ubstrate=20primitives=20=E2=80=94=20fix=2017=20findings,=20drop?= =?UTF-8?q?=203=20reinventions?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reworks examples/coding-benchmark to compose published agent-eval / agent-runtime primitives instead of hand-rolling them, fixing every PR-369 review finding. 
Reinventions removed (now substrate primitives): - hand-rolled judge (judgeSystemPrompt/judgeUserPrompt/score) -> llmJudge / ensembleJudge - sumTokens (read only data.tokenUsage; summed 0 under a real sandbox emitting data.usage -> integrity:'assert' threw on every --live cell) -> extractLlmCallEvent - runBoxChecks flat pass/fail -> MultiLayerVerifier ordered pipeline (typecheck -> test -> lint, dependency-based skip, blended score) Findings: --live token shapes fixed + verified; real seeded fixture tests so the deterministic checks have a file to run; correct paired-scenario stats keying; an offline vitest smoke test; dynamic import() of the SDK behind --live; the ensemble now sees the same full context as the single judge; the realness gate ACTUALLY gates the judge (short-circuits to composite 0, no model call); honest tool-knob docs + --tools command; a real cross-round offline refine demo; dead RunBox.fs.read / OfflineQuality removed; real paired test (pairedTTest) feeding BH real p-values; human-readable harness names on the leaderboard. Consolidated validators+judges -> eval.ts and tools -> profiles.ts (9 source files -> 7 + a smoke test). README: every claim is now matched to the code. Bumps the agent-eval devDependency to >=0.99.0 (llmJudge). The peerDependency floor is unchanged — only the example uses llmJudge, src does not.
--- examples/coding-benchmark/README.md | 89 ++--- examples/coding-benchmark/benchmark.ts | 219 +++++++++---- .../coding-benchmark/coding-benchmark.test.ts | 53 +++ examples/coding-benchmark/dispatch.ts | 102 +++--- examples/coding-benchmark/eval.ts | 303 ++++++++++++++++++ examples/coding-benchmark/judges.ts | 147 --------- examples/coding-benchmark/offline-box.ts | 46 +-- examples/coding-benchmark/profiles.ts | 75 ++++- examples/coding-benchmark/scenarios.ts | 122 +++++-- examples/coding-benchmark/stats.ts | 95 ++++-- examples/coding-benchmark/tools.ts | 55 ---- examples/coding-benchmark/validators.ts | 104 ------ package.json | 2 +- pnpm-lock.yaml | 10 +- 14 files changed, 867 insertions(+), 555 deletions(-) create mode 100644 examples/coding-benchmark/coding-benchmark.test.ts create mode 100644 examples/coding-benchmark/eval.ts delete mode 100644 examples/coding-benchmark/judges.ts delete mode 100644 examples/coding-benchmark/tools.ts delete mode 100644 examples/coding-benchmark/validators.ts diff --git a/examples/coding-benchmark/README.md b/examples/coding-benchmark/README.md index 189c4fa..6165732 100644 --- a/examples/coding-benchmark/README.md +++ b/examples/coding-benchmark/README.md @@ -1,15 +1,17 @@ # coding-benchmark -**Run the same coding task across coding agents — fairly, honestly, with real statistics — in ~8 files of pure composition.** Every moving part is an `agent-runtime` or `agent-eval` primitive. Zero bespoke harness code. +**Run the same coding task across coding agents — fairly, honestly, with real statistics — in 7 files of pure composition.** Every moving part is an `agent-runtime` or `agent-eval` primitive. Zero bespoke harness code, no hand-rolled scorer, no hand-rolled statistics. ```bash -# offline — no creds, no network. Runs the whole pipeline against an in-process box. +# offline — no creds, no network. Runs the whole pipeline against an in-process box +# with a deterministic mock judge. 
pnpm tsx examples/coding-benchmark/benchmark.ts -# swap a tool surface, add the 3-model judge, run more reps -pnpm tsx examples/coding-benchmark/benchmark.ts --tools web --ensemble --reps 5 +# pick a tool surface, add the 3-model judge panel, run more reps +pnpm tsx examples/coding-benchmark/benchmark.ts --tools web +pnpm tsx examples/coding-benchmark/benchmark.ts --ensemble --reps 5 -# live — real harness boxes + a real judge model +# live — real harness boxes + a real judge model (see "Going live" for the exact reqs) TANGLE_API_KEY=sk-... SANDBOX_BASE_URL=https://... \ pnpm tsx examples/coding-benchmark/benchmark.ts --live ``` @@ -22,7 +24,7 @@ One coding task, run across a **matrix** of three axes, scored, and compared wit |---|---|---| | **harness** | claude-code / opencode / codex / cli, each on its **baseline default profile** (no skills, no injected prompt — we measure the harness, not our scaffolding) | `profiles.ts` | | **scenario** | the held-out coding tasks (a token-bucket rate limiter, an RFC-4180 CSV parser) | `scenarios.ts` | -| **tool surface** | `none` / `web` / `search-mcp` — folded in as a one-line knob | `tools.ts` | +| **tool surface** | `none` / `web` / `search-mcp` — folded in as a one-line knob (`--tools`) | `profiles.ts` | The agent gets up to **3 refine rounds** in **one persistent box**: round N+1's prompt is built from round N's *check failures* (and nothing else — see the firewall). It stops the moment the deterministic checks pass. @@ -30,82 +32,93 @@ The output is a leaderboard with confidence bands and a significance matrix: ``` Harness leaderboard (mean composite, 95% CI; pass-rate, Wilson CI): - claude-code-baseline composite 0.813 [0.813, 0.813] pass 100% [34%, 100%] (n=2) + claude-code-baseline composite 0.813 [0.813, 0.813] pass 100% [34%, 100%] (n=2) ... -Pairwise (paired bootstrap on matched scenarios, BH-corrected): - opencode − claude-code: Δ=0.000 [0.000, 0.000] n.s. 
+Pairwise (paired delta + bootstrap CI; paired-test p, BH-corrected): + opencode-baseline − claude-code-baseline: Δ=0.000 [0.000, 0.000] p=1.000 n.s. ``` -> Offline, every harness runs the **same** scripted solution through the **same** stub judge, so all deltas are 0.000 — that's the honest no-variance result, not a bug. The plumbing (matrix, validators, judge, stats, firewall) all runs for real; only the model is stubbed. Add `--live` for real models and the harnesses separate. +> **Offline, every harness writes the same scripted solution and is scored by the same deterministic mock judge, so all deltas are 0.000** — the honest no-variance result, not a bug. The whole pipeline (matrix, verifier, realness gate, judge wiring, stats, firewall) runs for real; only the agent and the judge model are stubbed offline. `--live` swaps in real harness boxes and a real judge model and the harnesses separate. + +### The offline "agent" is a scripted stand-in + +Offline there is no model, so each scenario's box writes a **canned solution** instead of calling a coding agent — a deterministic stand-in so the example runs with no creds. The scripts are honest: `rate-limiter` **improves across rounds** (round 0 is a `return true` stub the realness gate catches; round 1+ is the real token-bucket), a real refine demo. Offline the toolchain (`tsc` / `biome` / `node --test`) isn't on PATH, so the checks fail fast and all 3 rounds run — which is exactly when you want to see refinement. ## How a tool swap works (one line) -A tool surface is a **preset**, not forked code. Each preset authors the same two fields onto the profile — native web tools on/off and an optional mounted MCP — and the sandbox substrate materializes them into each harness's real config: +A tool surface is a **preset**, not forked code. 
Each preset authors the same two fields onto the profile — native web tools on/off (`profile.tools`) and an optional mounted MCP (`profile.mcp`) — and the sandbox substrate materializes them into each harness's real config: ```ts +withTools(profile, 'none') // baseline: no web tools, no MCP withTools(profile, 'web') // native websearch + webfetch on withTools(profile, 'search-mcp') // mount a search MCP instead -withTools(profile, 'none') // baseline: no web, no MCP ``` -On the CLI it's `--tools none|web|search-mcp`. **Honesty caveat:** a preset only takes effect for a `(harness, lever)` pair the sandbox actually materializes — if a harness has no native `webfetch`, `--tools web` is a no-op *there*. That's a substrate fact, not something this example papers over. +On the CLI it's `--tools none|web|search-mcp`. **Honesty caveat:** a preset only takes effect for a `(harness, lever)` pair the sandbox actually materializes — if a harness has no native `webfetch`, `--tools web` is a no-op *there*. That's a substrate fact, not something this example papers over. Check `@tangle-network/sandbox` for the materialization matrix before trusting a tool swap on a given harness. ## How it stays honest (the no-cheat firewall) **The agent's context is the task prompt — and nothing else.** The grading criteria never reach the box. -- A `CodingScenario` (`scenarios.ts`) splits into `prompt` (the **only** field the agent sees) and eval-only fields: the validator commands, the realness signals, the rubric note. Because they're different fields on one object, "the agent reads the answer key" becomes something you can **see in one place** — it would require the dispatch to write a non-`prompt` field into the box. -- **It does not.** The firewall is one labeled block in **`dispatch.ts`** (`THE NO-CHEAT FIREWALL LIVES HERE`): the only thing that reaches the box is `scenario.prompt`, plus next-round prompts built **only** from validator pass/fail + stderr. 
The rubric, the realness score, and the judge are read *after* the loop, never written in. -- The realness anchor runs **after** the loop and is written **write-only** to the record (`ctx.artifacts`) — the agent can't steer toward a metric it can't read. +- A `CodingScenario` (`scenarios.ts`) splits into `prompt` (the **only** field the agent sees) and eval-only fields: the hidden test fixture, the realness signals, the rubric note. Because they're different fields on one object, "the agent reads the answer key" becomes something you can **see in one place** — it would require the dispatch to write a non-`prompt` field into the agent's context. +- **It does not.** The firewall is one labeled block in **`dispatch.ts`** (`THE NO-CHEAT FIREWALL LIVES HERE`): the only thing the agent reads is `scenario.prompt`, plus next-round prompts built **only** from check pass/fail + output. The hidden test is *seeded* into the box (so `node --test` has a file to run) but its assertions are never described to the agent; the rubric, the realness signals, and the judge are read *after* the loop, never written in. +- The realness gate runs **after** the loop and is recorded on the run — the agent can't steer toward a metric it can't read. ## How it scores (validators before judge) -Scoring runs in strict order, cheapest and most objective first: +Scoring runs in strict order, cheapest and most objective first — an `agent-eval` primitive at each layer: -1. **Deterministic validators (run first, in the box, ~$0).** `typecheck` → `test` → `lint` as shell commands; pass/fail from the exit code. These steer the refine loop. (`validators.ts` · `runBoxChecks`) -2. **Realness anchor (write-only).** `scoreAuthenticity` + `gateRealness` — catches a stub that compiles but fakes the hard part. On the sample tasks it scores a real impl **85** and a `return true` stub **35 (gated)**. (`validators.ts` · `realnessValidator`) -3. 
**LLM judge (last, only on the band the checks can't resolve).** A 4-dimension weighted rubric — correctness 0.40 · completeness 0.25 · code_quality 0.20 · robustness 0.15. The rubric text + anchors live **with the judge**, never in the workdir. (`judges.ts`) +1. **Deterministic checks (first, in the box, ~$0).** An ordered **`MultiLayerVerifier`** pipeline: `typecheck → test → lint`, with dependency-based skip (test never runs on a type error) and a blended score. typecheck + test gate `allPass` (and the refine loop); lint is advisory. These pass/fail booleans are the only thing that steers the next round. (`eval.ts` · `runChecks`) +2. **Realness gate (no LLM, and it GATES).** `scoreAuthenticity` + `gateRealness` — a pure structural scan that catches a stub that compiles but fakes the hard part. It is not just recorded: a **gated** artifact short-circuits the judge to composite **0 with no model call** (a `return true` rate-limiter cannot earn a score, however confident a judge would be). On the sample tasks it scores a real impl ≈ **85** and the `return true` stub **gated → 0**. (`eval.ts` · `realnessGate`, asserted in the smoke test) +3. **LLM judge (last, only on the band the checks can't resolve).** A 4-dimension weighted rubric — correctness 0.40 · completeness 0.25 · code_quality 0.20 · robustness 0.15. The rubric text + anchors live **with the judge**, never in the workdir. (`eval.ts`) **How many judges:** -- **Default: 1** — `singleCodeJudge`, one model. Cheap, for the leaderboard sweep. -- **`--ensemble`: 3** — `ensembleCodeJudge`, three **cross-family** models (deepseek + openai + google). `crossFamily: true` rejects a same-family panel at construction, so the ensemble is genuinely independent. Use it only for a ship/no-ship claim. - -**Validators per cell:** 3 deterministic checks + 1 realness anchor = **4**, all before the judge. +- **Default: 1** — `singleCodeJudge`, built from `llmJudge` (one model call). Cheap, for the leaderboard sweep. 
+- **`--ensemble`: 3** — `ensembleCodeJudge`, built from `ensembleJudge`, three **cross-family** models (deepseek + openai + google). `crossFamily: true` rejects a same-family panel at construction, so the ensemble is genuinely independent. The panel sees the **same full context** (code + check results + rubric note) the single judge does. Use it only for a ship/no-ship claim. ## How the stats are real (`stats.ts`) -Every number is one `agent-eval` primitive call — no hand-rolled statistics: +Every number is one `agent-eval` primitive call — **no hand-rolled statistics and no fake p-values**: - per-harness **mean composite + bootstrap CI** (`confidenceInterval`) - per-harness **pass-rate + Wilson binomial CI** (`wilson`) — the correct interval for a proportion -- every harness **pair** compared on **matched scenarios** with a **paired bootstrap** (`pairedBootstrap`), then **BH-corrected** across all pairs (`benjaminiHochberg`) so running many comparisons doesn't manufacture a false winner +- every harness **pair** compared on **matched scenarios** with a **real paired test** (`pairedTTest`, or `wilcoxonSignedRank` for the non-parametric path) for the p-value, and a **paired bootstrap** (`pairedBootstrap`) for the effect size + CI, then **BH-corrected** across all pairs (`benjaminiHochberg`) so running many comparisons doesn't manufacture a false winner. +- **Pairing discipline:** the paired unit is the *scenario*. With `--reps > 1`, a harness produces several records per scenario; they're averaged to one score per (harness, scenario) before pairing, so reps tighten the per-cell estimate instead of mis-aligning the pairs. + +The leaderboard labels are the readable harness names, not the matrix's internal profile hashes. 
## The files | File | What it owns | |---|---| -| `scenarios.ts` | the held-out task corpus + the firewall-as-a-type (`prompt` vs eval-only fields) | -| `profiles.ts` | the harness axis — one bare baseline `AgentProfile` per harness | -| `tools.ts` | the one-line tool knob (`withTools` + presets) | -| `validators.ts` | deterministic checks (`runBoxChecks`) + the realness anchor (`realnessValidator`) | -| `judges.ts` | the rubric + `singleCodeJudge` (1) and `ensembleCodeJudge` (3) | -| `dispatch.ts` | renders one matrix cell: persistent box + multi-round refine. **The firewall lives here.** | +| `scenarios.ts` | the held-out task corpus + the firewall-as-a-type (`prompt` vs eval-only fields) + the seeded test fixtures + the check commands | +| `profiles.ts` | the harness axis (one bare baseline `AgentProfile` per harness) **and** the one-line tool knob (`withTools` + presets) | +| `eval.ts` | the scoring stack: `runChecks` (`MultiLayerVerifier`) + `realnessGate` + `singleCodeJudge` (`llmJudge`) / `ensembleCodeJudge` (`ensembleJudge`) | +| `dispatch.ts` | renders one matrix cell: persistent box + multi-round refine + token metering. 
**The firewall lives here.** | | `offline-box.ts` | an in-process `SandboxClient` so the whole thing runs with no creds | +| `stats.ts` | leaderboard + `pairedTTest` / `pairedBootstrap` / `benjaminiHochberg` / `confidenceInterval` / `wilson` | | `benchmark.ts` | the entrypoint: build the axes, hand the matrix the dispatch + judges, run, print stats | -| `stats.ts` | pairs harnesses → `pairedBootstrap` / `benjaminiHochberg` / `confidenceInterval` / `wilson` | +| `coding-benchmark.test.ts` | offline smoke — the matrix produces `harnesses × scenarios × reps` records, and the realness gate catches a stub | ## Primitives composed - **matrix:** `runProfileMatrix({ profiles, scenarios, dispatch, judges, reps, integrity, costCeiling })` (`@tangle-network/agent-eval/campaign`) with a `ProfileDispatchFn` rendering each cell - **box + multi-round:** `openSandboxRun(client, opts, deliverable)` → `.start()` / `.resume()` over one persistent, resumable session (`@tangle-network/agent-runtime/loops`) -- **deterministic layer:** the runtime `Validator` seam, run before the judge +- **deterministic layer:** `MultiLayerVerifier` — ordered `typecheck → test → lint` with dependency-based skip and a blended score (`@tangle-network/agent-eval`) +- **token metering:** `extractLlmCallEvent` (`@tangle-network/agent-runtime/loops`) — reads usage off **every** backend event shape (`done` / `result` / `llm_call` / `usage`) so the integrity guard sees a real run - **realness:** `scoreAuthenticity` + `gateRealness` (`@tangle-network/agent-eval/authenticity`) -- **judges:** a hand-built `JudgeConfig`, and `ensembleJudge` + `aggregateJudgeVerdicts` for the panel -- **integrity:** `integrity: 'assert'` on the matrix proves a real backend ran (no stubbed cell) — `'off'` only for the offline stub -- **stats:** `pairedBootstrap`, `benjaminiHochberg`, `confidenceInterval`, `wilson` +- **judges:** `llmJudge` (single model call → canonical `JudgeConfig`) and `ensembleJudge` for the cross-family panel 
(`@tangle-network/agent-eval`); the judge transport is a `ChatClient` (`createChatClient` — a `mock` handler offline, the `router` live) +- **integrity:** `integrity: 'assert'` on the matrix proves a real backend ran (no stubbed cell) — `'off'` only for the offline mock +- **stats:** `pairedTTest`, `wilcoxonSignedRank`, `pairedBootstrap`, `benjaminiHochberg`, `confidenceInterval`, `wilson` ## Going live -Swap `offlineSandboxClient(...)` for a real `@tangle-network/sandbox` client (the `--live` path in `benchmark.ts`) and point the judge's `complete` / `scoreOne` at your router. **Nothing else in the example changes** — same dispatch, same matrix, same stats. That's the point. +`--live` is not "flip a flag and nothing else changes" — it swaps two stubs for real infra. To run it you need: + +1. **`TANGLE_API_KEY` + `SANDBOX_BASE_URL`** — the dispatch lazily `import()`s `@tangle-network/sandbox` (behind the live flag, so the offline path never needs the SDK) and creates a real harness box per cell. +2. **A real judge model** — the judge's `ChatClient` becomes `createChatClient({ transport: 'router', apiKey })`; set `JUDGE_MODEL` (and optionally `TANGLE_ROUTER_URL`) to point it at your router. `--ensemble` then calls three real cross-family models. +3. The matrix runs with `integrity: 'assert'`, so a cell that produced no real token usage fails loudly instead of reporting a clean stub leaderboard. + +Everything else — the dispatch, the verifier, the realness gate, the stats — is identical between offline and live. That's the point: only the agent and the judge model change. **Note on codex:** codex emits no structured tool calls, so per-tool progress is unavailable there. It still runs and scores; that's a harness property, not a gap in this example. 
diff --git a/examples/coding-benchmark/benchmark.ts b/examples/coding-benchmark/benchmark.ts index 3a8b863..d231949 100644 --- a/examples/coding-benchmark/benchmark.ts +++ b/examples/coding-benchmark/benchmark.ts @@ -3,7 +3,7 @@ * scenarios, with controlled tool use, validators-before-judge, real stats, and a * no-cheat firewall. Every moving part is an agent-runtime / agent-eval primitive. * - * # offline (no creds — uses the in-process box + stub judge) + * # offline (no creds — uses the in-process box + a mock judge transport) * pnpm tsx examples/coding-benchmark/benchmark.ts * * # one tool preset / ensemble / more reps @@ -20,6 +20,12 @@ import { mkdtempSync } from 'node:fs' import { tmpdir } from 'node:os' import { join } from 'node:path' +import { + agentProfileId, + type ChatClient, + type ChatResponse, + createChatClient, +} from '@tangle-network/agent-eval' import { inMemoryCampaignStorage, type JudgeConfig, @@ -28,38 +34,54 @@ import { import type { AgentProfile } from '@tangle-network/agent-interface' import type { SandboxClient } from '@tangle-network/agent-runtime/loops' import { codingDispatch } from './dispatch' -import { type CompleteFn, ensembleCodeJudge, type RubricDim, singleCodeJudge } from './judges' +import { ensembleCodeJudge, type RubricDim, type RunArtifact, singleCodeJudge } from './eval' import { type OfflineScript, offlineSandboxClient } from './offline-box' -import { harnessProfiles } from './profiles' +import { harnessProfiles, type ToolPreset } from './profiles' import { type CodingScenario, scenarios } from './scenarios' import { pairwiseStats, renderStats } from './stats' -import type { ToolPreset } from './tools' -import type { RunArtifact } from './validators' + +export interface BenchmarkOptions { + live?: boolean + ensemble?: boolean + toolPreset?: ToolPreset + reps?: number +} // ── flags ─────────────────────────────────────────────────────────────────── -const argv = process.argv.slice(2) -const flag = (name: string) => 
argv.includes(`--${name}`) -const opt = (name: string, fallback: string) => { - const i = argv.indexOf(`--${name}`) - return i >= 0 && argv[i + 1] ? (argv[i + 1] as string) : fallback +function parseArgs(argv: string[]): BenchmarkOptions { + const flag = (name: string) => argv.includes(`--${name}`) + const opt = (name: string, fallback: string) => { + const i = argv.indexOf(`--${name}`) + return i >= 0 && argv[i + 1] ? (argv[i + 1] as string) : fallback + } + return { + live: flag('live'), + ensemble: flag('ensemble'), + toolPreset: opt('tools', 'none') as ToolPreset, + reps: Number(opt('reps', '1')), + } } -const live = flag('live') -const ensemble = flag('ensemble') -const toolPreset = opt('tools', 'none') as ToolPreset -const reps = Number(opt('reps', '1')) - -// ── the offline "agent": a scripted solution per scenario ───────────────────── -// Offline we don't have a model, so each scenario's box writes a canned, REAL -// implementation. (Swap in a `stub` to watch the realness validator catch it.) + +// ── the offline "agent": a scripted, REFINING solution per scenario ─────────── +// Offline we don't have a model, so each scenario's box writes a canned solution. +// `rate-limiter` IMPROVES across rounds (round 0 = a `return true` stub the realness +// gate catches; round 1+ = the real token-bucket) — a real refine demo. `csv-parser` +// writes its real implementation from round 0. 
const offlineSolutions: Record = { 'rate-limiter': { path: 'src/rate-limiter.ts', - solutionFor: () => - `export class RateLimiter {\n private tokens: number\n private last = Date.now()\n` + - ` constructor(private capacity: number, private refillPerSec: number) { this.tokens = capacity }\n` + - ` tryRemove(n: number): boolean {\n const now = Date.now()\n` + - ` this.tokens = Math.min(this.capacity, this.tokens + ((now - this.last) / 1000) * this.refillPerSec)\n` + - ` this.last = now\n if (n > this.tokens) return false\n this.tokens -= n\n return true\n }\n}\n`, + solutionFor: (round) => + round === 0 + ? // round 0 — a stub: compiles, but `tryRemove` is a hardcoded `return true` + // with no refill math. The realness gate flags + gates this. + `export class RateLimiter {\n constructor(private capacity: number, private refillPerSec: number) {}\n` + + ` tryRemove(n: number): boolean { return true }\n}\n` + : // round 1+ — the real token-bucket with continuous time-based refill. + `export class RateLimiter {\n private tokens: number\n private last = Date.now()\n` + + ` constructor(private capacity: number, private refillPerSec: number) { this.tokens = capacity }\n` + + ` tryRemove(n: number): boolean {\n const now = Date.now()\n` + + ` this.tokens = Math.min(this.capacity, this.tokens + ((now - this.last) / 1000) * this.refillPerSec)\n` + + ` this.last = now\n if (n > this.tokens) return false\n this.tokens -= n\n return true\n }\n}\n`, }, 'csv-parser': { path: 'src/csv.ts', @@ -76,59 +98,102 @@ const offlineSolutions: Record = { } // ── the box client: live (real harness) or offline (in-process) ─────────────── -function clientFor(scenario: CodingScenario): (profile: AgentProfile) => SandboxClient { - if (live) { - // Real Tangle sandbox — one real harness box per cell. (Lazy import so the - // offline path never needs the SDK creds.) 
- const apiKey = process.env.TANGLE_API_KEY - const baseUrl = process.env.SANDBOX_BASE_URL - if (!apiKey || !baseUrl) throw new Error('--live needs TANGLE_API_KEY + SANDBOX_BASE_URL') - // eslint-disable-next-line @typescript-eslint/no-require-imports - const { SandboxClient: RealClient } = require('@tangle-network/sandbox') - return () => new RealClient({ apiKey, baseUrl }) as unknown as SandboxClient +function clientFor( + live: boolean, + RealClient: (new (opts: { apiKey: string; baseUrl: string }) => unknown) | undefined, +): (scenario: CodingScenario) => (profile: AgentProfile) => SandboxClient { + return (scenario) => { + if (live) { + const apiKey = process.env.TANGLE_API_KEY + const baseUrl = process.env.SANDBOX_BASE_URL + if (!apiKey || !baseUrl) throw new Error('--live needs TANGLE_API_KEY + SANDBOX_BASE_URL') + if (!RealClient) throw new Error('@tangle-network/sandbox not loaded') + return () => new RealClient({ apiKey, baseUrl }) as unknown as SandboxClient + } + const script = offlineSolutions[scenario.id] + if (!script) throw new Error(`no offline script for scenario ${scenario.id}`) + return () => offlineSandboxClient(script) } - const script = offlineSolutions[scenario.id] - if (!script) throw new Error(`no offline script for scenario ${scenario.id}`) - return () => offlineSandboxClient(script) } -// ── the judge(s): one model, or a 3-model cross-family ensemble ─────────────── -// Offline the model caller is a deterministic stub (so the pipeline runs with no -// creds). Live, point `complete` / `scoreOne` at your router. -const stubComplete: CompleteFn = async () => - JSON.stringify({ - correctness: 0.85, - completeness: 0.8, - code_quality: 0.8, - robustness: 0.75, - notes: 'stub', +// ── the judge transport: a real router (live) or a deterministic mock (offline) ─ +// Offline the mock handler returns a fixed rubric verdict so the pipeline runs with +// no creds. Live, `createChatClient({ transport: 'router', apiKey })` calls the real +// router. 
The SAME `singleCodeJudge` / `ensembleCodeJudge` wiring runs either way. +function judgeChat(live: boolean): ChatClient { + if (live) { + const apiKey = process.env.TANGLE_API_KEY + if (!apiKey) throw new Error('--live needs TANGLE_API_KEY for the judge router') + return createChatClient({ + transport: 'router', + apiKey, + ...(process.env.TANGLE_ROUTER_URL ? { baseUrl: process.env.TANGLE_ROUTER_URL } : {}), + defaultModel: process.env.JUDGE_MODEL ?? 'openai/gpt-4.1-2025-04-14', + }) + } + const verdict = JSON.stringify({ + dimensions: { correctness: 0.85, completeness: 0.8, code_quality: 0.8, robustness: 0.75 }, + notes: 'offline mock judge', }) + return createChatClient({ + transport: 'mock', + defaultModel: 'mock-judge', + handler: async (): Promise => ({ + content: verdict, + usage: { promptTokens: 0, completionTokens: 0, totalTokens: 0 }, + costUsd: 0, + model: 'mock-judge', + durationMs: 0, + raw: {}, + }), + }) +} -const stubScoreOne = async (): Promise> => ({ - correctness: 0.85, - completeness: 0.8, - code_quality: 0.8, - robustness: 0.75, -}) - -function judges(): JudgeConfig[] { - if (ensemble) { - // ensembleCodeJudge returns JudgeConfig; the matrix accepts it on - // any artifact — cast to the cell artifact type for the typed judges array. - return [ensembleCodeJudge(stubScoreOne) as unknown as JudgeConfig] +function judges( + opts: BenchmarkOptions, + chat: ChatClient, +): JudgeConfig[] { + if (opts.ensemble) { + // The ensemble scores each panel model through the SAME chat transport — offline + // that is the mock, live it is the router. It sees the SAME full context the + // single judge does. 
+ const scoreOne = async (model: string, context: string): Promise> => { + const res = await chat.chat({ model, messages: [{ role: 'user', content: context }] }) + const parsed = JSON.parse(res.content) as { dimensions: Record } + return parsed.dimensions + } + return [ensembleCodeJudge(scoreOne)] } - return [singleCodeJudge(stubComplete)] + return [singleCodeJudge(chat)] } // ── the sweep ───────────────────────────────────────────────────────────────── -async function main(): Promise { +export async function main(argv: string[] = process.argv.slice(2)): Promise { + const opts = parseArgs(argv) + const live = opts.live ?? false + const reps = opts.reps ?? 1 + const toolPreset = opts.toolPreset ?? 'none' const runDir = mkdtempSync(join(tmpdir(), 'coding-benchmark-')) + + // Lazy dynamic import so the offline path never needs the SDK or its creds. (This + // is an ESM "type":"module" package — a top-level `require` would throw.) + let RealClient: (new (o: { apiKey: string; baseUrl: string }) => unknown) | undefined + if (live) { + const sdk = (await import('@tangle-network/sandbox')) as { + SandboxClient: new (o: never) => unknown + } + RealClient = sdk.SandboxClient as never + } + console.log( `coding-benchmark · ${live ? 'LIVE' : 'OFFLINE'} · tools=${toolPreset} · ` + - `judges=${ensemble ? '3 (ensemble)' : '1'} · reps=${reps} · ` + + `judges=${opts.ensemble ? '3 (ensemble)' : '1'} · reps=${reps} · ` + `harnesses=${harnessProfiles.length} · scenarios=${scenarios.length}`, ) + const chat = judgeChat(live) + const resolveClient = clientFor(live, RealClient) + // The matrix runs one campaign per profile. The dispatch is per-scenario only in // its CLIENT (offline scripts differ by scenario), so run each scenario's matrix // and merge the records. (Live, one client serves all scenarios — collapse this.) 
@@ -137,10 +202,10 @@ async function main(): Promise { const result = await runProfileMatrix({ profiles: harnessProfiles, // axis: harness × baseline scenarios: [scenario], // axis: tasks (one at a time so the offline client matches) - dispatch: codingDispatch(toolPreset, clientFor(scenario)), - judges: judges(), + dispatch: codingDispatch(toolPreset, resolveClient(scenario)), + judges: judges(opts, chat), reps, - integrity: live ? 'assert' : 'off', // offline stub has no real backend; live proves it + integrity: live ? 'assert' : 'off', // offline mock has no real backend; live proves it costCeiling: 5, runDir, commitSha: process.env.GIT_SHA ?? 'example', @@ -149,11 +214,25 @@ async function main(): Promise { allRecords.push(...result.records) } + // Map the matrix's hashed profileId → the readable harness name for the leaderboard. + const nameById = new Map(harnessProfiles.map((p) => [agentProfileId(p), p.name ?? 'unknown'])) + const nameOf = (id: string) => nameById.get(id) ?? id + const report = pairwiseStats(allRecords, nameOf) + console.log(`\nrecords: ${allRecords.length}\n`) - console.log(renderStats(pairwiseStats(allRecords))) + console.log(renderStats(report)) + return { records: allRecords.length, leaderboard: report.leaderboard.length } +} + +export interface RunArtifactSummary { + records: number + leaderboard: number } -main().catch((err) => { - console.error(err instanceof Error ? (err.stack ?? err.message) : String(err)) - process.exit(1) -}) +// Run only when invoked directly (not when imported by the smoke test). +if (import.meta.url === `file://${process.argv[1]}`) { + main().catch((err) => { + console.error(err instanceof Error ? (err.stack ?? 
err.message) : String(err)) + process.exit(1) + }) +} diff --git a/examples/coding-benchmark/coding-benchmark.test.ts b/examples/coding-benchmark/coding-benchmark.test.ts new file mode 100644 index 0000000..e99a4a9 --- /dev/null +++ b/examples/coding-benchmark/coding-benchmark.test.ts @@ -0,0 +1,53 @@ +/** + * Offline smoke test — proves the whole pipeline runs with no creds and that the + * two load-bearing honesty claims hold: + * 1. the matrix produces exactly `harnesses × scenarios × reps` records and a + * defined leaderboard (the wiring is real, not a stub that returns nothing); + * 2. the realness gate actually catches a `return true` stub and gates it to 0 + * (the README's anti-fake claim, asserted against the real scan). + */ + +import { describe, expect, it } from 'vitest' +import { main } from './benchmark' +import { realnessGate } from './eval' +import { harnessProfiles } from './profiles' +import { scenarios } from './scenarios' + +describe('coding-benchmark (offline)', () => { + // Integration smoke: runs the real matrix end-to-end (real box.exec on the offline + // toolchain, all refine rounds since the checks can't pass without the toolchain). 
+ it('runs the full matrix and returns a defined leaderboard', async () => { + const reps = 1 + const summary = await main(['--reps', String(reps)]) + expect(summary.records).toBe(harnessProfiles.length * scenarios.length * reps) + expect(summary.leaderboard).toBe(harnessProfiles.length) + }, 180_000) + + it('realness gate catches a return-true stub', () => { + const rl = scenarios.find((s) => s.id === 'rate-limiter') + expect(rl).toBeDefined() + const stub = 'export class RateLimiter { tryRemove(n: number): boolean { return true } }\n' + const verdict = realnessGate( + [{ path: 'src/rate-limiter.ts', content: stub }], + (rl as NonNullable).realnessSignals, + ) + expect(verdict.gated).toBe(true) + expect(verdict.score).toBe(0) + }) + + it('realness gate passes a real token-bucket implementation', () => { + const rl = scenarios.find((s) => s.id === 'rate-limiter') + const real = + 'export class RateLimiter {\n private tokens = 0\n private last = Date.now()\n' + + ' constructor(private capacity: number, private refillPerSec: number) { this.tokens = capacity }\n' + + ' tryRemove(n: number): boolean {\n const now = Date.now()\n' + + ' this.tokens = Math.min(this.capacity, this.tokens + ((now - this.last) / 1000) * this.refillPerSec)\n' + + ' this.last = now\n if (n > this.tokens) return false\n this.tokens -= n\n return true\n }\n}\n' + const verdict = realnessGate( + [{ path: 'src/rate-limiter.ts', content: real }], + (rl as NonNullable).realnessSignals, + ) + expect(verdict.gated).toBe(false) + expect(verdict.score).toBeGreaterThan(0) + }) +}) diff --git a/examples/coding-benchmark/dispatch.ts b/examples/coding-benchmark/dispatch.ts index 221b3f0..7143a13 100644 --- a/examples/coding-benchmark/dispatch.ts +++ b/examples/coding-benchmark/dispatch.ts @@ -4,20 +4,23 @@ * back the `RunArtifact` the judges score. * * This file composes four primitives and nothing bespoke: - * - `createExecutor`/`new SandboxClient` give the box (live) — or `offlineSandboxClient` (offline). 
+ * - `offlineSandboxClient` (offline) or `new SandboxClient(...)` (live) give the box. * - `openSandboxRun(client, opts, deliverable)` opens ONE persistent, resumable box. * `.start(prompt)` = round 1; `.resume(prompt)` = round N over the SAME session. * That IS the "each round builds on the prior output" loop — no extra combinator. - * - `runBoxChecks` (validators.ts) runs the deterministic checks in the box each round. - * - `ctx.cost.observeTokens(...)` reports usage so the backend-integrity guard sees a real run. + * - `runChecks` (eval.ts) runs the deterministic `MultiLayerVerifier` pipeline each round. + * - `extractLlmCallEvent` (the runtime's own metering seam) reads token usage off the + * stream — across ALL backend event shapes — and reports it so the backend-integrity + * guard sees a real run. * * ┌─────────────────────────────────────────────────────────────────────────┐ * │ THE NO-CHEAT FIREWALL LIVES HERE. │ * │ The ONLY scenario field that ever reaches the box is `scenario.prompt` │ - * │ (the `agentRun.taskToPrompt` below, and `nextPrompt` built ONLY from │ - * │ validator stderr). The rubric, the realness signals, and the grading │ - * │ note are read later by judges.ts / the realness validator — never written │ - * │ into the box. The agent literally cannot read the answer key. │ + * │ (the `taskToPrompt` below, and `nextPrompt` built ONLY from validator │ + * │ output). The hidden test is SEEDED into the box but never described to │ + * │ the agent; the rubric, the realness signals, and the grading note are │ + * │ read later by eval.ts — never written into the box. The agent literally │ + * │ cannot read the answer key. 
│ * └─────────────────────────────────────────────────────────────────────────┘ */ @@ -26,39 +29,30 @@ import type { DispatchContext, ProfileDispatchFn } from '@tangle-network/agent-e import type { AgentProfile } from '@tangle-network/agent-interface' import { type AgentRunSpec, - type DefaultVerdict, + extractLlmCallEvent, openSandboxRun, type SandboxClient, } from '@tangle-network/agent-runtime/loops' -import { harnessOf } from './profiles' -import type { CodingScenario } from './scenarios' -import { type ToolPreset, withTools } from './tools' -import { - type BoxCheckResult, - type CheckBox, - type RunArtifact, - realnessValidator, - runBoxChecks, -} from './validators' +import { type CheckBox, layerOutput, type RunArtifact, realnessGate, runChecks } from './eval' +import { harnessOf, type ToolPreset, withTools } from './profiles' +import { type CodingScenario, checkCmds } from './scenarios' /** Max refine rounds. Round N+1's prompt is built from round N's CHECK output only. */ const maxRounds = 3 -/** Build the next-round prompt from the validators the AGENT is allowed to see — - * pass/fail + stderr. NEVER from the rubric, realness, or judge. This is the - * firewall in action: the agent steers on objective check failures, nothing else. */ -function nextPrompt(checks: BoxCheckResult): string { +/** Build the next-round prompt from the checks the AGENT is allowed to see — the + * pass/fail + output of the deterministic layers. NEVER from the rubric, realness, + * or judge. This is the firewall in action: the agent steers on objective check + * failures, nothing else. 
*/ +function nextPrompt(report: RunArtifact['checks']): string { const fails: string[] = [] - if (!checks.typecheck.passed) - fails.push(`typecheck failed:\n${checks.typecheck.output.slice(0, 1200)}`) - if (!checks.test.passed) fails.push(`tests failed:\n${checks.test.output.slice(0, 1200)}`) - if (!checks.lint.passed) fails.push(`lint failed:\n${checks.lint.output.slice(0, 600)}`) + for (const layer of ['typecheck', 'test', 'lint'] as const) { + const c = layerOutput(report, layer) + if (!c.passed) fails.push(`${layer} failed:\n${c.output.slice(0, 1200)}`) + } return `Your solution did not pass these checks. Fix the file and try again.\n\n${fails.join('\n\n')}` } -/** A box exposing the methods both `openSandboxRun` and the validators call. */ -type RunBox = CheckBox & { fs: { read(path: string): Promise } } - /** * The dispatch factory. Curry the tool preset + the sandbox client; return a * `ProfileDispatchFn` the matrix calls once per cell. @@ -79,6 +73,7 @@ export function codingDispatch( // Author the tool surface onto the profile (one line). The substrate // materializes it into the harness's real config. const equippedProfile = withTools(profile, toolPreset) + const cmds = checkCmds(scenario) const agentRun: AgentRunSpec = { profile: equippedProfile, @@ -102,7 +97,7 @@ export function codingDispatch( ) try { - let checks: BoxCheckResult = { typecheck: blank, test: blank, lint: blank, allPass: false } + let checks = blankReport() let solution = '' let files: ProducedFile[] = [] let finalText = '' @@ -115,22 +110,20 @@ export function codingDispatch( finalText = turn.events.map(eventText).filter(Boolean).join(' ').slice(0, 2000) // Report usage so the integrity guard sees a real backend (not a stub). + // `extractLlmCallEvent` reads usage off EVERY backend event shape — the live + // sandbox's `done`/`result`/`llm_call` events all sum correctly here. 
const usage = sumTokens(turn.events) if (usage.input || usage.output) ctx.cost.observeTokens(usage) // Deterministic checks, IN THE BOX, this round. These (and only these) steer // the next round — the firewall keeps the rubric/realness out of the loop. - checks = await runBoxChecks(run.box as unknown as RunBox, scenario.validatorCmds) + checks = await runChecks(run.box as unknown as CheckBox, scenario, cmds) if (checks.allPass) break // stop on worker-observable green only } // The realness anchor runs AFTER the loop — never inside it, so it can never - // steer the agent. Its verdict is recorded for honesty (`ctx.artifacts`) and - // carried on the artifact for the record; the box never saw the signals. - const realness = await realnessValidator(scenario.realnessSignals).validate( - { files, solution, finalText, checks, realness: emptyVerdict }, - { iteration: maxRounds, signal: ctx.signal }, - ) + // steer the agent. Its verdict is recorded for honesty AND gates the judge. + const realness = realnessGate(files, scenario.realnessSignals) await ctx.artifacts.writeJson(`realness/${ctx.cellId}.json`, realness) return { files, solution, finalText, checks, realness } @@ -140,11 +133,24 @@ export function codingDispatch( } } -const blank = { passed: false, output: '' } - -/** A placeholder verdict for the artifact passed INTO the realness validator (which - * reads only `files`, never this field). The real verdict replaces it on return. */ -const emptyVerdict: DefaultVerdict = { valid: false, score: 0 } +/** An empty verifier report for the pre-loop state (no layer has run yet). */ +function blankReport(): RunArtifact['checks'] { + const now = new Date().toISOString() + return { + layers: [], + passCount: 0, + failCount: 0, + skippedCount: 0, + errorCount: 0, + allPass: false, + blendedScore: 0, + valid: false, + score: 0, + durationMs: 0, + startedAt: now, + finishedAt: now, + } +} /** Pull the agent's text out of a stream event (best-effort, for judge context). 
*/ function eventText(ev: unknown): string { @@ -153,15 +159,17 @@ function eventText(ev: unknown): string { } /** Sum token usage across the turn's events into the `{ input, output }` shape - * `ctx.cost.observeTokens` (and `RunTokenUsage`) expect. */ + * `ctx.cost.observeTokens` expects, using the runtime's own metering extractor so + * EVERY backend event shape (`done`/`result`/`llm_call`/`usage`) is counted. */ function sumTokens(events: unknown[]): { input: number; output: number } { let input = 0 let output = 0 for (const ev of events) { - const d = (ev as { data?: { tokenUsage?: { inputTokens?: number; outputTokens?: number } } }) - .data - input += d?.tokenUsage?.inputTokens ?? 0 - output += d?.tokenUsage?.outputTokens ?? 0 + const call = extractLlmCallEvent(ev as never, 'agent') + if (call) { + input += call.tokensIn ?? 0 + output += call.tokensOut ?? 0 + } } return { input, output } } diff --git a/examples/coding-benchmark/eval.ts b/examples/coding-benchmark/eval.ts new file mode 100644 index 0000000..7899212 --- /dev/null +++ b/examples/coding-benchmark/eval.ts @@ -0,0 +1,303 @@ +/** + * The SCORING stack, in the order it runs — cheapest and most objective first. + * + * 1. DETERMINISTIC CHECKS (in the box, ~$0) — an ordered `MultiLayerVerifier` + * pipeline: typecheck → test → lint, with dependency-based skip (test never + * runs on a type error) and a blended score. These pass/fail booleans steer the + * refine loop (see the firewall in dispatch.ts). + * 2. REALNESS GATE (no LLM) — `scoreAuthenticity` + `gateRealness`. Catches a stub + * that compiles but fakes the hard part. It does not just record a verdict — it + * GATES: a gated artifact short-circuits the judge to composite 0. + * 3. LLM JUDGE (last, only on the band the checks can't resolve) — one `llmJudge` + * model call for the leaderboard, or a cross-family `ensembleJudge` panel for a + * ship/no-ship claim. 
Both see the SAME full context (code + rubric + check + * results); the rubric anchors live HERE, never in the agent's workdir. + * + * Every layer is a published agent-eval primitive — `MultiLayerVerifier`, `llmJudge`, + * `ensembleJudge`, `scoreAuthenticity`/`gateRealness`. No hand-rolled scorer. + */ + +import { + type ChatClient, + ensembleJudge, + type Layer, + llmJudge, + MultiLayerVerifier, + type VerificationReport, +} from '@tangle-network/agent-eval' +import { + type AuthenticitySignals, + gateRealness, + type ProducedFile, + scoreAuthenticity, +} from '@tangle-network/agent-eval/authenticity' +import type { JudgeConfig, JudgeScore } from '@tangle-network/agent-eval/campaign' +import type { CodingScenario, Fixture } from './scenarios' + +// ── the rubric (4 weighted dimensions, total 1.0) ───────────────────────────── +// The rubric text + anchors live HERE, with the judge — never in the workdir. The +// agent is graded against criteria it cannot read. +export const rubric = { + correctness: { + weight: 0.4, + description: 'Does the code correctly implement the spec for all stated cases?', + }, + completeness: { + weight: 0.25, + description: 'Are all required behaviors and edge cases handled, nothing stubbed?', + }, + code_quality: { + weight: 0.2, + description: 'Is it clear, idiomatic, dependency-free as required, and maintainable?', + }, + robustness: { + weight: 0.15, + description: 'Does it handle malformed / boundary input without crashing or misbehaving?', + }, +} as const + +export type RubricDim = keyof typeof rubric +const dimKeys = Object.keys(rubric) as RubricDim[] +const weights = Object.fromEntries(dimKeys.map((k) => [k, rubric[k].weight])) as Record< + RubricDim, + number +> +const dimensions = dimKeys.map((k) => ({ key: k, description: rubric[k].description })) + +// ── the artifact the dispatch produces and the judges score ─────────────────── +export interface RunArtifact { + /** Files the agent produced, as `{ path, content }` — the 
realness currency. */ + files: ProducedFile[] + /** The solution file's content (convenience; also present in `files`). */ + solution: string + /** The agent's final chat text for the round (judge context). */ + finalText: string + /** The deterministic verifier report from the LAST round. */ + checks: VerificationReport + /** The realness gate verdict, computed AFTER the loop. Recorded for honesty AND + * read by the judge: a gated artifact short-circuits the judge to composite 0. */ + realness: RealnessVerdict +} + +export interface RealnessVerdict { + /** 0..1 deterministic realness (0 when gated). */ + score: number + /** True when the artifact faked or omitted the required deliverable. */ + gated: boolean + /** Human-readable flags + gate reason for the record. */ + notes: string +} + +// ── layer 1: the deterministic check pipeline ───────────────────────────────── + +/** The minimal box surface the checks need — a subset of the real `SandboxInstance`. + * The live sandbox satisfies it; the offline in-process box implements it too. */ +export interface CheckBox { + exec(command: string): Promise<{ exitCode: number; stdout: string; stderr: string }> +} + +/** Seed an eval-only file into the box via `exec` (base64 → file). Works on the + * `exec`-only surface, offline and live. The fixture's CONTENT is never described + * to the agent — this is write-only scaffold, not part of the prompt (the firewall). */ +async function seedFile(box: CheckBox, file: Fixture): Promise { + const b64 = Buffer.from(file.content, 'utf8').toString('base64') + const dir = file.path.includes('/') ? file.path.slice(0, file.path.lastIndexOf('/')) : '.' + await box.exec(`mkdir -p ${dir} && printf %s '${b64}' | base64 -d > ${file.path}`) +} + +/** One check command → a `Layer`. Pass/fail comes from the exit code. `advisory` + * layers always report `pass` (they ran) and fold their cleanliness into the + * blended score without gating `allPass` — that is how lint stays advisory. 
*/ +function checkLayer( + name: string, + command: string, + opts: { + dependsOn?: string[] + advisory?: boolean + }, +): Layer { + return { + name, + ...(opts.dependsOn ? { dependsOn: opts.dependsOn } : {}), + async run({ env: box }) { + const r = await box.exec(command) + const ok = r.exitCode === 0 + const output = `${r.stdout}\n${r.stderr}`.trim() + const findings = ok + ? [] + : [ + { + severity: 'major' as const, + message: `${name} failed`, + evidence: output.slice(0, 1200), + }, + ] + if (opts.advisory) { + // Always "ran"; cleanliness folds into the blended score, never gates allPass. + return { + layer: name, + status: 'pass' as const, + score: ok ? 1 : 0.5, + durationMs: 0, + findings, + detail: { output }, + } + } + return { + layer: name, + status: ok ? ('pass' as const) : ('fail' as const), + score: ok ? 1 : 0, + durationMs: 0, + findings, + detail: { output }, + } + }, + } +} + +/** + * Run the scenario's deterministic checks in the box as an ordered pipeline. Seeds + * the hidden test first, then typecheck → test → lint. `report.allPass` is true only + * when typecheck AND test pass (lint is advisory). The `report.layers[*].detail.output` + * is what the refine loop reads to build the next prompt. + */ +export async function runChecks( + box: CheckBox, + scenario: CodingScenario, + cmds: { typecheck: string; test: string; lint: string }, +): Promise { + await seedFile(box, scenario.fixture) + const verifier = new MultiLayerVerifier([ + checkLayer('typecheck', cmds.typecheck, {}), + checkLayer('test', cmds.test, { dependsOn: ['typecheck'] }), + checkLayer('lint', cmds.lint, { dependsOn: ['typecheck'], advisory: true }), + ]) + return verifier.run({ env: box, overallCapMs: 120_000 }) +} + +/** Pull one check layer's captured output (for the refine prompt). 
*/ +export function layerOutput( + report: VerificationReport, + layer: string, +): { passed: boolean; output: string } { + const r = report.layers.find((l) => l.layer === layer) + return { + passed: r?.status === 'pass', + output: typeof r?.detail?.output === 'string' ? r.detail.output : '', + } +} + +// ── layer 2: the realness gate (no LLM) ─────────────────────────────────────── + +/** + * Deterministic realness scan. `scoreAuthenticity` is a pure structural scan + * (required artifact present? hard part implemented? or a fake shim?), and + * `gateRealness` caps anything that faked or omitted the required artifact. The + * verdict is recorded AND read by the judge — a gated artifact cannot earn a score. + */ +export function realnessGate(files: ProducedFile[], signals: AuthenticitySignals): RealnessVerdict { + const result = scoreAuthenticity(files, signals) + const gate = gateRealness(result, { requireArtifact: true }) + const flags = result.flags.length > 0 ? ` — flags: ${result.flags.join(', ')}` : '' + return { + score: gate.gated ? 0 : result.realness / 100, + gated: gate.gated, + notes: `${gate.gated ? `GATED (${gate.reason ?? 'fake/missing artifact'})` : 'real'}${flags}`, + } +} + +// ── layer 3: the LLM judge(s) ───────────────────────────────────────────────── + +/** The judge instructions — the rubric anchors, kept with the judge ONLY. */ +const judgePrompt = [ + 'You are a senior code reviewer scoring a candidate solution to a coding task.', + 'Score each dimension from 0 to 1 (1 = excellent), using the criteria provided.', +].join(' ') + +/** The full context every judge sees: the code + the deterministic check results + + * the eval-only rubric note. Shared by the single judge AND the ensemble so the + * panel never grades on less information than the leaderboard judge. 
*/ +function renderForJudge(artifact: RunArtifact, scenario: CodingScenario): string { + return [ + `Task intent: ${scenario.prompt}`, + `Grading note: ${scenario.rubricNote}`, + `Deterministic checks — typecheck:${layerOutput(artifact.checks, 'typecheck').passed} ` + + `test:${layerOutput(artifact.checks, 'test').passed} lint:${layerOutput(artifact.checks, 'lint').passed}`, + `Realness: ${artifact.realness.notes}`, + '', + 'Candidate solution:', + '```ts', + artifact.solution.slice(0, 8000), + '```', + ].join('\n') +} + +/** ── ONE judge ────────────────────────────────────────────────────────────── + * `llmJudge` builds a campaign `JudgeConfig` whose `score()` makes ONE model call + * against the rubric and reduces it to a canonical `{ dimensions, composite, notes }`. + * We wrap it so a realness-gated artifact short-circuits to composite 0 WITHOUT a + * model call — the realness gate genuinely gates the judge. */ +export function singleCodeJudge( + chat: ChatClient, + model?: string, +): JudgeConfig { + const base = llmJudge('code-quality', judgePrompt, { + chat, + ...(model ? { model } : {}), + dimensions, + weights, + scale: 'unit', + appliesTo: (s) => s.kind === 'coding', + renderUser: ({ artifact, scenario }) => renderForJudge(artifact, scenario), + }) + return gatedByRealness(base) +} + +/** ── THREE judges ──────────────────────────────────────────────────────────── + * `ensembleJudge` fans the artifact across N cross-family models in parallel and + * reduces surviving verdicts to one `JudgeScore`. A model that throws is excluded, + * never folded into a zero. `crossFamily: true` rejects a same-family panel at + * construction. The panel sees the SAME full context as the single judge. 
*/ +export function ensembleCodeJudge( + scoreOne: (model: string, context: string) => Promise>, +): JudgeConfig { + const base = ensembleJudge({ + name: 'code-quality-ensemble', + dimensions: dimKeys, + models: ['deepseek-chat', 'gpt-4o-mini', 'gemini-flash'], + crossFamily: true, + weights, + scoreWith: async (model, input) => { + const artifact = input.artifact as RunArtifact + const scenario = input.scenario as CodingScenario + const perDimension = await scoreOne(model, renderForJudge(artifact, scenario)) + return { model, perDimension } + }, + }) as JudgeConfig + return gatedByRealness(base) +} + +/** Wrap a judge so a realness-gated artifact short-circuits to composite 0 with no + * model call. This is the gate ACTUALLY gating: a stub that faked the hard part + * cannot earn a judge score, however confident the model would have been. */ +function gatedByRealness( + judge: JudgeConfig, +): JudgeConfig { + return { + ...judge, + score(input: { + artifact: RunArtifact + scenario: CodingScenario + signal: AbortSignal + }): JudgeScore | Promise { + if (input.artifact.realness.gated) { + return { + dimensions: Object.fromEntries(dimKeys.map((k) => [k, 0])), + composite: 0, + notes: `realness-gated: ${input.artifact.realness.notes}`, + } + } + return judge.score(input) + }, + } +} diff --git a/examples/coding-benchmark/judges.ts b/examples/coding-benchmark/judges.ts deleted file mode 100644 index ee99f45..0000000 --- a/examples/coding-benchmark/judges.ts +++ /dev/null @@ -1,147 +0,0 @@ -/** - * The JUDGE layer — runs LAST, only on the band the deterministic checks can't - * resolve (e.g. "it builds and passes tests, but is the design good?"). - * - * HOW MANY JUDGES: - * - default leaderboard sweep → ONE judge (`singleCodeJudge`), one model. Cheap. - * - ship/no-ship claim → THREE judges (`ensembleCodeJudge`), cross-family - * models, reduced by `aggregateJudgeVerdicts` inside - * `ensembleJudge`. 
`crossFamily: true` forbids three - * models from the same family at construction, so the - * "ensemble" is genuinely independent. - * - * THE RUBRIC (4 weighted dimensions, total 1.0): - * correctness 0.40 · completeness 0.25 · code_quality 0.20 · robustness 0.15 - * The rubric text + anchors live HERE, with the judge — never in the workdir. The - * agent is graded against criteria it cannot read. - * - * Both judges are campaign `JudgeConfig`s (the shape `runProfileMatrix.judges` - * takes). There is NO `llmJudge` helper in agent-eval today, so the single judge is - * a hand-built `JudgeConfig` whose `score()` does one model call; the ensemble is - * `ensembleJudge(...)`, which already returns a `JudgeConfig`. - */ - -import { ensembleJudge } from '@tangle-network/agent-eval' -import type { JudgeConfig, Scenario } from '@tangle-network/agent-eval/campaign' -import type { CodingScenario } from './scenarios' -import type { RunArtifact } from './validators' - -/** The rubric: name → (description shown to the judge, weight in the composite). */ -export const rubric = { - correctness: { - weight: 0.4, - description: 'Does the code correctly implement the spec for all stated cases?', - }, - completeness: { - weight: 0.25, - description: 'Are all required behaviors and edge cases handled, nothing stubbed?', - }, - code_quality: { - weight: 0.2, - description: 'Is it clear, idiomatic, dependency-free as required, and maintainable?', - }, - robustness: { - weight: 0.15, - description: 'Does it handle malformed / boundary input without crashing or misbehaving?', - }, -} as const - -export type RubricDim = keyof typeof rubric -const dimKeys = Object.keys(rubric) as RubricDim[] -const weights = Object.fromEntries(dimKeys.map((k) => [k, rubric[k].weight])) as Record< - RubricDim, - number -> - -/** Inject your model caller. `(system, user) → completion`. 
The default below - * calls the Tangle router when `TANGLE_API_KEY` is set; offline it returns a - * deterministic stub so the pipeline runs with no creds. */ -export type CompleteFn = (system: string, user: string) => Promise - -/** The judge's instructions — the rubric anchors. Kept with the judge ONLY. */ -function judgeSystemPrompt(): string { - const dims = dimKeys - .map((k) => `- ${k} (weight ${rubric[k].weight}): ${rubric[k].description}`) - .join('\n') - return [ - 'You are a senior code reviewer scoring a candidate solution to a coding task.', - 'Score each dimension from 0 to 1 (1 = excellent). Reply with ONLY JSON:', - '{"correctness":0.x,"completeness":0.x,"code_quality":0.x,"robustness":0.x,"notes":"..."}', - '', - 'Dimensions:', - dims, - ].join('\n') -} - -/** What the judge sees: the produced code + check results + the eval-only rubric - * note. (This runs in the harness, not in the agent's box — see the firewall.) */ -function judgeUserPrompt(artifact: RunArtifact, scenario: CodingScenario): string { - return [ - `Task intent: ${scenario.prompt}`, - `Grading note: ${scenario.rubricNote}`, - `Deterministic checks — typecheck:${artifact.checks.typecheck.passed} test:${artifact.checks.test.passed} lint:${artifact.checks.lint.passed}`, - '', - 'Candidate solution:', - '```ts', - artifact.solution.slice(0, 8000), - '```', - ].join('\n') -} - -/** Parse the judge's JSON; fail-closed (a bad response scores 0, never a fake pass). */ -function parseScores(raw: string): Record { - try { - const json = JSON.parse(raw.slice(raw.indexOf('{'), raw.lastIndexOf('}') + 1)) - const out = {} as Record - for (const k of dimKeys) { - const v = Number(json[k]) - out[k] = Number.isFinite(v) ? 
Math.max(0, Math.min(1, v)) : 0 - } - return out - } catch { - return Object.fromEntries(dimKeys.map((k) => [k, 0])) as Record - } -} - -function composite(scores: Record): number { - return dimKeys.reduce((sum, k) => sum + scores[k] * weights[k], 0) -} - -/** ── ONE judge ──────────────────────────────────────────────────────────── - * A hand-built campaign `JudgeConfig` whose `score()` makes a single model call. - * `appliesTo` runs it only on coding scenarios (a no-op here, shown for the shape). */ -export function singleCodeJudge(complete: CompleteFn): JudgeConfig { - return { - name: 'code-quality', - dimensions: dimKeys.map((k) => ({ key: k, description: rubric[k].description })), - appliesTo: (s: Scenario) => s.kind === 'coding', - async score({ artifact, scenario }) { - const raw = await complete(judgeSystemPrompt(), judgeUserPrompt(artifact, scenario)) - const scores = parseScores(raw) - return { dimensions: scores, composite: composite(scores), notes: 'single-judge' } - }, - } -} - -/** ── THREE judges ───────────────────────────────────────────────────────── - * `ensembleJudge` fans the artifact across N cross-family models in parallel and - * reduces surviving verdicts to one `JudgeScore`. A model that throws is excluded, - * never folded into a zero. Use this only for a ship/no-ship claim. */ -export function ensembleCodeJudge( - scoreOne: (model: string, code: string, intent: string) => Promise>, -): JudgeConfig { - return ensembleJudge({ - name: 'code-quality-ensemble', - dimensions: dimKeys, - // Three cross-family models — independence enforced at construction. - models: ['deepseek-chat', 'gpt-4o-mini', 'gemini-flash'], - crossFamily: true, - weights, - scoreWith: async (model, input) => { - const artifact = input.artifact as RunArtifact - const scenario = input.scenario as CodingScenario | undefined - const perDimension = await scoreOne(model, artifact.solution, scenario?.prompt ?? 
'') - return { model, perDimension } - }, - }) -} diff --git a/examples/coding-benchmark/offline-box.ts b/examples/coding-benchmark/offline-box.ts index 52b704e..785bc53 100644 --- a/examples/coding-benchmark/offline-box.ts +++ b/examples/coding-benchmark/offline-box.ts @@ -2,20 +2,24 @@ * The OFFLINE seam — an in-process `SandboxClient` so the WHOLE benchmark runs * with no creds and no network, exactly like `examples/ui-audit/` does. * + * The offline "agent" is a SCRIPTED STAND-IN for a real coding agent: it writes a + * canned solution per round instead of calling a model. That is the only thing + * stubbed — the matrix, the verifier, the realness gate, the judge wiring, and the + * stats all run for real. `--live` swaps this client for `new SandboxClient(...)` + * and the same dispatch runs each round in a real harness box. + * * It implements only what `openSandboxRun` actually calls on a box: - * - `streamPrompt(prompt, opts)` — the "agent" turn. Offline it deterministically - * writes a canned solution into a real temp workspace and emits one terminal - * `result` event carrying finalText + tokenUsage (so the run meters honestly). - * - `fs.read` / `fs.write` — over the temp workspace (the `artifact` deliverable - * + the validators read/write real files here). - * - `exec(cmd)` — runs the deterministic check commands. Offline the toolchain - * (tsc/biome/node --test) usually isn't installed, so a missing tool reads as a - * FAIL — which is the honest offline signal, not a fake pass. + * - `streamPrompt(prompt, opts)` — the "agent" turn. Writes the round's scripted + * solution into a real temp workspace and emits one terminal `done` event — the + * SAME shape a live box emits, carrying `tokenUsage` so the run meters honestly + * and `extractLlmCallEvent` reads it. + * - `fs.read` / `fs.write` — over the temp workspace (the `artifact` deliverable + + * the seeded fixture live here). + * - `exec(cmd)` — runs the deterministic check + fixture-seed commands. 
Offline the + * toolchain (tsc / biome / node --test) usually isn't installed, so a missing tool + * reads as a FAIL — the honest offline signal, not a fake pass. (The checks never + * pass offline, so all `maxRounds` run — which is exactly when refinement shows.) * - `delete()` — tears the temp dir down. - * - * Swap this for `new SandboxClient({ apiKey, baseUrl })` (cast to the runtime's - * `SandboxClient`) and the SAME dispatch runs each round in a real harness box. - * Nothing else in the example changes — that is the point. */ import { exec as execCb } from 'node:child_process' @@ -29,13 +33,9 @@ import type { CreateSandboxOptions, SandboxEvent, SandboxInstance } from '@tangl const execAsync = promisify(execCb) -/** Produces the canned solution an offline "agent" writes for a given task. Two - * fidelity levels let the example show the validators/judge separating quality: - * a `real` implementation passes the realness scan, a `stub` is caught by it. */ -export type OfflineQuality = 'real' | 'stub' - -/** A scripted offline solution: which file, what content, per round. The harness - * calls `solutionFor(round)` so round 2 can differ from round 1 (refine demo). */ +/** A scripted offline solution: which file, and what content to write on a given + * round. `solutionFor(round)` lets round N differ from round N-1 — a REAL refine + * demo, not a constant. */ export interface OfflineScript { path: string solutionFor: (round: number) => string @@ -52,12 +52,14 @@ function instanceMethods(workdir: string, script: OfflineScript) { const abs = join(workdir, script.path) await mkdir(dirname(abs), { recursive: true }) await writeFile(abs, content, 'utf8') + // The real sandbox terminal event shape: `done` with `data.tokenUsage` + + // top-level `totalCostUsd`. `extractLlmCallEvent` reads exactly this. 
yield { - type: 'result', + type: 'done', data: { - finalText: `wrote ${script.path} (offline round ${round})`, tokenUsage: { inputTokens: 600, outputTokens: 400 }, - costUsd: 0, + totalCostUsd: 0, + finalText: `wrote ${script.path} (offline round ${round})`, }, } as unknown as SandboxEvent }, diff --git a/examples/coding-benchmark/profiles.ts b/examples/coding-benchmark/profiles.ts index b9cc3b7..bf24da5 100644 --- a/examples/coding-benchmark/profiles.ts +++ b/examples/coding-benchmark/profiles.ts @@ -1,23 +1,23 @@ /** - * The HARNESS axis — one baseline `AgentProfile` per coding harness. + * The HARNESS axis + the TOOL knob — the agent-config side of the matrix. * * We measure the HARNESS on its default behavior, so each profile is deliberately * bare: a name, the model it runs, and NOTHING else (no skills, no injected system - * prompt, no extra tools). Adding scaffolding here would measure our scaffolding, - * not the harness. + * prompt). Adding scaffolding here would measure our scaffolding, not the harness. + * The tool surface is a separate, orthogonal knob authored onto the profile in one + * line (`withTools`), so harness × tool is a clean cartesian. * * Two things to know about the shape: * - `runProfileMatrix` takes `AgentProfile[]` from `@tangle-network/agent-interface`. * That type has NO `harness` field (harness is a SANDBOX concept, not a profile * concept), so we carry the harness selector on `metadata.harness`. `dispatch.ts` - * reads it to pick `backend.type` for the sandbox. `harnessOf()` below is the one - * reader. - * - The matrix REQUIRES `model.default` (it stamps it onto every run record). For a - * real harness the agent uses the harness's own default model; we still name a - * model id here so the record is honest about what ran. + * reads it to pick `backend.type`. `harnessOf()` below is the one reader. + * - The matrix REQUIRES a snapshot-dated `model.default` (it stamps it onto every + * run record). 
For a real harness the agent uses the harness's own default model; + * we still name a model id here so the record is honest about what ran. */ -import type { AgentProfile } from '@tangle-network/agent-interface' +import type { AgentProfile, AgentProfileMcpServer } from '@tangle-network/agent-interface' import type { BackendType } from '@tangle-network/sandbox' /** The harnesses we sweep. `cli-base` is the plain-CLI baseline (no agent harness). */ @@ -41,9 +41,10 @@ export function harnessOf(profile: AgentProfile): BackendType { * the value is stamped onto the run record, so keep it truthful. * * IMPORTANT — the model id MUST carry a SNAPSHOT DATE. `runProfileMatrix` rejects a - * bare `name` and requires `name@YYYY-MM-DD` or `name-YYYYMMDD`, because a run record - * without the exact model snapshot is not reproducible ("which gpt-4.1 was that?"). - * This is the substrate keeping the benchmark paper-grade — keep the date current. */ + * bare alias and requires the snapshot form (`provider/name-YYYY-MM-DD`), because a + * run record without the exact model snapshot is not reproducible ("which gpt-4.1 + * was that?"). This is the substrate keeping the benchmark paper-grade — keep the + * date current. */ const harnessModel: Record = { 'claude-code': process.env.CLAUDE_CODE_MODEL ?? 'anthropic/claude-sonnet-4-5-2025-09-29', opencode: process.env.OPENCODE_MODEL ?? 'anthropic/claude-sonnet-4-5-2025-09-29', @@ -69,6 +70,54 @@ const harnessModel: Record = { export const harnessProfiles: AgentProfile[] = harnesses.map((harness) => ({ name: `${harness}-baseline`, model: { default: harnessModel[harness] }, - // NO prompt, NO resources, NO tools — measure the harness, not our scaffolding. + // NO prompt, NO resources — measure the harness, not our scaffolding. metadata: { harness }, })) + +// ── the tool knob ───────────────────────────────────────────────────────────── +// +// A tool surface is a PRESET, not forked code. 
Each preset authors the SAME two +// fields onto a profile — native tools on/off (`profile.tools`) and an optional +// mounted MCP server (`profile.mcp`) — and the sandbox substrate materializes them +// into each harness's real shape (`.claude/`, `opencode.json`, codex config, ...). +// We never hand-write a per-harness config file. +// +// withTools(profile, 'web') // turn on the native web tools +// withTools(profile, 'search-mcp') // mount a search MCP instead +// withTools(profile, 'none') // baseline: no web, no MCP +// +// Honesty note for partners: a preset only takes effect for a (harness, lever) pair +// the sandbox actually materializes. If a harness has no native `webfetch`, +// `withTools(p,'web')` is a no-op THERE — a substrate fact, not something this +// example silently patches over. See `@tangle-network/sandbox` for the matrix. + +/** Where a search MCP lives, when the `search-mcp` preset is selected. */ +const searchMcpUrl = process.env.TANGLE_SEARCH_MCP ?? 'https://search-mcp.tangle.tools/mcp' + +export type ToolPreset = 'none' | 'web' | 'search-mcp' + +interface ToolSurface { + /** Native harness tools, by name → enabled. Maps to `profile.tools`. */ + tools?: Record + /** A mounted MCP server, by name. Maps to `profile.mcp`. */ + mcp?: Record +} + +const presets: Record = { + none: { tools: { websearch: false, webfetch: false } }, + web: { tools: { websearch: true, webfetch: true } }, + 'search-mcp': { + tools: { websearch: false, webfetch: false }, + mcp: { search: { transport: 'http', url: searchMcpUrl, enabled: true } }, + }, +} + +/** Author a tool surface onto a profile. Returns a NEW profile (pure). */ +export function withTools(profile: AgentProfile, preset: ToolPreset): AgentProfile { + const surface = presets[preset] + return { + ...profile, + ...(surface.tools ? { tools: surface.tools } : {}), + ...(surface.mcp ? 
{ mcp: surface.mcp } : {}), + } +} diff --git a/examples/coding-benchmark/scenarios.ts b/examples/coding-benchmark/scenarios.ts index 62871b0..b6329f7 100644 --- a/examples/coding-benchmark/scenarios.ts +++ b/examples/coding-benchmark/scenarios.ts @@ -4,13 +4,15 @@ * Every scenario splits cleanly into two halves: * - `prompt` — THE ONLY field the agent ever sees. The dispatch copies it * (and nothing else) into the worker's context. - * - everything else — the rubric note, the validator commands, the realness - * signals — is EVAL-ONLY. It is read by validators.ts and - * judges.ts to score the result; it is NEVER written into the - * box. Because the two halves are different fields on the same - * object, "the agent can read the answer key" becomes a thing - * you can SEE in one place: it would require dispatch.ts to put - * a non-`prompt` field into the profile. It does not. (See the + * - everything else — the deterministic test fixture, the realness signals, the + * rubric note — is EVAL-ONLY. It is read by eval.ts to score the + * result; the fixture is SEEDED into the box (so `node --test` + * has something to run) but its CONTENT is never described to the + * agent, and the rubric/realness signals are never written into + * the box at all. Because the two halves are different fields on + * one object, "the agent can read the answer key" becomes a thing + * you can SEE in one place: it would require dispatch.ts to put a + * non-`prompt` field into the profile. It does not. (See the * `// FIREWALL` comment in dispatch.ts for the exact line.) * * This is the structural defense the design calls for: the firewall is a property @@ -20,6 +22,13 @@ import type { AuthenticitySignals } from '@tangle-network/agent-eval/authenticity' import type { Scenario } from '@tangle-network/agent-eval/campaign' +/** A file the harness seeds into the box workspace before the run — the test the + * deterministic check executes. EVAL-ONLY: its content is never shown to the agent. 
*/ +export interface Fixture { + path: string + content: string +} + /** One held-out coding task. Extends the substrate `Scenario` ({ id, kind, tags }). */ export interface CodingScenario extends Scenario { /** ── AGENT-VISIBLE ────────────────────────────────────────────────────── @@ -27,23 +36,19 @@ export interface CodingScenario extends Scenario { * This is the WHOLE of what reaches the worker's context. */ prompt: string - /** ── EVAL-ONLY (never written into the box) ───────────────────────────── */ + /** ── EVAL-ONLY (the agent never reads these) ──────────────────────────── */ /** Path (relative to the workspace root) the agent is asked to produce. The - * validators read this file off the box AFTER the turn; the judge scores it. */ + * checks read this file off the box AFTER the turn; the judge scores it. */ solutionPath: string - /** Deterministic checks, run in order, in the box, BEFORE any judge. These are - * shell commands the harness runs against the produced code. Objective, ~$0. - * They are eval config — the agent is told WHAT to build, never HOW it's graded. */ - validatorCmds: { - typecheck: string - test: string - lint: string - } + /** The hidden test, seeded into the box so `node --test` has a real file to run. + * Seeded write-only — the agent is told WHAT to build (the prompt), never the + * assertions it is graded against. */ + fixture: Fixture - /** Drives `scoreAuthenticity` — catches a stub that compiles but fakes the - * hard part. Write-only to the record; the agent cannot read or steer it. */ + /** Realness anchor input for `scoreAuthenticity` — catches a stub that compiles + * but fakes the hard part. Write-only to the record; never reaches the box. */ realnessSignals: AuthenticitySignals /** Extra grading context for the JUDGE only (design intent, edge cases to @@ -51,6 +56,17 @@ export interface CodingScenario extends Scenario { rubricNote: string } +// The deterministic check commands. 
Invoked directly (NOT via `npx -y`, which forces +// a registry round-trip every run): a real harness box has `tsc`/`biome`/`node` on +// PATH, so these run for real there; offline the missing tool fails FAST with a +// non-zero exit (the honest offline signal), not a 20s network stall. +/** A typecheck shell command for one solution file. */ +const typecheckCmd = (path: string) => `tsc --noEmit --strict --skipLibCheck ${path}` +/** A `node --test` command for one fixture. The fixture imports the solution. */ +const testCmd = (fixturePath: string) => `node --test ${fixturePath}` +/** A lint shell command for one solution file. */ +const lintCmd = (path: string) => `biome check ${path}` + /** * A 2-task corpus. Real benchmarks carry 20-50; two keeps the example readable. * Both are self-contained "write one module that passes these checks" tasks — the @@ -70,10 +86,29 @@ export const scenarios: CodingScenario[] = [ 'and false otherwise. No external dependencies.', ].join(' '), solutionPath: 'src/rate-limiter.ts', - validatorCmds: { - typecheck: 'npx tsc --noEmit src/rate-limiter.ts', - test: 'node --test test/rate-limiter.test.js', - lint: 'npx biome check src/rate-limiter.ts', + fixture: { + path: 'test/rate-limiter.test.js', + content: `import { test } from 'node:test' +import assert from 'node:assert/strict' +import { RateLimiter } from '../src/rate-limiter.ts' + +test('consumes when tokens available', () => { + const rl = new RateLimiter(10, 1) + assert.equal(rl.tryRemove(5), true) + assert.equal(rl.tryRemove(5), true) +}) + +test('rejects when over capacity', () => { + const rl = new RateLimiter(3, 1) + assert.equal(rl.tryRemove(4), false) +}) + +test('rejects a second draw that exceeds the remaining bucket', () => { + const rl = new RateLimiter(10, 0) + assert.equal(rl.tryRemove(8), true) + assert.equal(rl.tryRemove(8), false) +}) +`, }, realnessSignals: { label: 'token-bucket', @@ -101,10 +136,28 @@ export const scenarios: CodingScenario[] = [ 'No external 
dependencies.', ].join(' '), solutionPath: 'src/csv.ts', - validatorCmds: { - typecheck: 'npx tsc --noEmit src/csv.ts', - test: 'node --test test/csv.test.js', - lint: 'npx biome check src/csv.ts', + fixture: { + path: 'test/csv.test.js', + content: `import { test } from 'node:test' +import assert from 'node:assert/strict' +import { parseCsv } from '../src/csv.ts' + +test('parses a plain row', () => { + assert.deepEqual(parseCsv('a,b,c'), [['a', 'b', 'c']]) +}) + +test('keeps a comma inside a quoted field', () => { + assert.deepEqual(parseCsv('"a,b",c'), [['a,b', 'c']]) +}) + +test('keeps a newline inside a quoted field', () => { + assert.deepEqual(parseCsv('"line1\\nline2",b'), [['line1\\nline2', 'b']]) +}) + +test('unescapes a doubled quote', () => { + assert.deepEqual(parseCsv('"she said ""hi"""'), [['she said "hi"']]) +}) +`, }, realnessSignals: { label: 'csv-rfc4180', @@ -122,3 +175,18 @@ export const scenarios: CodingScenario[] = [ 'field containing a comma, a literal newline, and an escaped quote.', }, ] + +/** The deterministic check commands for a scenario — derived from its paths, in the + * ordered pipeline the verifier runs (typecheck → test → lint). Eval config: the + * agent is told WHAT to build, never the commands it is graded by. 
*/ +export function checkCmds(scenario: CodingScenario): { + typecheck: string + test: string + lint: string +} { + return { + typecheck: typecheckCmd(scenario.solutionPath), + test: testCmd(scenario.fixture.path), + lint: lintCmd(scenario.solutionPath), + } +} diff --git a/examples/coding-benchmark/stats.ts b/examples/coding-benchmark/stats.ts index bae6288..506c886 100644 --- a/examples/coding-benchmark/stats.ts +++ b/examples/coding-benchmark/stats.ts @@ -3,27 +3,37 @@ * - per-harness mean composite + a bootstrap CONFIDENCE INTERVAL (`confidenceInterval`) * - per-harness PASS-RATE with a binomial Wilson interval (`wilson`) — the correct * CI for a proportion (the continuous CI assumes the wrong distribution) - * - every harness PAIR compared on MATCHED scenarios with a paired bootstrap - * (`pairedBootstrap`), then BH-corrected across all pairs (`benjaminiHochberg`) - * so running many comparisons doesn't manufacture a false winner. + * - every harness PAIR compared on MATCHED scenarios with a REAL paired significance + * test (`pairedTTest`, or `wilcoxonSignedRank` for the non-parametric path), then + * BH-corrected across all pairs (`benjaminiHochberg`) so running many comparisons + * doesn't manufacture a false winner. The paired delta + its bootstrap CI + * (`pairedBootstrap`) is reported as the effect size. * - * Every number here is one agent-eval primitive call. No hand-rolled statistics. + * Every number here is one agent-eval primitive call. No hand-rolled statistics, + * and no fake p-values: BH is fed the actual paired-test p, not a CI proxy. * - * (The design flagged "no binomial CI in agent-eval" as a gap — that's stale: - * `wilson(successes, n)` ships in the stats surface and is exactly this CI. Used below.) + * Pairing discipline: the paired unit is the SCENARIO. 
With `reps > 1` a harness + * produces several records per scenario; we average them to ONE score per + * (harness, scenario) before pairing, so the paired arrays line up scenario-for- + * scenario and reps tighten the per-cell estimate instead of corrupting the pairing. */ import { benjaminiHochberg, confidenceInterval, pairedBootstrap, + pairedTTest, type RunRecord, + wilcoxonSignedRank, wilson, } from '@tangle-network/agent-eval' /** A composite at or above this counts as "green" for the pass-rate proportion. */ const greenThreshold = 0.6 +/** Which paired test to run. Parametric `t` by default; `wilcoxon` for skewed scores. */ +export type PairedTest = 't' | 'wilcoxon' + interface HarnessRow { harness: string n: number @@ -40,6 +50,8 @@ interface PairResult { delta: number low: number high: number + /** the paired-test p-value (before correction) */ + p: number /** BH-significant after correcting across all pairs */ significant: boolean } @@ -49,16 +61,17 @@ export interface StatsReport { pairs: PairResult[] } -/** Per-record composite — the search-split score the judges produced. */ +/** Per-record composite — the score the judges produced. */ function score(r: RunRecord): number { return r.outcome.searchScore ?? r.outcome.holdoutScore ?? 0 } -/** Group records by harness profile (the matrix stamps the profile id as candidateId). */ -function byHarness(records: RunRecord[]): Map { +/** Group records by harness profile. The matrix stamps the profile id (a hash) as + * `candidateId`; we resolve it to the readable harness name via `nameOf`. */ +function byHarness(records: RunRecord[], nameOf: (id: string) => string): Map { const m = new Map() for (const r of records) { - const key = r.agentProfile?.profileId ?? r.candidateId + const key = nameOf(r.agentProfile?.profileId ?? r.candidateId) const list = m.get(key) ?? 
[] list.push(r) m.set(key, list) @@ -66,23 +79,45 @@ function byHarness(records: RunRecord[]): Map { return m } -/** Scores for harness A and B on the SAME scenarios, aligned for pairing. */ +/** ONE mean score per scenario for a harness — collapses reps so the paired unit is + * the scenario, in a stable scenario order. */ +function meanByScenario(records: RunRecord[]): Map { + const sums = new Map() + for (const r of records) { + const id = r.scenarioId ?? '' + const acc = sums.get(id) ?? { total: 0, n: 0 } + acc.total += score(r) + acc.n += 1 + sums.set(id, acc) + } + const out = new Map() + for (const [id, acc] of sums) out.set(id, acc.n ? acc.total / acc.n : 0) + return out +} + +/** Scores for harness A and B on the SAME scenarios, aligned for pairing (one + * averaged score per scenario, in shared scenario order). */ function pairedScores(a: RunRecord[], b: RunRecord[]): { aScores: number[]; bScores: number[] } { - const bByScenario = new Map(b.map((r) => [r.scenarioId ?? '', r])) + const aMean = meanByScenario(a) + const bMean = meanByScenario(b) const aScores: number[] = [] const bScores: number[] = [] - for (const ra of a) { - const rb = bByScenario.get(ra.scenarioId ?? 
'') - if (rb) { - aScores.push(score(ra)) - bScores.push(score(rb)) + for (const scenarioId of [...aMean.keys()].sort()) { + const bv = bMean.get(scenarioId) + if (bv !== undefined) { + aScores.push(aMean.get(scenarioId) as number) + bScores.push(bv) } } return { aScores, bScores } } -export function pairwiseStats(records: RunRecord[]): StatsReport { - const groups = byHarness(records) +export function pairwiseStats( + records: RunRecord[], + nameOf: (id: string) => string, + test: PairedTest = 't', +): StatsReport { + const groups = byHarness(records, nameOf) const harnesses = [...groups.keys()].sort() const leaderboard: HarnessRow[] = harnesses.map((harness) => { @@ -101,7 +136,7 @@ export function pairwiseStats(records: RunRecord[]): StatsReport { } }) - // Every unordered harness pair, paired-bootstrapped on matched scenarios. + // Every unordered harness pair, with a REAL paired test on matched scenarios. const raw: Omit[] = [] for (let i = 0; i < harnesses.length; i += 1) { for (let j = i + 1; j < harnesses.length; j += 1) { @@ -109,14 +144,21 @@ export function pairwiseStats(records: RunRecord[]): StatsReport { const hb = harnesses[j] as string const { aScores, bScores } = pairedScores(groups.get(ha) ?? [], groups.get(hb) ?? []) if (aScores.length === 0) continue + // Effect size + CI from the paired bootstrap; the p-value from a real paired test. const boot = pairedBootstrap(aScores, bScores, { seed: 7, statistic: 'median' }) - raw.push({ a: ha, b: hb, delta: boot.median, low: boot.low, high: boot.high }) + const p = + test === 'wilcoxon' + ? wilcoxonSignedRank(aScores, bScores).p + : pairedTTest(aScores, bScores).p + raw.push({ a: ha, b: hb, delta: boot.median, low: boot.low, high: boot.high, p }) } } - // A CI excluding 0 is the per-pair p<0.05 proxy; BH-correct across all pairs. - const pProxy = raw.map((r) => (r.low > 0 || r.high < 0 ? 
0.04 : 0.5)) - const { significant } = benjaminiHochberg(pProxy, 0.05) + // BH-correct the REAL p-values across all pairs (controls the false-discovery rate). + const { significant } = benjaminiHochberg( + raw.map((r) => r.p), + 0.05, + ) const pairs: PairResult[] = raw.map((r, i) => ({ ...r, significant: significant[i] ?? false })) return { leaderboard, pairs } @@ -135,11 +177,12 @@ export function renderStats(report: StatsReport): string { ) } lines.push('') - lines.push('Pairwise (paired bootstrap on matched scenarios, BH-corrected):') + lines.push('Pairwise (paired delta + bootstrap CI; paired-test p, BH-corrected):') for (const p of report.pairs) { const tag = p.significant ? 'SIGNIFICANT' : 'n.s.' lines.push( - ` ${p.b} − ${p.a}: Δ=${p.delta.toFixed(3)} [${p.low.toFixed(3)}, ${p.high.toFixed(3)}] ${tag}`, + ` ${p.b} − ${p.a}: Δ=${p.delta.toFixed(3)} [${p.low.toFixed(3)}, ${p.high.toFixed(3)}] ` + + `p=${p.p.toFixed(3)} ${tag}`, ) } return lines.join('\n') diff --git a/examples/coding-benchmark/tools.ts b/examples/coding-benchmark/tools.ts deleted file mode 100644 index 65c49bf..0000000 --- a/examples/coding-benchmark/tools.ts +++ /dev/null @@ -1,55 +0,0 @@ -/** - * The TOOL knob — swap the agent's tool surface in ONE line. - * - * A tool surface is a PRESET, not forked code. Each preset authors the SAME two - * fields onto a profile — native tools on/off (`profile.tools`) and an optional - * mounted MCP server (`profile.mcp`) — and the sandbox substrate materializes them - * into each harness's real shape (`.claude/`, `opencode.json`, codex config, ...). - * We never hand-write a per-harness config file. - * - * withTools(profile, 'web') // turn on the native web tools - * withTools(profile, 'search-mcp') // mount a search MCP instead - * withTools(profile, 'none') // baseline: no web, no MCP - * - * To add the tool surface as a 4TH matrix axis, build the profile list as the - * cartesian of harnesses × presets (see benchmark.ts, `--tools` flag). 
- * - * Honesty note for partners: a preset only takes effect for a (harness, lever) - * pair the sandbox actually materializes. If a harness has no native `webfetch`, - * `withTools(p,'web')` is a no-op THERE — that is a substrate fact, not something - * this example silently patches over. Check `@tangle-network/sandbox` for the - * materialization matrix before trusting a tool swap on a given harness. - */ - -import type { AgentProfile, AgentProfileMcpServer } from '@tangle-network/agent-interface' - -/** Where a search MCP lives, when the `search-mcp` preset is selected. */ -const searchMcpUrl = process.env.TANGLE_SEARCH_MCP ?? 'https://search-mcp.tangle.tools/mcp' - -export type ToolPreset = 'none' | 'web' | 'search-mcp' - -interface ToolSurface { - /** Native harness tools, by name → enabled. Maps to `profile.tools`. */ - tools?: Record - /** A mounted MCP server, by name. Maps to `profile.mcp`. */ - mcp?: Record -} - -const presets: Record = { - none: { tools: { websearch: false, webfetch: false } }, - web: { tools: { websearch: true, webfetch: true } }, - 'search-mcp': { - tools: { websearch: false, webfetch: false }, - mcp: { search: { transport: 'http', url: searchMcpUrl, enabled: true } }, - }, -} - -/** Author a tool surface onto a profile. Returns a NEW profile (pure). */ -export function withTools(profile: AgentProfile, preset: ToolPreset): AgentProfile { - const surface = presets[preset] - return { - ...profile, - ...(surface.tools ? { tools: surface.tools } : {}), - ...(surface.mcp ? { mcp: surface.mcp } : {}), - } -} diff --git a/examples/coding-benchmark/validators.ts b/examples/coding-benchmark/validators.ts deleted file mode 100644 index 7081bd3..0000000 --- a/examples/coding-benchmark/validators.ts +++ /dev/null @@ -1,104 +0,0 @@ -/** - * The DETERMINISTIC layer — validators that run BEFORE any judge. 
- * - * Scoring a coding task in the right order matters: objective checks first (they - * cost ~$0 and can't be gamed), an anti-fake realness gate next, and only THEN — - * if there is still a subjective band left to grade — an LLM judge. This file owns - * the first two layers. judges.ts owns the third. - * - * Two kinds of validator here: - * 1. `runBoxChecks` — runs the scenario's `typecheck` / `test` / `lint` commands - * IN THE BOX via `box.exec(...)`. Pass/fail comes from the exit code. This is - * a runtime concern (it needs a live box), so it is a plain async function the - * dispatch calls each round; the booleans it returns are what steer the next - * round (see the firewall note in dispatch.ts). - * 2. `realnessValidator` — wraps agent-eval's `scoreAuthenticity` + `gateRealness` - * as a runtime `Validator`. It catches "compiles but is a stub". - * Its score is WRITE-ONLY to the record — the agent never sees it, so it cannot - * steer toward it. - * - * `Validator` is the runtime seam (src/runtime/types.ts): one - * method, `validate(output, ctx) → Promise`. We use the default verdict - * shape `{ valid, score, signals }`. - */ - -import { - type AuthenticitySignals, - gateRealness, - type ProducedFile, - scoreAuthenticity, -} from '@tangle-network/agent-eval/authenticity' -import type { DefaultVerdict, Validator } from '@tangle-network/agent-runtime/loops' - -/** A finished coding attempt — what the dispatch produces and the judge scores. */ -export interface RunArtifact { - /** Files the agent produced, as `{ path, content }` — the realness currency. */ - files: ProducedFile[] - /** The solution file's content (convenience; also present in `files`). */ - solution: string - /** The agent's final chat text for the round (judge context). */ - finalText: string - /** Deterministic check results from the LAST round — gate the judge + the record. */ - checks: BoxCheckResult - /** The realness anchor's verdict, computed AFTER the loop by `realnessValidator`. 
- * Recorded for honesty; the agent never sees it (see the firewall in dispatch.ts). */ - realness: DefaultVerdict -} - -export interface BoxCheckResult { - typecheck: { passed: boolean; output: string } - test: { passed: boolean; output: string } - lint: { passed: boolean; output: string } - /** True only when typecheck AND test pass (lint is advisory). */ - allPass: boolean -} - -/** Minimal box surface the checks need — a subset of the real `SandboxInstance`. - * The live sandbox satisfies it; the offline in-process box implements it too. */ -export interface CheckBox { - exec(command: string): Promise<{ exitCode: number; stdout: string; stderr: string }> -} - -/** - * Run the scenario's deterministic checks in the box. Exit code 0 = pass. This is - * the objective floor: it can't be talked around by a confident judge, and it costs - * nothing. The agent IS told what to build (the prompt), but never the grading - * commands — those live on the scenario's eval-only fields. - */ -export async function runBoxChecks( - box: CheckBox, - cmds: { typecheck: string; test: string; lint: string }, -): Promise { - const run = async (cmd: string): Promise<{ passed: boolean; output: string }> => { - const r = await box.exec(cmd) - return { passed: r.exitCode === 0, output: `${r.stdout}\n${r.stderr}`.trim() } - } - const typecheck = await run(cmds.typecheck) - const test = await run(cmds.test) - const lint = await run(cmds.lint) - return { typecheck, test, lint, allPass: typecheck.passed && test.passed } -} - -/** - * The realness anchor as a runtime `Validator`. `scoreAuthenticity` is a pure, - * no-LLM structural scan (required artifact present? hard part implemented? or a - * fake shim?), and `gateRealness` caps anything that faked or omitted the required - * artifact. The verdict is recorded but NEVER fed back to the agent. 
- */ -export function realnessValidator(signals: AuthenticitySignals): Validator { - return { - async validate(artifact: RunArtifact): Promise { - const result = scoreAuthenticity(artifact.files, signals) - const gate = gateRealness(result, { requireArtifact: true }) - // realness is 0..100; normalize to the 0..1 verdict score. - const score = gate.gated ? 0 : result.realness / 100 - const flags = result.flags.length > 0 ? ` — flags: ${result.flags.join(', ')}` : '' - return { - valid: !gate.gated && result.realness >= 50, - score, - scores: { realness: result.realness, gated: gate.gated ? 1 : 0 }, - notes: `${gate.gated ? `GATED (${gate.reason ?? 'fake/missing artifact'})` : 'real'}${flags}`, - } - }, - } -} diff --git a/package.json b/package.json index 61b30f9..3166475 100644 --- a/package.json +++ b/package.json @@ -89,7 +89,7 @@ }, "devDependencies": { "@biomejs/biome": "^2.4.15", - "@tangle-network/agent-eval": ">=0.97.0 <1.0.0", + "@tangle-network/agent-eval": ">=0.99.0 <1.0.0", "@tangle-network/agent-interface": ">=0.10.0 <1.0.0", "@tangle-network/sandbox": ">=0.8.0 <1.0.0", "@types/node": "^25.9.3", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 12ca5b8..8e19c4e 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -16,8 +16,8 @@ importers: specifier: ^2.4.15 version: 2.4.15 '@tangle-network/agent-eval': - specifier: '>=0.97.0 <1.0.0' - version: 0.97.0(typescript@5.9.3) + specifier: '>=0.99.0 <1.0.0' + version: 0.99.0(typescript@5.9.3) '@tangle-network/agent-interface': specifier: '>=0.10.0 <1.0.0' version: 0.10.0 @@ -488,8 +488,8 @@ packages: '@tangle-network/sandbox': optional: true - '@tangle-network/agent-eval@0.97.0': - resolution: {integrity: sha512-SCC2QxNgTqrHK0+WNTQIvuZtfcGdSi/ejf7c1x5yGYIS/iM7nYxBNXsr9i64qMcsvxyHB1ecf3ZOJZiw8WpMfQ==} + '@tangle-network/agent-eval@0.99.0': + resolution: {integrity: sha512-jcCuqDfIhgE2SnVfAu/fwUaYJaX+CpJ6Va9CPI5KtjLsKevtt/ZV2+OOpg84JWbrrYaTS+4DVps7VPN4HV/vaA==} engines: {node: '>=20'} hasBin: true @@ -1423,7 
+1423,7 @@ snapshots: - typescript - utf-8-validate - '@tangle-network/agent-eval@0.97.0(typescript@5.9.3)': + '@tangle-network/agent-eval@0.99.0(typescript@5.9.3)': dependencies: '@asteasolutions/zod-to-openapi': 8.5.0(zod@4.4.3) '@ax-llm/ax': 19.0.45(zod@4.4.3) From 818e73db56a55d39fea39046b66e12cc52050dee Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Wed, 24 Jun 2026 04:52:30 -0600 Subject: [PATCH 3/5] =?UTF-8?q?fix(examples):=20coding-benchmark=20honesty?= =?UTF-8?q?=20fixes=20=E2=80=94=20gate=20fires=20on=20real=20data,=20reps?= =?UTF-8?q?=20stop=20pseudo-replicating,=20TS=20test=20runner,=20runnable?= =?UTF-8?q?=20on=20clean=20clone?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Four partner-blocking defects against the honesty pitch: - HIGH: the round-0 offline stub the dispatch wrote scored composite 0.6 (gated:false) because its `refillPerSec` param matched the realImpl regex, so the "stub → gated → composite 0" demo never fired on a real run — only the unit test's separate strawman gated. Make the round-0 stub genuinely hollow (inert `_capacity`/`_ratePerSec` args, no refill math) so gateRealness gates it to composite 0 on the benchmark's own data. Export `offlineSolutions` and assert the gate against the EXACT dispatch stub in the smoke test. - HIGH: the leaderboard CI/Wilson were computed over every raw rep record, so identical reps faked a narrower interval (pass-CI [34%,100%] → [61%,100%] at reps=3). Collapse reps to one mean per (harness,scenario) before the CI/Wilson, matching the pairing path. Add a regression test that identical reps leave the CI unchanged. - MEDIUM: the test check ran plain `node --test`; the fixture imports the solution as `.ts`, and Node strip-only mode throws ERR_UNSUPPORTED_TYPESCRIPT_SYNTAX on constructor parameter properties (the canonical impl's style), false-failing a correct solution. Run `node --experimental-transform-types --test`. 
- MEDIUM: `tsx` was undeclared, so the documented `pnpm tsx ...benchmark.ts` faceplanted on a clean clone. Pin `tsx@^4.22.4` in devDependencies and update the lockfile (agent-eval stays 0.99.0). README updated so every claim matches: the gate fires on real data, reps are honest, the test runner handles TS param properties, and the run command works. --- examples/coding-benchmark/README.md | 8 +- examples/coding-benchmark/benchmark.ts | 20 +- .../coding-benchmark/coding-benchmark.test.ts | 80 ++++- examples/coding-benchmark/scenarios.ts | 10 +- examples/coding-benchmark/stats.ts | 7 +- package.json | 1 + pnpm-lock.yaml | 312 +++++++++++++++++- 7 files changed, 395 insertions(+), 43 deletions(-) diff --git a/examples/coding-benchmark/README.md b/examples/coding-benchmark/README.md index 6165732..1c55440 100644 --- a/examples/coding-benchmark/README.md +++ b/examples/coding-benchmark/README.md @@ -42,7 +42,7 @@ Pairwise (paired delta + bootstrap CI; paired-test p, BH-corrected): ### The offline "agent" is a scripted stand-in -Offline there is no model, so each scenario's box writes a **canned solution** instead of calling a coding agent — a deterministic stand-in so the example runs with no creds. The scripts are honest: `rate-limiter` **improves across rounds** (round 0 is a `return true` stub the realness gate catches; round 1+ is the real token-bucket), a real refine demo. Offline the toolchain (`tsc` / `biome` / `node --test`) isn't on PATH, so the checks fail fast and all 3 rounds run — which is exactly when you want to see refinement. +Offline there is no model, so each scenario's box writes a **canned solution** instead of calling a coding agent — a deterministic stand-in so the example runs with no creds. 
The scripts are honest: `rate-limiter` **improves across rounds** — round 0 is a genuinely hollow `return true` stub (no refill math, the constructor args unused) that the realness gate **gates to composite 0** on the actual run, and round 1+ is the real token-bucket. That's a real refine demo where the anti-cheat gate fires on the benchmark's **own data**, not only in a unit test. Offline the toolchain (`tsc` / `biome` / `node --test`) isn't on PATH, so the checks fail fast and all 3 rounds run — which is exactly when you want to see refinement. ## How a tool swap works (one line) @@ -68,8 +68,8 @@ On the CLI it's `--tools none|web|search-mcp`. **Honesty caveat:** a preset only Scoring runs in strict order, cheapest and most objective first — an `agent-eval` primitive at each layer: -1. **Deterministic checks (first, in the box, ~$0).** An ordered **`MultiLayerVerifier`** pipeline: `typecheck → test → lint`, with dependency-based skip (test never runs on a type error) and a blended score. typecheck + test gate `allPass` (and the refine loop); lint is advisory. These pass/fail booleans are the only thing that steers the next round. (`eval.ts` · `runChecks`) -2. **Realness gate (no LLM, and it GATES).** `scoreAuthenticity` + `gateRealness` — a pure structural scan that catches a stub that compiles but fakes the hard part. It is not just recorded: a **gated** artifact short-circuits the judge to composite **0 with no model call** (a `return true` rate-limiter cannot earn a score, however confident a judge would be). On the sample tasks it scores a real impl ≈ **85** and the `return true` stub **gated → 0**. (`eval.ts` · `realnessGate`, asserted in the smoke test) +1. **Deterministic checks (first, in the box, ~$0).** An ordered **`MultiLayerVerifier`** pipeline: `typecheck → test → lint`, with dependency-based skip (test never runs on a type error) and a blended score. typecheck + test gate `allPass` (and the refine loop); lint is advisory. 
These pass/fail booleans are the only thing that steers the next round. The test layer runs `node --experimental-transform-types --test`, not plain `node --test`: the fixture imports the solution as a `.ts` file, and Node's default type-*stripping* throws on constructor parameter properties (`constructor(private x: number)`) — the exact style the canonical impl uses — so a correct solution would otherwise score as a test failure. (`eval.ts` · `runChecks`) +2. **Realness gate (no LLM, and it GATES).** `scoreAuthenticity` + `gateRealness` — a pure structural scan that catches a stub that compiles but fakes the hard part. It is not just recorded: a **gated** artifact short-circuits the judge to composite **0 with no model call** (a hollow `return true` rate-limiter cannot earn a score, however confident a judge would be). On the sample tasks it scores a real impl realness **85** and the offline round-0 stub **gated → composite 0** — and the smoke test asserts the gate against the **exact stub the dispatch writes**, so the demo fires on the benchmark's own data, not only on a hand-built strawman. (`eval.ts` · `realnessGate`) 3. **LLM judge (last, only on the band the checks can't resolve).** A 4-dimension weighted rubric — correctness 0.40 · completeness 0.25 · code_quality 0.20 · robustness 0.15. The rubric text + anchors live **with the judge**, never in the workdir. 
(`eval.ts`) **How many judges:** @@ -83,7 +83,7 @@ Every number is one `agent-eval` primitive call — **no hand-rolled statistics - per-harness **mean composite + bootstrap CI** (`confidenceInterval`) - per-harness **pass-rate + Wilson binomial CI** (`wilson`) — the correct interval for a proportion - every harness **pair** compared on **matched scenarios** with a **real paired test** (`pairedTTest`, or `wilcoxonSignedRank` for the non-parametric path) for the p-value, and a **paired bootstrap** (`pairedBootstrap`) for the effect size + CI, then **BH-corrected** across all pairs (`benjaminiHochberg`) so running many comparisons doesn't manufacture a false winner. -- **Pairing discipline:** the paired unit is the *scenario*. With `--reps > 1`, a harness produces several records per scenario; they're averaged to one score per (harness, scenario) before pairing, so reps tighten the per-cell estimate instead of mis-aligning the pairs. +- **Reps don't fake independent n — anywhere.** The paired unit is the *scenario*, and **the leaderboard uses the same unit**: with `--reps > 1`, a harness produces several records per scenario, so BOTH the leaderboard CI/Wilson AND the pairing collapse reps to **one mean per (harness, scenario)** before computing anything. Reps tighten the per-cell *estimate*; they are not independent samples, so they never narrow the interval out of zero new information. The reported `n` is the number of distinct scenarios, not the record count. (A regression test asserts identical reps leave the CI unchanged.) The leaderboard labels are the readable harness names, not the matrix's internal profile hashes. 
diff --git a/examples/coding-benchmark/benchmark.ts b/examples/coding-benchmark/benchmark.ts index d231949..922ef67 100644 --- a/examples/coding-benchmark/benchmark.ts +++ b/examples/coding-benchmark/benchmark.ts @@ -64,17 +64,23 @@ function parseArgs(argv: string[]): BenchmarkOptions { // ── the offline "agent": a scripted, REFINING solution per scenario ─────────── // Offline we don't have a model, so each scenario's box writes a canned solution. -// `rate-limiter` IMPROVES across rounds (round 0 = a `return true` stub the realness -// gate catches; round 2 = the real token-bucket) — a real refine demo. `csv-parser` -// writes its real implementation from round 0. -const offlineSolutions: Record = { +// `rate-limiter` IMPROVES across rounds (round 0 = a genuinely hollow `return true` +// stub the realness gate GATES to composite 0; round 1+ = the real token-bucket) — +// a real refine demo that fires the anti-cheat gate on the benchmark's OWN data. +// `csv-parser` writes its real implementation from round 0. +export const offlineSolutions: Record = { 'rate-limiter': { path: 'src/rate-limiter.ts', solutionFor: (round) => round === 0 - ? // round 0 — a stub: compiles, but `tryRemove` is a hardcoded `return true` - // with no refill math. The realness gate flags + gates this. - `export class RateLimiter {\n constructor(private capacity: number, private refillPerSec: number) {}\n` + + ? // round 0 — a genuinely HOLLOW stub: it accepts the constructor args (so the + // hidden test instantiates it) but the body is pure cheat — `tryRemove` is a + // hardcoded `return true` with NO refill math and NO use of the args. The + // realness gate's `realImpl` signal does not fire (no `Date.now`/`refill`), + // `fakeShim` does, so gateRealness GATES it → composite 0. The param names are + // intentionally inert (no `refill` token) so the stub is hollow, not a real + // impl that merely "happens to return true". (Verified by the smoke test.) 
+ `export class RateLimiter {\n constructor(_capacity: number, _ratePerSec: number) {}\n` + ` tryRemove(n: number): boolean { return true }\n}\n` : // round 1+ — the real token-bucket with continuous time-based refill. `export class RateLimiter {\n private tokens: number\n private last = Date.now()\n` + diff --git a/examples/coding-benchmark/coding-benchmark.test.ts b/examples/coding-benchmark/coding-benchmark.test.ts index e99a4a9..802127e 100644 --- a/examples/coding-benchmark/coding-benchmark.test.ts +++ b/examples/coding-benchmark/coding-benchmark.test.ts @@ -1,17 +1,22 @@ /** * Offline smoke test — proves the whole pipeline runs with no creds and that the - * two load-bearing honesty claims hold: + * load-bearing honesty claims hold: * 1. the matrix produces exactly `harnesses × scenarios × reps` records and a * defined leaderboard (the wiring is real, not a stub that returns nothing); - * 2. the realness gate actually catches a `return true` stub and gates it to 0 - * (the README's anti-fake claim, asserted against the real scan). + * 2. the realness gate gates the ACTUAL round-0 stub the dispatch writes (not a + * separate strawman) to composite 0 — the anti-cheat demo fires on the + * benchmark's own data, and passes the real refined impl; + * 3. reps tighten the per-cell estimate HONESTLY — identical reps do NOT narrow + * the leaderboard CI vs reps=1 (reps are not independent samples). 
*/ +import type { RunRecord } from '@tangle-network/agent-eval' import { describe, expect, it } from 'vitest' -import { main } from './benchmark' +import { main, offlineSolutions } from './benchmark' import { realnessGate } from './eval' import { harnessProfiles } from './profiles' import { scenarios } from './scenarios' +import { pairwiseStats } from './stats' describe('coding-benchmark (offline)', () => { // Integration smoke: runs the real matrix end-to-end (real box.exec on the offline @@ -23,31 +28,76 @@ describe('coding-benchmark (offline)', () => { expect(summary.leaderboard).toBe(harnessProfiles.length) }, 180_000) - it('realness gate catches a return-true stub', () => { + it("gates the dispatch's OWN round-0 stub to composite 0 (the demo fires on real data)", () => { const rl = scenarios.find((s) => s.id === 'rate-limiter') expect(rl).toBeDefined() - const stub = 'export class RateLimiter { tryRemove(n: number): boolean { return true } }\n' + const script = offlineSolutions['rate-limiter'] + expect(script).toBeDefined() + // The EXACT content the offline dispatch writes on round 0 — not a hand-built + // strawman. If a future edit makes this stub look real, this test goes red. 
+ const round0 = (script as NonNullable<typeof script>).solutionFor(0) const verdict = realnessGate( - [{ path: 'src/rate-limiter.ts', content: stub }], + [{ path: 'src/rate-limiter.ts', content: round0 }], (rl as NonNullable<typeof rl>).realnessSignals, ) expect(verdict.gated).toBe(true) expect(verdict.score).toBe(0) }) - it('realness gate passes a real token-bucket implementation', () => { + it("passes the dispatch's refined round-1 token-bucket implementation", () => { const rl = scenarios.find((s) => s.id === 'rate-limiter') - const real = - 'export class RateLimiter {\n private tokens = 0\n private last = Date.now()\n' + - ' constructor(private capacity: number, private refillPerSec: number) { this.tokens = capacity }\n' + - ' tryRemove(n: number): boolean {\n const now = Date.now()\n' + - ' this.tokens = Math.min(this.capacity, this.tokens + ((now - this.last) / 1000) * this.refillPerSec)\n' + - ' this.last = now\n if (n > this.tokens) return false\n this.tokens -= n\n return true\n }\n}\n' + const script = offlineSolutions['rate-limiter'] + expect(script).toBeDefined() + const round1 = (script as NonNullable<typeof script>).solutionFor(1) const verdict = realnessGate( - [{ path: 'src/rate-limiter.ts', content: real }], + [{ path: 'src/rate-limiter.ts', content: round1 }], (rl as NonNullable<typeof rl>).realnessSignals, ) expect(verdict.gated).toBe(false) expect(verdict.score).toBeGreaterThan(0) }) + + it('reps do NOT fake independent n — identical reps leave the CI unchanged', () => { + // Two harnesses, two scenarios, identical scores. Build records for reps=1 and + // reps=3 (the extra reps are exact duplicates → zero new information). The honest + // leaderboard collapses reps to one mean per (harness, scenario), so the CI width + // and the n must be IDENTICAL across reps — duplicating a sample cannot tighten it. 
+ const mk = (harness: string, scenarioId: string, s: number): RunRecord => + ({ + candidateId: harness, + scenarioId, + outcome: { searchScore: s }, + }) as unknown as RunRecord + const base: Array<[string, string, number]> = [ + ['a', 's1', 0.9], + ['a', 's2', 0.4], + ['b', 's1', 0.8], + ['b', 's2', 0.5], + ] + const nameOf = (id: string) => id + const reps1 = base.map(([h, s, v]) => mk(h, s, v)) + const reps3 = base.flatMap(([h, s, v]) => [mk(h, s, v), mk(h, s, v), mk(h, s, v)]) + + const r1 = pairwiseStats(reps1, nameOf) + const r3 = pairwiseStats(reps3, nameOf) + + for (const harness of ['a', 'b']) { + const row1 = r1.leaderboard.find((r) => r.harness === harness) + const row3 = r3.leaderboard.find((r) => r.harness === harness) + expect(row1).toBeDefined() + expect(row3).toBeDefined() + const r1Row = row1 as NonNullable<typeof row1> + const r3Row = row3 as NonNullable<typeof row3> + // Same honest n (= distinct scenarios), same mean, and the CI must NOT narrow. + expect(r3Row.n).toBe(r1Row.n) + expect(r3Row.meanComposite).toBeCloseTo(r1Row.meanComposite, 10) + const width1 = r1Row.ci.upper - r1Row.ci.lower + const width3 = r3Row.ci.upper - r3Row.ci.lower + expect(width3).toBeCloseTo(width1, 10) + // The pass-rate Wilson interval likewise must not tighten. + const pw1 = r1Row.passCi.upper - r1Row.passCi.lower + const pw3 = r3Row.passCi.upper - r3Row.passCi.lower + expect(pw3).toBeCloseTo(pw1, 10) + } + }) }) diff --git a/examples/coding-benchmark/scenarios.ts b/examples/coding-benchmark/scenarios.ts index b6329f7..338e32e 100644 --- a/examples/coding-benchmark/scenarios.ts +++ b/examples/coding-benchmark/scenarios.ts @@ -62,8 +62,14 @@ export interface CodingScenario extends Scenario { // non-zero exit (the honest offline signal), not a 20s network stall. /** A typecheck shell command for one solution file. */ const typecheckCmd = (path: string) => `tsc --noEmit --strict --skipLibCheck ${path}` -/** A `node --test` command for one fixture. The fixture imports the solution. 
*/ -const testCmd = (fixturePath: string) => `node --test ${fixturePath}` +/** A `node --test` command for one fixture. The fixture imports the solution as a `.ts` + * file, so we run with `--experimental-transform-types`: Node's DEFAULT type-stripping + * is strip-only and throws `ERR_UNSUPPORTED_TYPESCRIPT_SYNTAX` on TS that emits runtime + * code — including constructor PARAMETER PROPERTIES (`constructor(private x: number)`), + * the exact style the canonical token-bucket impl uses. Without the flag a CORRECT + * solution would exit 1 and score as a test failure. The flag transforms (not just + * strips) the types so param properties run. */ +const testCmd = (fixturePath: string) => `node --experimental-transform-types --test ${fixturePath}` /** A lint shell command for one solution file. */ const lintCmd = (path: string) => `biome check ${path}` diff --git a/examples/coding-benchmark/stats.ts b/examples/coding-benchmark/stats.ts index 506c886..e9528ee 100644 --- a/examples/coding-benchmark/stats.ts +++ b/examples/coding-benchmark/stats.ts @@ -122,7 +122,12 @@ export function pairwiseStats( const leaderboard: HarnessRow[] = harnesses.map((harness) => { const rs = groups.get(harness) ?? [] - const scores = rs.map(score) + // Collapse reps to ONE mean per scenario BEFORE the CI/Wilson — the SAME unit the + // pairing path uses. Reps tighten the per-(harness,scenario) estimate; they are NOT + // independent samples, so feeding every raw rep record into the CI would let + // identical reps fake a narrower interval out of zero new information. The honest n + // is the number of distinct scenarios, not records. 
+ const scores = [...meanByScenario(rs).values()] const ci = confidenceInterval(scores, 0.95, { seed: 7 }) const passes = scores.filter((s) => s >= greenThreshold).length const passCi = wilson(passes, scores.length, 0.95) diff --git a/package.json b/package.json index 3166475..edc504e 100644 --- a/package.json +++ b/package.json @@ -95,6 +95,7 @@ "@types/node": "^25.9.3", "playwright": "^1.61.0", "tsup": "^8.0.0", + "tsx": "^4.22.4", "typedoc": "0.28.19", "typedoc-plugin-markdown": "4.12.0", "typescript": "^5.7.0", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 8e19c4e..43dc26c 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -32,7 +32,10 @@ importers: version: 1.61.0 tsup: specifier: ^8.0.0 - version: 8.5.1(postcss@8.5.13)(typescript@5.9.3)(yaml@2.9.0) + version: 8.5.1(postcss@8.5.13)(tsx@4.22.4)(typescript@5.9.3)(yaml@2.9.0) + tsx: + specifier: ^4.22.4 + version: 4.22.4 typedoc: specifier: 0.28.19 version: 0.28.19(typescript@5.9.3) @@ -44,7 +47,7 @@ importers: version: 5.9.3 vitest: specifier: ^3.0.0 - version: 3.2.4(@types/node@25.9.3)(yaml@2.9.0) + version: 3.2.4(@types/node@25.9.3)(tsx@4.22.4)(yaml@2.9.0) packages: @@ -124,156 +127,312 @@ packages: cpu: [ppc64] os: [aix] + '@esbuild/aix-ppc64@0.28.1': + resolution: {integrity: sha512-Svl7tq8k/08+p6CXPpRjQ1fKX+1odH/BQbb48fV6fj3CWHhsoIOoY87w1oHXm0qEpkIK3ZfVgp0hed3XBXzXMQ==} + engines: {node: '>=18'} + cpu: [ppc64] + os: [aix] + '@esbuild/android-arm64@0.27.7': resolution: {integrity: sha512-62dPZHpIXzvChfvfLJow3q5dDtiNMkwiRzPylSCfriLvZeq0a1bWChrGx/BbUbPwOrsWKMn8idSllklzBy+dgQ==} engines: {node: '>=18'} cpu: [arm64] os: [android] + '@esbuild/android-arm64@0.28.1': + resolution: {integrity: sha512-34EGEbCIAgosYz6goLcopX6Mo7NyGv9tfwEM2/7Ce2VcVRk568iSvniGWcUXIy7wEDR1wzolcxcriFVrWYcwBg==} + engines: {node: '>=18'} + cpu: [arm64] + os: [android] + '@esbuild/android-arm@0.27.7': resolution: {integrity: sha512-jbPXvB4Yj2yBV7HUfE2KHe4GJX51QplCN1pGbYjvsyCZbQmies29EoJbkEc+vYuU5o45AfQn37vZlyXy4YJ8RQ==} engines: 
{node: '>=18'} cpu: [arm] os: [android] + '@esbuild/android-arm@0.28.1': + resolution: {integrity: sha512-0k2F129Xdio1TdJfzJ8sy1Q47vUD2NnwdhiAf7drUN1EBTfPf4hsFCtmMgu/6m8JSzsBrlmVjudMBQqOfG8usQ==} + engines: {node: '>=18'} + cpu: [arm] + os: [android] + '@esbuild/android-x64@0.27.7': resolution: {integrity: sha512-x5VpMODneVDb70PYV2VQOmIUUiBtY3D3mPBG8NxVk5CogneYhkR7MmM3yR/uMdITLrC1ml/NV1rj4bMJuy9MCg==} engines: {node: '>=18'} cpu: [x64] os: [android] + '@esbuild/android-x64@0.28.1': + resolution: {integrity: sha512-dbwY7ltSMDWsRatcRpCnES4F+im88OCUgGZjy52shC7GqHRE/cYlxNbB4Z4UpJswpcc4Qxd2oE/ufM0p61IKng==} + engines: {node: '>=18'} + cpu: [x64] + os: [android] + '@esbuild/darwin-arm64@0.27.7': resolution: {integrity: sha512-5lckdqeuBPlKUwvoCXIgI2D9/ABmPq3Rdp7IfL70393YgaASt7tbju3Ac+ePVi3KDH6N2RqePfHnXkaDtY9fkw==} engines: {node: '>=18'} cpu: [arm64] os: [darwin] + '@esbuild/darwin-arm64@0.28.1': + resolution: {integrity: sha512-TZbWkQY7kvTAXbXUT7uVACR5cMHsDiSz9z7ZKAX/RTq/WJEk3QyRr0wZpNhBDX+/0CtdqUIJlOiodQcta6tY3Q==} + engines: {node: '>=18'} + cpu: [arm64] + os: [darwin] + '@esbuild/darwin-x64@0.27.7': resolution: {integrity: sha512-rYnXrKcXuT7Z+WL5K980jVFdvVKhCHhUwid+dDYQpH+qu+TefcomiMAJpIiC2EM3Rjtq0sO3StMV/+3w3MyyqQ==} engines: {node: '>=18'} cpu: [x64] os: [darwin] + '@esbuild/darwin-x64@0.28.1': + resolution: {integrity: sha512-zfdzgK9ACBNZLI/CyHTOx81SyNbM6YXn7rxSgX97VjyiPl9W1i4Ka4fgKECEoFCKGpvBj5qArWIGgQjOwkgskQ==} + engines: {node: '>=18'} + cpu: [x64] + os: [darwin] + '@esbuild/freebsd-arm64@0.27.7': resolution: {integrity: sha512-B48PqeCsEgOtzME2GbNM2roU29AMTuOIN91dsMO30t+Ydis3z/3Ngoj5hhnsOSSwNzS+6JppqWsuhTp6E82l2w==} engines: {node: '>=18'} cpu: [arm64] os: [freebsd] + '@esbuild/freebsd-arm64@0.28.1': + resolution: {integrity: sha512-wG2EA8ENdEI0qhkSZMjfqrdY+ziCYCPMmtZjjIwOmXFjmyzEHn+UUxk5of+SYsjtfs3VpnlC7QLzSI5hY/rOAw==} + engines: {node: '>=18'} + cpu: [arm64] + os: [freebsd] + '@esbuild/freebsd-x64@0.27.7': resolution: {integrity: 
sha512-jOBDK5XEjA4m5IJK3bpAQF9/Lelu/Z9ZcdhTRLf4cajlB+8VEhFFRjWgfy3M1O4rO2GQ/b2dLwCUGpiF/eATNQ==} engines: {node: '>=18'} cpu: [x64] os: [freebsd] + '@esbuild/freebsd-x64@0.28.1': + resolution: {integrity: sha512-i7dZ9vQgnvSCzi/rYCXNgtF/U+eKZNJBzu3eTQbRgHnM7tNSizLOkRFAl3qzVc/Op/u5YkHHa4pf/3DOYHthLQ==} + engines: {node: '>=18'} + cpu: [x64] + os: [freebsd] + '@esbuild/linux-arm64@0.27.7': resolution: {integrity: sha512-RZPHBoxXuNnPQO9rvjh5jdkRmVizktkT7TCDkDmQ0W2SwHInKCAV95GRuvdSvA7w4VMwfCjUiPwDi0ZO6Nfe9A==} engines: {node: '>=18'} cpu: [arm64] os: [linux] + '@esbuild/linux-arm64@0.28.1': + resolution: {integrity: sha512-yHs+0uc8+nvEAfAfxrWQKK5peSNzBc4PegcMO0EJ2hT71uA7vB8Ihg2e77R2P7SG5uYjPbHlLLmve4LLLRCf0g==} + engines: {node: '>=18'} + cpu: [arm64] + os: [linux] + '@esbuild/linux-arm@0.27.7': resolution: {integrity: sha512-RkT/YXYBTSULo3+af8Ib0ykH8u2MBh57o7q/DAs3lTJlyVQkgQvlrPTnjIzzRPQyavxtPtfg0EopvDyIt0j1rA==} engines: {node: '>=18'} cpu: [arm] os: [linux] + '@esbuild/linux-arm@0.28.1': + resolution: {integrity: sha512-qVXBOHQS+d5Y722GwJzJUtOLlX7km3CraOaGormF1pDtPd2C/l1SHRPgjLunLGe51Sh5YYWKMFDyV4SxgMQYTQ==} + engines: {node: '>=18'} + cpu: [arm] + os: [linux] + '@esbuild/linux-ia32@0.27.7': resolution: {integrity: sha512-GA48aKNkyQDbd3KtkplYWT102C5sn/EZTY4XROkxONgruHPU72l+gW+FfF8tf2cFjeHaRbWpOYa/uRBz/Xq1Pg==} engines: {node: '>=18'} cpu: [ia32] os: [linux] + '@esbuild/linux-ia32@0.28.1': + resolution: {integrity: sha512-d1z4ZuP0ajrfz/FhGT4vv278rX8KnPPJx8i5+AtK7TYbx9Le9F1hyzurZpkEyjkGa9dUGhQow4C1NmeGvqxN2w==} + engines: {node: '>=18'} + cpu: [ia32] + os: [linux] + '@esbuild/linux-loong64@0.27.7': resolution: {integrity: sha512-a4POruNM2oWsD4WKvBSEKGIiWQF8fZOAsycHOt6JBpZ+JN2n2JH9WAv56SOyu9X5IqAjqSIPTaJkqN8F7XOQ5Q==} engines: {node: '>=18'} cpu: [loong64] os: [linux] + '@esbuild/linux-loong64@0.28.1': + resolution: {integrity: sha512-M5sRjUVZrkm1OAPR3dlOYzNmN+loZKGVi1VUQGrwuqLcbR6qeAz+famMhjASeH3YVKvZz+zT1jlh/keC3Rj/lg==} + engines: {node: '>=18'} + cpu: [loong64] + 
os: [linux] + '@esbuild/linux-mips64el@0.27.7': resolution: {integrity: sha512-KabT5I6StirGfIz0FMgl1I+R1H73Gp0ofL9A3nG3i/cYFJzKHhouBV5VWK1CSgKvVaG4q1RNpCTR2LuTVB3fIw==} engines: {node: '>=18'} cpu: [mips64el] os: [linux] + '@esbuild/linux-mips64el@0.28.1': + resolution: {integrity: sha512-mRObBZeHh2OxcBFPWE/FjylkRgZdYuiTR3vaTozquCGOH14iP9oN4x4Ge81CoIDYQrXmIxpFumJBu5MtZpnQJQ==} + engines: {node: '>=18'} + cpu: [mips64el] + os: [linux] + '@esbuild/linux-ppc64@0.27.7': resolution: {integrity: sha512-gRsL4x6wsGHGRqhtI+ifpN/vpOFTQtnbsupUF5R5YTAg+y/lKelYR1hXbnBdzDjGbMYjVJLJTd2OFmMewAgwlQ==} engines: {node: '>=18'} cpu: [ppc64] os: [linux] + '@esbuild/linux-ppc64@0.28.1': + resolution: {integrity: sha512-slScBsMAb3GFDcdrCgLwZtPYRoH2H/youv10QiZyRjmsP48fznoveWytSgCI/R0ZcUgpc0ZhIUEx6LHts8yrfQ==} + engines: {node: '>=18'} + cpu: [ppc64] + os: [linux] + '@esbuild/linux-riscv64@0.27.7': resolution: {integrity: sha512-hL25LbxO1QOngGzu2U5xeXtxXcW+/GvMN3ejANqXkxZ/opySAZMrc+9LY/WyjAan41unrR3YrmtTsUpwT66InQ==} engines: {node: '>=18'} cpu: [riscv64] os: [linux] + '@esbuild/linux-riscv64@0.28.1': + resolution: {integrity: sha512-kw0owk1o0GFETUJyW0jc0G4Yzs0BHZn0JDZ8JRT088vjJYX777BAs1fDGxAC+q831qOs2DTC96mNsG2opdfyyQ==} + engines: {node: '>=18'} + cpu: [riscv64] + os: [linux] + '@esbuild/linux-s390x@0.27.7': resolution: {integrity: sha512-2k8go8Ycu1Kb46vEelhu1vqEP+UeRVj2zY1pSuPdgvbd5ykAw82Lrro28vXUrRmzEsUV0NzCf54yARIK8r0fdw==} engines: {node: '>=18'} cpu: [s390x] os: [linux] + '@esbuild/linux-s390x@0.28.1': + resolution: {integrity: sha512-/lAIjX8aYFRByhh6L5rYtPEDRqa9de/4V/juOXcta5frjvzXO4/sqEtyytse0g3zZFuWu5cDN0MkLz2qRDD2Ag==} + engines: {node: '>=18'} + cpu: [s390x] + os: [linux] + '@esbuild/linux-x64@0.27.7': resolution: {integrity: sha512-hzznmADPt+OmsYzw1EE33ccA+HPdIqiCRq7cQeL1Jlq2gb1+OyWBkMCrYGBJ+sxVzve2ZJEVeePbLM2iEIZSxA==} engines: {node: '>=18'} cpu: [x64] os: [linux] + '@esbuild/linux-x64@0.28.1': + resolution: {integrity: 
sha512-u/anNYF2mmVOEDwLtnQ1wOr3EZ9sTNGLWrsYGYwHWzGA3Si84IOkHXlbWTD1NB+9/1lcnweYKO54uhxZydNzfA==} + engines: {node: '>=18'} + cpu: [x64] + os: [linux] + '@esbuild/netbsd-arm64@0.27.7': resolution: {integrity: sha512-b6pqtrQdigZBwZxAn1UpazEisvwaIDvdbMbmrly7cDTMFnw/+3lVxxCTGOrkPVnsYIosJJXAsILG9XcQS+Yu6w==} engines: {node: '>=18'} cpu: [arm64] os: [netbsd] + '@esbuild/netbsd-arm64@0.28.1': + resolution: {integrity: sha512-oks0DYbLwWMmaakTsCb+zL4E+aHRVLom9IJZOAthMQEPiQmydXHkziYEsGYRx0uNV/IjEKGAV941JzH02pflqw==} + engines: {node: '>=18'} + cpu: [arm64] + os: [netbsd] + '@esbuild/netbsd-x64@0.27.7': resolution: {integrity: sha512-OfatkLojr6U+WN5EDYuoQhtM+1xco+/6FSzJJnuWiUw5eVcicbyK3dq5EeV/QHT1uy6GoDhGbFpprUiHUYggrw==} engines: {node: '>=18'} cpu: [x64] os: [netbsd] + '@esbuild/netbsd-x64@0.28.1': + resolution: {integrity: sha512-aeL6lAnN89Hz43Mlh1G8ARasbuoYvSITDEx0tHh5b7jJnHcssqgjy9Yx430GDpmCa6OyrKoS0aNRjKundRizGg==} + engines: {node: '>=18'} + cpu: [x64] + os: [netbsd] + '@esbuild/openbsd-arm64@0.27.7': resolution: {integrity: sha512-AFuojMQTxAz75Fo8idVcqoQWEHIXFRbOc1TrVcFSgCZtQfSdc1RXgB3tjOn/krRHENUB4j00bfGjyl2mJrU37A==} engines: {node: '>=18'} cpu: [arm64] os: [openbsd] + '@esbuild/openbsd-arm64@0.28.1': + resolution: {integrity: sha512-MEFJe5C3R8pwXdZ5Y21oo6m7ePiS0d9pWucn99O/wvyJZChoIQKrQDxKrGeW8F5+T0okTHesAmDeiHDTIq0V/Q==} + engines: {node: '>=18'} + cpu: [arm64] + os: [openbsd] + '@esbuild/openbsd-x64@0.27.7': resolution: {integrity: sha512-+A1NJmfM8WNDv5CLVQYJ5PshuRm/4cI6WMZRg1by1GwPIQPCTs1GLEUHwiiQGT5zDdyLiRM/l1G0Pv54gvtKIg==} engines: {node: '>=18'} cpu: [x64] os: [openbsd] + '@esbuild/openbsd-x64@0.28.1': + resolution: {integrity: sha512-i/ZLIOafE0Z8cI/XANJAixoJL/uRAoS2xOA3rb0xN+KK0K177cMAsQYkzHtBrtMXAKuAc7HGgcWiZ/sRC1Nxgw==} + engines: {node: '>=18'} + cpu: [x64] + os: [openbsd] + '@esbuild/openharmony-arm64@0.27.7': resolution: {integrity: sha512-+KrvYb/C8zA9CU/g0sR6w2RBw7IGc5J2BPnc3dYc5VJxHCSF1yNMxTV5LQ7GuKteQXZtspjFbiuW5/dOj7H4Yw==} engines: {node: '>=18'} 
cpu: [arm64] os: [openharmony] + '@esbuild/openharmony-arm64@0.28.1': + resolution: {integrity: sha512-ge+Z7EXFNt2BO1oAMsVpiQ8EwndV9i1xXerAeTIK7AtPs3bKFXQM7nlRxDSIUIMeueR1CNXxqztLzdNeReKBJg==} + engines: {node: '>=18'} + cpu: [arm64] + os: [openharmony] + '@esbuild/sunos-x64@0.27.7': resolution: {integrity: sha512-ikktIhFBzQNt/QDyOL580ti9+5mL/YZeUPKU2ivGtGjdTYoqz6jObj6nOMfhASpS4GU4Q/Clh1QtxWAvcYKamA==} engines: {node: '>=18'} cpu: [x64] os: [sunos] + '@esbuild/sunos-x64@0.28.1': + resolution: {integrity: sha512-BEjgtECkL3vY+SaSQ6nzVfiALUeFxpawyp8Jmf5PtYhf1Ug40N1h/hxlhts+f1FvSvarEigdxS3BlSMI2PJLcQ==} + engines: {node: '>=18'} + cpu: [x64] + os: [sunos] + '@esbuild/win32-arm64@0.27.7': resolution: {integrity: sha512-7yRhbHvPqSpRUV7Q20VuDwbjW5kIMwTHpptuUzV+AA46kiPze5Z7qgt6CLCK3pWFrHeNfDd1VKgyP4O+ng17CA==} engines: {node: '>=18'} cpu: [arm64] os: [win32] + '@esbuild/win32-arm64@0.28.1': + resolution: {integrity: sha512-lCv9eK/H6ZJWbE7bh2nw54CZ9M2nupBxJcTsdk/QQnWkdSjKGuxmmH8/GWrlT1eMmZfn4dGcCjRte397WqfQXA==} + engines: {node: '>=18'} + cpu: [arm64] + os: [win32] + '@esbuild/win32-ia32@0.27.7': resolution: {integrity: sha512-SmwKXe6VHIyZYbBLJrhOoCJRB/Z1tckzmgTLfFYOfpMAx63BJEaL9ExI8x7v0oAO3Zh6D/Oi1gVxEYr5oUCFhw==} engines: {node: '>=18'} cpu: [ia32] os: [win32] + '@esbuild/win32-ia32@0.28.1': + resolution: {integrity: sha512-zvb/mB2bSCoJOpoCBgYKKpX6YM6mJBlBUVUtVj41DlZJVEB6/0CKlRYxP5wWl1C1ILiCoAU5wZZ4q1P3qeS6Eg==} + engines: {node: '>=18'} + cpu: [ia32] + os: [win32] + '@esbuild/win32-x64@0.27.7': resolution: {integrity: sha512-56hiAJPhwQ1R4i+21FVF7V8kSD5zZTdHcVuRFMW0hn753vVfQN8xlx4uOPT4xoGH0Z/oVATuR82AiqSTDIpaHg==} engines: {node: '>=18'} cpu: [x64] os: [win32] + '@esbuild/win32-x64@0.28.1': + resolution: {integrity: sha512-bm4Mowrv+GXMlpWX++EcXw/iLyd1o3+bJkC2DkWXYVvgZCqD/bSj9ctZeAMC3cIxgjRVR2Dufaiu4YPxr5gW1A==} + engines: {node: '>=18'} + cpu: [x64] + os: [win32] + '@gerrit0/mini-shiki@3.23.0': resolution: {integrity: 
sha512-bEMORlG0cqdjVyCEuU0cDQbORWX+kYCeo0kV1lbxF5bt4r7SID2l9bqsxJEM0zndaxpOUT7riCyIVEuqq/Ynxg==} @@ -718,6 +877,11 @@ packages: engines: {node: '>=18'} hasBin: true + esbuild@0.28.1: + resolution: {integrity: sha512-HrJrvZv5ayxBzPfwphOoNzkzOIIlifzk0KJrGK2c8R4+LKpMtpYLQeUdjnwjWv/LZlkH2laZk+4w78pi99D4Vw==} + engines: {node: '>=18'} + hasBin: true + estree-walker@3.0.3: resolution: {integrity: sha512-7RUKfXgSMMkzt6ZuXmqapOurLGPPfgj6l9uRZ7lRGolvk0y2yocc35LdcxKC5PQZdn2DMqioAQ2NoWcrTKmm6g==} @@ -979,6 +1143,11 @@ packages: typescript: optional: true + tsx@4.22.4: + resolution: {integrity: sha512-X8EX+XV4QR5xCsrgxaED954zTDfY8KqlDtskKEL0cHhyS/P8b4IFOvGDQpsC9Q1XnLq915wEfwwY/zzskCtmhg==} + engines: {node: '>=18.0.0'} + hasBin: true + typedoc-plugin-markdown@4.12.0: resolution: {integrity: sha512-eJDEMAfxCmede22c/Jw7d0FA13ggAQv+KkwQYKYCdqI02cin6Rc9QRwbG/7XvvHWinuFejySnZVUWDtvGk3Vbg==} engines: {node: '>= 18'} @@ -1166,81 +1335,159 @@ snapshots: '@esbuild/aix-ppc64@0.27.7': optional: true + '@esbuild/aix-ppc64@0.28.1': + optional: true + '@esbuild/android-arm64@0.27.7': optional: true + '@esbuild/android-arm64@0.28.1': + optional: true + '@esbuild/android-arm@0.27.7': optional: true + '@esbuild/android-arm@0.28.1': + optional: true + '@esbuild/android-x64@0.27.7': optional: true + '@esbuild/android-x64@0.28.1': + optional: true + '@esbuild/darwin-arm64@0.27.7': optional: true + '@esbuild/darwin-arm64@0.28.1': + optional: true + '@esbuild/darwin-x64@0.27.7': optional: true + '@esbuild/darwin-x64@0.28.1': + optional: true + '@esbuild/freebsd-arm64@0.27.7': optional: true + '@esbuild/freebsd-arm64@0.28.1': + optional: true + '@esbuild/freebsd-x64@0.27.7': optional: true + '@esbuild/freebsd-x64@0.28.1': + optional: true + '@esbuild/linux-arm64@0.27.7': optional: true + '@esbuild/linux-arm64@0.28.1': + optional: true + '@esbuild/linux-arm@0.27.7': optional: true + '@esbuild/linux-arm@0.28.1': + optional: true + '@esbuild/linux-ia32@0.27.7': optional: true + 
'@esbuild/linux-ia32@0.28.1': + optional: true + '@esbuild/linux-loong64@0.27.7': optional: true + '@esbuild/linux-loong64@0.28.1': + optional: true + '@esbuild/linux-mips64el@0.27.7': optional: true + '@esbuild/linux-mips64el@0.28.1': + optional: true + '@esbuild/linux-ppc64@0.27.7': optional: true + '@esbuild/linux-ppc64@0.28.1': + optional: true + '@esbuild/linux-riscv64@0.27.7': optional: true + '@esbuild/linux-riscv64@0.28.1': + optional: true + '@esbuild/linux-s390x@0.27.7': optional: true + '@esbuild/linux-s390x@0.28.1': + optional: true + '@esbuild/linux-x64@0.27.7': optional: true + '@esbuild/linux-x64@0.28.1': + optional: true + '@esbuild/netbsd-arm64@0.27.7': optional: true + '@esbuild/netbsd-arm64@0.28.1': + optional: true + '@esbuild/netbsd-x64@0.27.7': optional: true + '@esbuild/netbsd-x64@0.28.1': + optional: true + '@esbuild/openbsd-arm64@0.27.7': optional: true + '@esbuild/openbsd-arm64@0.28.1': + optional: true + '@esbuild/openbsd-x64@0.27.7': optional: true + '@esbuild/openbsd-x64@0.28.1': + optional: true + '@esbuild/openharmony-arm64@0.27.7': optional: true + '@esbuild/openharmony-arm64@0.28.1': + optional: true + '@esbuild/sunos-x64@0.27.7': optional: true + '@esbuild/sunos-x64@0.28.1': + optional: true + '@esbuild/win32-arm64@0.27.7': optional: true + '@esbuild/win32-arm64@0.28.1': + optional: true + '@esbuild/win32-ia32@0.27.7': optional: true + '@esbuild/win32-ia32@0.28.1': + optional: true + '@esbuild/win32-x64@0.27.7': optional: true + '@esbuild/win32-x64@0.28.1': + optional: true + '@gerrit0/mini-shiki@3.23.0': dependencies: '@shikijs/engine-oniguruma': 3.23.0 @@ -1532,13 +1779,13 @@ snapshots: chai: 5.3.3 tinyrainbow: 2.0.0 - '@vitest/mocker@3.2.4(vite@7.3.2(@types/node@25.9.3)(yaml@2.9.0))': + '@vitest/mocker@3.2.4(vite@7.3.2(@types/node@25.9.3)(tsx@4.22.4)(yaml@2.9.0))': dependencies: '@vitest/spy': 3.2.4 estree-walker: 3.0.3 magic-string: 0.30.21 optionalDependencies: - vite: 7.3.2(@types/node@25.9.3)(yaml@2.9.0) + vite: 
7.3.2(@types/node@25.9.3)(tsx@4.22.4)(yaml@2.9.0) '@vitest/pretty-format@3.2.4': dependencies: @@ -1655,6 +1902,35 @@ snapshots: '@esbuild/win32-ia32': 0.27.7 '@esbuild/win32-x64': 0.27.7 + esbuild@0.28.1: + optionalDependencies: + '@esbuild/aix-ppc64': 0.28.1 + '@esbuild/android-arm': 0.28.1 + '@esbuild/android-arm64': 0.28.1 + '@esbuild/android-x64': 0.28.1 + '@esbuild/darwin-arm64': 0.28.1 + '@esbuild/darwin-x64': 0.28.1 + '@esbuild/freebsd-arm64': 0.28.1 + '@esbuild/freebsd-x64': 0.28.1 + '@esbuild/linux-arm': 0.28.1 + '@esbuild/linux-arm64': 0.28.1 + '@esbuild/linux-ia32': 0.28.1 + '@esbuild/linux-loong64': 0.28.1 + '@esbuild/linux-mips64el': 0.28.1 + '@esbuild/linux-ppc64': 0.28.1 + '@esbuild/linux-riscv64': 0.28.1 + '@esbuild/linux-s390x': 0.28.1 + '@esbuild/linux-x64': 0.28.1 + '@esbuild/netbsd-arm64': 0.28.1 + '@esbuild/netbsd-x64': 0.28.1 + '@esbuild/openbsd-arm64': 0.28.1 + '@esbuild/openbsd-x64': 0.28.1 + '@esbuild/openharmony-arm64': 0.28.1 + '@esbuild/sunos-x64': 0.28.1 + '@esbuild/win32-arm64': 0.28.1 + '@esbuild/win32-ia32': 0.28.1 + '@esbuild/win32-x64': 0.28.1 + estree-walker@3.0.3: dependencies: '@types/estree': 1.0.8 @@ -1784,11 +2060,12 @@ snapshots: optionalDependencies: fsevents: 2.3.2 - postcss-load-config@6.0.1(postcss@8.5.13)(yaml@2.9.0): + postcss-load-config@6.0.1(postcss@8.5.13)(tsx@4.22.4)(yaml@2.9.0): dependencies: lilconfig: 3.1.3 optionalDependencies: postcss: 8.5.13 + tsx: 4.22.4 yaml: 2.9.0 postcss@8.5.13: @@ -1885,7 +2162,7 @@ snapshots: ts-interface-checker@0.1.13: {} - tsup@8.5.1(postcss@8.5.13)(typescript@5.9.3)(yaml@2.9.0): + tsup@8.5.1(postcss@8.5.13)(tsx@4.22.4)(typescript@5.9.3)(yaml@2.9.0): dependencies: bundle-require: 5.1.0(esbuild@0.27.7) cac: 6.7.14 @@ -1896,7 +2173,7 @@ snapshots: fix-dts-default-cjs-exports: 1.0.1 joycon: 3.1.1 picocolors: 1.1.1 - postcss-load-config: 6.0.1(postcss@8.5.13)(yaml@2.9.0) + postcss-load-config: 6.0.1(postcss@8.5.13)(tsx@4.22.4)(yaml@2.9.0) resolve-from: 5.0.0 rollup: 4.60.2 source-map: 
0.7.6 @@ -1913,6 +2190,12 @@ snapshots: - tsx - yaml + tsx@4.22.4: + dependencies: + esbuild: 0.28.1 + optionalDependencies: + fsevents: 2.3.3 + typedoc-plugin-markdown@4.12.0(typedoc@0.28.19(typescript@5.9.3)): dependencies: typedoc: 0.28.19(typescript@5.9.3) @@ -1951,13 +2234,13 @@ snapshots: - utf-8-validate - zod - vite-node@3.2.4(@types/node@25.9.3)(yaml@2.9.0): + vite-node@3.2.4(@types/node@25.9.3)(tsx@4.22.4)(yaml@2.9.0): dependencies: cac: 6.7.14 debug: 4.4.3 es-module-lexer: 1.7.0 pathe: 2.0.3 - vite: 7.3.2(@types/node@25.9.3)(yaml@2.9.0) + vite: 7.3.2(@types/node@25.9.3)(tsx@4.22.4)(yaml@2.9.0) transitivePeerDependencies: - '@types/node' - jiti @@ -1972,7 +2255,7 @@ snapshots: - tsx - yaml - vite@7.3.2(@types/node@25.9.3)(yaml@2.9.0): + vite@7.3.2(@types/node@25.9.3)(tsx@4.22.4)(yaml@2.9.0): dependencies: esbuild: 0.27.7 fdir: 6.5.0(picomatch@4.0.4) @@ -1983,13 +2266,14 @@ snapshots: optionalDependencies: '@types/node': 25.9.3 fsevents: 2.3.3 + tsx: 4.22.4 yaml: 2.9.0 - vitest@3.2.4(@types/node@25.9.3)(yaml@2.9.0): + vitest@3.2.4(@types/node@25.9.3)(tsx@4.22.4)(yaml@2.9.0): dependencies: '@types/chai': 5.2.3 '@vitest/expect': 3.2.4 - '@vitest/mocker': 3.2.4(vite@7.3.2(@types/node@25.9.3)(yaml@2.9.0)) + '@vitest/mocker': 3.2.4(vite@7.3.2(@types/node@25.9.3)(tsx@4.22.4)(yaml@2.9.0)) '@vitest/pretty-format': 3.2.4 '@vitest/runner': 3.2.4 '@vitest/snapshot': 3.2.4 @@ -2007,8 +2291,8 @@ snapshots: tinyglobby: 0.2.16 tinypool: 1.1.1 tinyrainbow: 2.0.0 - vite: 7.3.2(@types/node@25.9.3)(yaml@2.9.0) - vite-node: 3.2.4(@types/node@25.9.3)(yaml@2.9.0) + vite: 7.3.2(@types/node@25.9.3)(tsx@4.22.4)(yaml@2.9.0) + vite-node: 3.2.4(@types/node@25.9.3)(tsx@4.22.4)(yaml@2.9.0) why-is-node-running: 2.3.0 optionalDependencies: '@types/node': 25.9.3 From 2bd13d9f2f07c0807de51dd5f710a9e36a98e203 Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Wed, 24 Jun 2026 05:40:10 -0600 Subject: [PATCH 4/5] =?UTF-8?q?fix(examples):=20coding-benchmark=20?= 
=?UTF-8?q?=E2=80=94=20gate=20the=20natural=20cheat,=20precise=20firewall?= =?UTF-8?q?=20claims,=20runnable-clone=20hardening?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address the union of four critical-audit lenses on the coding-benchmark example. Realness gate (HIGH): the gate fired on one strawman stub shape, not the natural cheat. Tightened each task's realImpl to require the actual hard-part work (refill MATH, quote-state tracking, capacity eviction) so a decoy token — a `refillPerSec` param name, a `for (` loop, a passthrough Map — no longer reads as a real impl, and added a third LRU scenario whose only passing path is the real eviction algorithm. The smoke test now asserts each NATURAL cheat is gated, per task. Firewall claims (HIGH/MEDIUM): the README/dispatch/scenarios claimed the agent "literally cannot read the answer key". The seeded test fixture is intentionally visible (TDD-style) and a multi-round agent can read it; only the LLM-judge rubric and realness signals are firewalled. Softened the claim to that precise truth, and rephrased "gates to composite 0 on the actual run" to "the smoke test asserts the gate; the run refines past it, so no leaderboard cell ends gated". 
DX/runtime hardening: parseArgs no longer swallows a following flag as an option value (`--reps --live` → reps=1, not NaN) and clamps reps to a positive integer; the matrix runDir is cleaned in a finally; the LLM judge is imported from /campaign so it resolves across the whole peer range; ensemble panel models are snapshot-dated; seedFile prefers the structured fs.write seam (no shell injection surface); the realness scan takes the seeded fixture as a reference so a real solution carries no spurious DEAD_ARTIFACT; stats fails loud on a missing scenarioId instead of merging into one bucket; renderStats prints a power caveat when n<6; advisory lint warnings now reach the refine prompt without gating allPass; casts narrowed; README notes the Node>=22.6 test-layer floor, the live-box PATH requirements, and the offline-ensemble degeneracy. --- examples/coding-benchmark/README.md | 37 +++-- examples/coding-benchmark/benchmark.ts | 87 ++++++---- .../coding-benchmark/coding-benchmark.test.ts | 87 +++++++++- examples/coding-benchmark/dispatch.ts | 66 +++++--- examples/coding-benchmark/eval.ts | 62 +++++-- examples/coding-benchmark/offline-box.ts | 8 +- examples/coding-benchmark/scenarios.ts | 153 ++++++++++++++---- examples/coding-benchmark/stats.ts | 26 ++- 8 files changed, 415 insertions(+), 111 deletions(-) diff --git a/examples/coding-benchmark/README.md b/examples/coding-benchmark/README.md index 1c55440..01e3f23 100644 --- a/examples/coding-benchmark/README.md +++ b/examples/coding-benchmark/README.md @@ -1,6 +1,6 @@ # coding-benchmark -**Run the same coding task across coding agents — fairly, honestly, with real statistics — in 7 files of pure composition.** Every moving part is an `agent-runtime` or `agent-eval` primitive. Zero bespoke harness code, no hand-rolled scorer, no hand-rolled statistics. 
+**Run the same coding task across coding agents — fairly, honestly, with real statistics — as thin composition over `agent-runtime` / `agent-eval` primitives.** The scorer, the stats, the verifier, and the realness gate are all substrate calls, not reimplemented. The glue this example owns is small and named (an in-process offline box, the per-round refine loop, the leaderboard render); the load-bearing scoring and statistics are not hand-rolled. ```bash # offline — no creds, no network. Runs the whole pipeline against an in-process box @@ -23,7 +23,7 @@ One coding task, run across a **matrix** of three axes, scored, and compared wit | Axis | What varies | Where | |---|---|---| | **harness** | claude-code / opencode / codex / cli, each on its **baseline default profile** (no skills, no injected prompt — we measure the harness, not our scaffolding) | `profiles.ts` | -| **scenario** | the held-out coding tasks (a token-bucket rate limiter, an RFC-4180 CSV parser) | `scenarios.ts` | +| **scenario** | the held-out coding tasks (a token-bucket rate limiter, an RFC-4180 CSV parser, an LRU cache whose only passing path is the real eviction algorithm) | `scenarios.ts` | | **tool surface** | `none` / `web` / `search-mcp` — folded in as a one-line knob (`--tools`) | `profiles.ts` | The agent gets up to **3 refine rounds** in **one persistent box**: round N+1's prompt is built from round N's *check failures* (and nothing else — see the firewall). It stops the moment the deterministic checks pass. @@ -32,17 +32,21 @@ The output is a leaderboard with confidence bands and a significance matrix: ``` Harness leaderboard (mean composite, 95% CI; pass-rate, Wilson CI): - claude-code-baseline composite 0.813 [0.813, 0.813] pass 100% [34%, 100%] (n=2) + claude-code-baseline composite 0.813 [0.813, 0.813] pass 100% [44%, 100%] (n=3) ... Pairwise (paired delta + bootstrap CI; paired-test p, BH-corrected): opencode-baseline − claude-code-baseline: Δ=0.000 [0.000, 0.000] p=1.000 n.s. 
+ + NOTE: n=3 scenarios — below the power floor. The paired tests above cannot reach + significance at this corpus size (they demonstrate the wiring). Use 20-50 tasks for + a real harness comparison. ``` -> **Offline, every harness writes the same scripted solution and is scored by the same deterministic mock judge, so all deltas are 0.000** — the honest no-variance result, not a bug. The whole pipeline (matrix, verifier, realness gate, judge wiring, stats, firewall) runs for real; only the agent and the judge model are stubbed offline. `--live` swaps in real harness boxes and a real judge model and the harnesses separate. +> **Offline, every harness writes the same scripted solution and is scored by the same deterministic mock judge, so all deltas are 0.000** — the honest no-variance result, not a bug. The whole pipeline (matrix, verifier, realness gate, judge wiring, stats, firewall) runs for real; only the agent and the judge model are stubbed offline. **Offline the `--ensemble` panel is degenerate too: all three cross-family models share the one mock transport and return the identical verdict — cross-family independence is a live-only property.** `--live` swaps in real harness boxes, a real judge model, and (with `--ensemble`) three genuinely independent models, and the harnesses separate. ### The offline "agent" is a scripted stand-in -Offline there is no model, so each scenario's box writes a **canned solution** instead of calling a coding agent — a deterministic stand-in so the example runs with no creds. The scripts are honest: `rate-limiter` **improves across rounds** — round 0 is a genuinely hollow `return true` stub (no refill math, the constructor args unused) that the realness gate **gates to composite 0** on the actual run, and round 1+ is the real token-bucket. That's a real refine demo where the anti-cheat gate fires on the benchmark's **own data**, not only in a unit test. 
Offline the toolchain (`tsc` / `biome` / `node --test`) isn't on PATH, so the checks fail fast and all 3 rounds run — which is exactly when you want to see refinement. +Offline there is no model, so each scenario's box writes a **canned solution** instead of calling a coding agent — a deterministic stand-in so the example runs with no creds. The scripts are honest: `rate-limiter` **improves across rounds** — round 0 is a genuinely hollow `return true` stub (no refill math, the constructor args unused) and round 1+ is the real token-bucket. The smoke test asserts the realness gate **gates that exact round-0 stub to composite 0** (the anti-cheat demo fires on the dispatch's own content, not a hand-built strawman). In the offline *run*, the refine loop then advances past round 0 to the real impl, so **no leaderboard cell ends up gated** — the gate-to-0 is proven against the dispatch's round-0 content, not produced as a gated row in the scored output. Offline the toolchain (`tsc` / `biome` / `node --test`) isn't on PATH, so the checks fail fast and all 3 rounds run — which is exactly when you want to see refinement. ## How a tool swap works (one line) @@ -56,12 +60,13 @@ withTools(profile, 'search-mcp') // mount a search MCP instead On the CLI it's `--tools none|web|search-mcp`. **Honesty caveat:** a preset only takes effect for a `(harness, lever)` pair the sandbox actually materializes — if a harness has no native `webfetch`, `--tools web` is a no-op *there*. That's a substrate fact, not something this example papers over. Check `@tangle-network/sandbox` for the materialization matrix before trusting a tool swap on a given harness. -## How it stays honest (the no-cheat firewall) +## How it stays honest (the grading-criteria firewall) -**The agent's context is the task prompt — and nothing else.** The grading criteria never reach the box. 
+**The LLM-judge rubric, the grading note, and the realness signals never reach the box** — so the agent cannot steer toward the criteria it is scored on. The test fixture is a different case, and the example is precise about it: the test is *seeded* into the box and a multi-round agent **can** read it, exactly as in real TDD. -- A `CodingScenario` (`scenarios.ts`) splits into `prompt` (the **only** field the agent sees) and eval-only fields: the hidden test fixture, the realness signals, the rubric note. Because they're different fields on one object, "the agent reads the answer key" becomes something you can **see in one place** — it would require the dispatch to write a non-`prompt` field into the agent's context. -- **It does not.** The firewall is one labeled block in **`dispatch.ts`** (`THE NO-CHEAT FIREWALL LIVES HERE`): the only thing the agent reads is `scenario.prompt`, plus next-round prompts built **only** from check pass/fail + output. The hidden test is *seeded* into the box (so `node --test` has a file to run) but its assertions are never described to the agent; the rubric, the realness signals, and the judge are read *after* the loop, never written in. +- A `CodingScenario` (`scenarios.ts`) splits by where each field flows: `prompt` (the only field copied into the agent's **context**), the `fixture` (the deterministic test — **seeded into the workspace**, so `node --test` has a file to run), and the rubric note + realness signals (read **after** the loop by `eval.ts`, **never written into the box**). +- The firewall is one labeled block in **`dispatch.ts`** (`THE GRADING-CRITERIA FIREWALL LIVES HERE`): the only thing the agent's context gets is `scenario.prompt`, plus next-round prompts built **only** from check pass/fail + output. Because the criteria are different fields that the dispatch never writes into the profile, you can **see in one place** that the rubric/realness can't reach the agent. 
+- **What this protects, precisely:** the agent cannot read the LLM-judge rubric or the realness signals (the metric it would otherwise game). It **can** read the seeded test fixture — that is intentional. The test is a *spec the agent is asked to satisfy*, not a hidden answer key; a coding harness has native file-read tools, and across the 3 refine rounds the agent (and the next-round prompt, which includes the test runner's failure output) sees the assertions. That is the same contract as TDD and is honest for a benchmark: the protected secret is the *grading rubric*, not the tests. - The realness gate runs **after** the loop and is recorded on the run — the agent can't steer toward a metric it can't read. ## How it scores (validators before judge) @@ -69,12 +74,12 @@ On the CLI it's `--tools none|web|search-mcp`. **Honesty caveat:** a preset only Scoring runs in strict order, cheapest and most objective first — an `agent-eval` primitive at each layer: 1. **Deterministic checks (first, in the box, ~$0).** An ordered **`MultiLayerVerifier`** pipeline: `typecheck → test → lint`, with dependency-based skip (test never runs on a type error) and a blended score. typecheck + test gate `allPass` (and the refine loop); lint is advisory. These pass/fail booleans are the only thing that steers the next round. The test layer runs `node --experimental-transform-types --test`, not plain `node --test`: the fixture imports the solution as a `.ts` file, and Node's default type-*stripping* throws on constructor parameter properties (`constructor(private x: number)`) — the exact style the canonical impl uses — so a correct solution would otherwise score as a test failure. (`eval.ts` · `runChecks`) -2. **Realness gate (no LLM, and it GATES).** `scoreAuthenticity` + `gateRealness` — a pure structural scan that catches a stub that compiles but fakes the hard part. 
It is not just recorded: a **gated** artifact short-circuits the judge to composite **0 with no model call** (a hollow `return true` rate-limiter cannot earn a score, however confident a judge would be). On the sample tasks it scores a real impl realness **85** and the offline round-0 stub **gated → composite 0** — and the smoke test asserts the gate against the **exact stub the dispatch writes**, so the demo fires on the benchmark's own data, not only on a hand-built strawman. (`eval.ts` · `realnessGate`) +2. **Realness gate (no LLM, and it GATES).** `scoreAuthenticity` + `gateRealness` — a pure structural scan that catches the stub shapes each task's signals encode. It is not just recorded: a **gated** artifact short-circuits the judge to composite **0 with no model call**. The gate fires on `fakeShim && !realImpl`, so each task's `realImpl` is anchored to the actual hard-part work (refill *math*, quote-state tracking, capacity eviction) and its `fakeShim` to the natural shortcut — tuned so the **natural cheat gates, not just one strawman**: a `return true` rate-limiter whose only "refill" is a constructor param name, a `for (… input.split('\n'))` CSV split, and a no-eviction `Map` wrapper all gate. It is **not** a general "any fake is caught" guarantee — it catches the specific shapes listed (the smoke test asserts each natural cheat is gated, on the dispatch's own content). On the sample tasks a real impl scores realness **85** and each cheat is **gated → composite 0**. (`eval.ts` · `realnessGate`, `scenarios.ts` · `realnessSignals`) 3. **LLM judge (last, only on the band the checks can't resolve).** A 4-dimension weighted rubric — correctness 0.40 · completeness 0.25 · code_quality 0.20 · robustness 0.15. The rubric text + anchors live **with the judge**, never in the workdir. (`eval.ts`) **How many judges:** - **Default: 1** — `singleCodeJudge`, built from `llmJudge` (one model call). Cheap, for the leaderboard sweep. 
-- **`--ensemble`: 3** — `ensembleCodeJudge`, built from `ensembleJudge`, three **cross-family** models (deepseek + openai + google). `crossFamily: true` rejects a same-family panel at construction, so the ensemble is genuinely independent. The panel sees the **same full context** (code + check results + rubric note) the single judge does. Use it only for a ship/no-ship claim. +- **`--ensemble`: 3** — `ensembleCodeJudge`, built from `ensembleJudge`, three **cross-family**, snapshot-dated models (deepseek + openai + google). `crossFamily: true` rejects a same-family panel at construction, so the ensemble is genuinely independent **live**. The panel sees the **same full context** (code + check results + rubric note) the single judge does. Use it only for a ship/no-ship claim. (Offline, all three share the mock transport — see the offline note above.) ## How the stats are real (`stats.ts`) @@ -84,6 +89,9 @@ Every number is one `agent-eval` primitive call — **no hand-rolled statistics - per-harness **pass-rate + Wilson binomial CI** (`wilson`) — the correct interval for a proportion - every harness **pair** compared on **matched scenarios** with a **real paired test** (`pairedTTest`, or `wilcoxonSignedRank` for the non-parametric path) for the p-value, and a **paired bootstrap** (`pairedBootstrap`) for the effect size + CI, then **BH-corrected** across all pairs (`benjaminiHochberg`) so running many comparisons doesn't manufacture a false winner. - **Reps don't fake independent n — anywhere.** The paired unit is the *scenario*, and **the leaderboard uses the same unit**: with `--reps > 1`, a harness produces several records per scenario, so BOTH the leaderboard CI/Wilson AND the pairing collapse reps to **one mean per (harness, scenario)** before computing anything. Reps tighten the per-cell *estimate*; they are not independent samples, so they never narrow the interval out of zero new information. 
The reported `n` is the number of distinct scenarios, not the record count. (A regression test asserts identical reps leave the CI unchanged.) +- A record missing its `scenarioId` is a **loud throw**, not a silent merge — averaging distinct scenarios into one `''` bucket would corrupt the pairing, so it fails fast instead. + +> **Power caveat.** The example corpus is **3 tasks** — far below what these tests need to separate harnesses. The Wilcoxon path returns `p=1` for fewer than 6 non-zero diffs, and the paired t-test has only 2 degrees of freedom (n − 1), so at this corpus size the significance machinery is structurally **non-significant**; it demonstrates the *wiring*, not a defensible claim. `renderStats` prints this caveat whenever `n < 6`. A real harness comparison wants **20-50 tasks**. The leaderboard labels are the readable harness names, not the matrix's internal profile hashes. @@ -91,14 +99,14 @@ The leaderboard labels are the readable harness names, not the matrix's internal | File | What it owns | |---|---| -| `scenarios.ts` | the held-out task corpus + the firewall-as-a-type (`prompt` vs eval-only fields) + the seeded test fixtures + the check commands | +| `scenarios.ts` | the 3-task held-out corpus + the firewall-as-a-type (`prompt` vs rubric/realness vs the seeded fixture) + the seeded test fixtures + the check commands + the realness signals (tuned so the natural cheat gates) | | `profiles.ts` | the harness axis (one bare baseline `AgentProfile` per harness) **and** the one-line tool knob (`withTools` + presets) | | `eval.ts` | the scoring stack: `runChecks` (`MultiLayerVerifier`) + `realnessGate` + `singleCodeJudge` (`llmJudge`) / `ensembleCodeJudge` (`ensembleJudge`) | | `dispatch.ts` | renders one matrix cell: persistent box + multi-round refine + token metering.
**The firewall lives here.** | | `offline-box.ts` | an in-process `SandboxClient` so the whole thing runs with no creds | | `stats.ts` | leaderboard + `pairedTTest` / `pairedBootstrap` / `benjaminiHochberg` / `confidenceInterval` / `wilson` | | `benchmark.ts` | the entrypoint: build the axes, hand the matrix the dispatch + judges, run, print stats | -| `coding-benchmark.test.ts` | offline smoke — the matrix produces `harnesses × scenarios × reps` records, and the realness gate catches a stub | +| `coding-benchmark.test.ts` | offline smoke — the matrix produces `harnesses × scenarios × reps` records; the realness gate gates the dispatch's round-0 stub AND each natural cheat (per task); reps don't narrow the CI | ## Primitives composed @@ -107,7 +115,7 @@ The leaderboard labels are the readable harness names, not the matrix's internal - **deterministic layer:** `MultiLayerVerifier` — ordered `typecheck → test → lint` with dependency-based skip and a blended score (`@tangle-network/agent-eval`) - **token metering:** `extractLlmCallEvent` (`@tangle-network/agent-runtime/loops`) — reads usage off **every** backend event shape (`done` / `result` / `llm_call` / `usage`) so the integrity guard sees a real run - **realness:** `scoreAuthenticity` + `gateRealness` (`@tangle-network/agent-eval/authenticity`) -- **judges:** `llmJudge` (single model call → canonical `JudgeConfig`) and `ensembleJudge` for the cross-family panel (`@tangle-network/agent-eval`); the judge transport is a `ChatClient` (`createChatClient` — a `mock` handler offline, the `router` live) +- **judges:** `llmJudge` (single model call → canonical `JudgeConfig`, imported from `@tangle-network/agent-eval/campaign` so it resolves across the whole peer range) and `ensembleJudge` for the cross-family panel (`@tangle-network/agent-eval`); the judge transport is a `ChatClient` (`createChatClient` — a `mock` handler offline, the `router` live) - **integrity:** `integrity: 'assert'` on the matrix proves a real 
backend ran (no stubbed cell) — `'off'` only for the offline mock - **stats:** `pairedTTest`, `wilcoxonSignedRank`, `pairedBootstrap`, `benjaminiHochberg`, `confidenceInterval`, `wilson` @@ -118,6 +126,7 @@ The leaderboard labels are the readable harness names, not the matrix's internal 1. **`TANGLE_API_KEY` + `SANDBOX_BASE_URL`** — the dispatch lazily `import()`s `@tangle-network/sandbox` (behind the live flag, so the offline path never needs the SDK) and creates a real harness box per cell. 2. **A real judge model** — the judge's `ChatClient` becomes `createChatClient({ transport: 'router', apiKey })`; set `JUDGE_MODEL` (and optionally `TANGLE_ROUTER_URL`) to point it at your router. `--ensemble` then calls three real cross-family models. 3. The matrix runs with `integrity: 'assert'`, so a cell that produced no real token usage fails loudly instead of reporting a clean stub leaderboard. +4. **The harness box image must provide the toolchain on `PATH`** — the deterministic checks invoke bare `tsc`, `biome`, and `node --experimental-transform-types`. The test layer needs **Node >= 22.6** (for `--experimental-transform-types` and `.ts`-import test execution); on an older Node a correct param-property solution would fail with no hint why. A missing **advisory** tool (`biome`) folds to 0.5 and doesn't gate; a missing **`tsc`** gates the cell — so sanity-check your box image before trusting a live leaderboard. (Offline, a missing tool reads as a fail-fast, which is the honest no-toolchain signal.) Everything else — the dispatch, the verifier, the realness gate, the stats — is identical between offline and live. That's the point: only the agent and the judge model change. diff --git a/examples/coding-benchmark/benchmark.ts b/examples/coding-benchmark/benchmark.ts index 922ef67..2942d1f 100644 --- a/examples/coding-benchmark/benchmark.ts +++ b/examples/coding-benchmark/benchmark.ts @@ -17,7 +17,7 @@ * dispatch + the judge(s), run it, then compute pairwise stats. 
~40 lines of glue. */ -import { mkdtempSync } from 'node:fs' +import { mkdtempSync, rmSync } from 'node:fs' import { tmpdir } from 'node:os' import { join } from 'node:path' import { @@ -50,15 +50,24 @@ export interface BenchmarkOptions { // ── flags ─────────────────────────────────────────────────────────────────── function parseArgs(argv: string[]): BenchmarkOptions { const flag = (name: string) => argv.includes(`--${name}`) + // A value is the token AFTER `--name`, but only when it is not itself a flag — so + // `--reps --live` does NOT consume `--live` as reps' value (which would yield NaN); + // it falls back instead. `opt` never swallows a following flag. const opt = (name: string, fallback: string) => { const i = argv.indexOf(`--${name}`) - return i >= 0 && argv[i + 1] ? (argv[i + 1] as string) : fallback + if (i < 0) return fallback + const v = argv[i + 1] + return v && !v.startsWith('--') ? v : fallback } + // Clamp reps to a positive integer — a non-numeric or <1 value falls back to 1 rather + // than becoming a silent 0/NaN rep count that produces an empty matrix. + const repsRaw = Math.floor(Number(opt('reps', '1'))) + const reps = Number.isFinite(repsRaw) && repsRaw >= 1 ? repsRaw : 1 return { live: flag('live'), ensemble: flag('ensemble'), toolPreset: opt('tools', 'none') as ToolPreset, - reps: Number(opt('reps', '1')), + reps, } } @@ -101,6 +110,20 @@ export const offlineSolutions: Record = { ` else if (c === '\\n') { row.push(field); rows.push(row); row = []; field = '' }\n` + ` else field += c\n }\n row.push(field); rows.push(row)\n return rows\n}\n`, }, + 'lru-cache': { + path: 'src/lru.ts', + // Writes the real insertion-ordered-Map LRU from round 0 (the eviction logic is the + // whole point; there is no honest hollow stub for this task). Passes realness (85) + // and the seeded eviction tests.
+ solutionFor: () => + `export class LruCache {\n private map = new Map()\n` + + ` constructor(private capacity: number) {}\n` + + ` get(key: K): V | undefined {\n if (!this.map.has(key)) return undefined\n` + + ` const v = this.map.get(key) as V\n this.map.delete(key)\n this.map.set(key, v)\n return v\n }\n` + + ` set(key: K, value: V): void {\n if (this.map.has(key)) this.map.delete(key)\n` + + ` else if (this.map.size >= this.capacity) this.map.delete(this.map.keys().next().value as K)\n` + + ` this.map.set(key, value)\n }\n}\n`, + }, } // ── the box client: live (real harness) or offline (in-process) ─────────────── @@ -200,34 +223,40 @@ export async function main(argv: string[] = process.argv.slice(2)): Promise({ - profiles: harnessProfiles, // axis: harness × baseline - scenarios: [scenario], // axis: tasks (one at a time so the offline client matches) - dispatch: codingDispatch(toolPreset, resolveClient(scenario)), - judges: judges(opts, chat), - reps, - integrity: live ? 'assert' : 'off', // offline mock has no real backend; live proves it - costCeiling: 5, - runDir, - commitSha: process.env.GIT_SHA ?? 'example', - storage: inMemoryCampaignStorage(), - }) - allRecords.push(...result.records) - } + try { + // The matrix runs one campaign per profile. The dispatch is per-scenario only in + // its CLIENT (offline scripts differ by scenario), so run each scenario's matrix + // and merge the records. (Live, one client serves all scenarios — collapse this.) + const allRecords = [] + for (const scenario of scenarios) { + const result = await runProfileMatrix({ + profiles: harnessProfiles, // axis: harness × baseline + scenarios: [scenario], // axis: tasks (one at a time so the offline client matches) + dispatch: codingDispatch(toolPreset, resolveClient(scenario)), + judges: judges(opts, chat), + reps, + integrity: live ? 'assert' : 'off', // offline mock has no real backend; live proves it + costCeiling: 5, + runDir, + commitSha: process.env.GIT_SHA ?? 
'example', + storage: inMemoryCampaignStorage(), + }) + allRecords.push(...result.records) + } - // Map the matrix's hashed profileId → the readable harness name for the leaderboard. - const nameById = new Map(harnessProfiles.map((p) => [agentProfileId(p), p.name ?? 'unknown'])) - const nameOf = (id: string) => nameById.get(id) ?? id - const report = pairwiseStats(allRecords, nameOf) + // Map the matrix's hashed profileId → the readable harness name for the leaderboard. + const nameById = new Map(harnessProfiles.map((p) => [agentProfileId(p), p.name ?? 'unknown'])) + const nameOf = (id: string) => nameById.get(id) ?? id + const report = pairwiseStats(allRecords, nameOf) - console.log(`\nrecords: ${allRecords.length}\n`) - console.log(renderStats(report)) - return { records: allRecords.length, leaderboard: report.leaderboard.length } + console.log(`\nrecords: ${allRecords.length}\n`) + console.log(renderStats(report)) + return { records: allRecords.length, leaderboard: report.leaderboard.length } + } finally { + // The matrix writes its run artifacts under `runDir`; tear the temp tree down so + // repeated runs don't leak `/tmp/coding-benchmark-*` directories. + rmSync(runDir, { recursive: true, force: true }) + } } export interface RunArtifactSummary { diff --git a/examples/coding-benchmark/coding-benchmark.test.ts b/examples/coding-benchmark/coding-benchmark.test.ts index 802127e..93b21d1 100644 --- a/examples/coding-benchmark/coding-benchmark.test.ts +++ b/examples/coding-benchmark/coding-benchmark.test.ts @@ -6,18 +6,39 @@ * 2. the realness gate gates the ACTUAL round-0 stub the dispatch writes (not a * separate strawman) to composite 0 — the anti-cheat demo fires on the * benchmark's own data, and passes the real refined impl; - * 3. reps tighten the per-cell estimate HONESTLY — identical reps do NOT narrow + * 3. 
the gate also fires on the NATURAL cheat for EVERY task (not just one stub + * shape) — the README's "catches the listed stub shapes" claim, ground-truthed; + * 4. a real impl scored WITH its seeded fixture as a reference carries no spurious + * DEAD_ARTIFACT flag (the realness scan sees the artifact is imported); + * 5. reps tighten the per-cell estimate HONESTLY — identical reps do NOT narrow * the leaderboard CI vs reps=1 (reps are not independent samples). */ import type { RunRecord } from '@tangle-network/agent-eval' +import type { ProducedFile } from '@tangle-network/agent-eval/authenticity' import { describe, expect, it } from 'vitest' import { main, offlineSolutions } from './benchmark' import { realnessGate } from './eval' import { harnessProfiles } from './profiles' -import { scenarios } from './scenarios' +import { type CodingScenario, scenarios } from './scenarios' import { pairwiseStats } from './stats' +/** The natural cheat per scenario — the shortcut a real agent would actually reach for, + * NOT a hand-built strawman: the gate must fire on each of these. */ +const naturalCheats: Record = { + // hollow body, but the constructor param is named `refillPerSec` (the prompt's own + // name) — the decoy that defeated a `/refill/` realImpl. + 'rate-limiter': + 'export class RateLimiter {\n constructor(_capacity: number, refillPerSec: number) {}\n tryRemove(n: number): boolean { return true }\n}\n', + // a for-loop is present, but it splits on newline+comma — the naive parse the + // RFC-4180 cases break. The loop must NOT read as a real impl. + 'csv-parser': + 'export function parseCsv(input: string): string[][] {\n const out: string[][] = []\n for (const line of input.split("\\n")) out.push(line.split(","))\n return out\n}\n', + // a Map wrapper that never evicts — grows without bound, fails the at-capacity test. 
+ 'lru-cache': + 'export class LruCache {\n private store = new Map()\n constructor(_capacity: number) {}\n get(k: K) { return this.store.get(k) }\n set(k: K, v: V) { this.store.set(k, v) }\n}\n', +} + describe('coding-benchmark (offline)', () => { // Integration smoke: runs the real matrix end-to-end (real box.exec on the offline // toolchain, all refine rounds since the checks can't pass without the toolchain). @@ -57,6 +78,68 @@ describe('coding-benchmark (offline)', () => { expect(verdict.score).toBeGreaterThan(0) }) + // The HIGH-severity claim: the gate fires on the NATURAL cheat for every task, not + // only on one strawman stub shape. Each cheat below is the realistic shortcut with a + // decoy token that defeated a looser realImpl — all must gate. + it.each(scenarios)('gates the natural cheat for $id', (scenario: CodingScenario) => { + const cheat = naturalCheats[scenario.id] + expect(cheat, `no natural-cheat fixture for ${scenario.id}`).toBeDefined() + const verdict = realnessGate( + [{ path: scenario.solutionPath, content: cheat as string }], + scenario.realnessSignals, + ) + expect(verdict.gated, `natural cheat for ${scenario.id} slipped past the gate`).toBe(true) + expect(verdict.score).toBe(0) + }) + + // The real offline solution for every task scores real (not gated). Confirms each + // scenario's tightened realImpl still accepts the genuine implementation. 
+ it.each(scenarios)('passes the real offline solution for $id', (scenario: CodingScenario) => { + const script = offlineSolutions[scenario.id] + expect(script, `no offline solution for ${scenario.id}`).toBeDefined() + const content = (script as NonNullable).solutionFor(99) // settled round + const verdict = realnessGate( + [{ path: scenario.solutionPath, content }], + scenario.realnessSignals, + ) + expect(verdict.gated, `real solution for ${scenario.id} was wrongly gated`).toBe(false) + expect(verdict.score).toBeGreaterThan(0) + }) + + // The runtime DEAD_ARTIFACT fix: a real solution scored WITH its seeded fixture as a + // non-scored reference carries no DEAD_ARTIFACT flag (the scan sees it IS imported). + it.each( + scenarios, + )('does not flag DEAD_ARTIFACT on the real $id solution', (scenario: CodingScenario) => { + const script = offlineSolutions[scenario.id] + const content = (script as NonNullable).solutionFor(99) + const reference: ProducedFile[] = [ + { path: scenario.fixture.path, content: scenario.fixture.content }, + ] + const verdict = realnessGate( + [{ path: scenario.solutionPath, content }], + scenario.realnessSignals, + reference, + ) + expect(verdict.notes).not.toContain('DEAD_ARTIFACT') + }) + + // A reference cannot rescue a cheat: the gate still fires with the fixture present. + it.each( + scenarios, + )('a fixture reference does not rescue the $id cheat', (scenario: CodingScenario) => { + const cheat = naturalCheats[scenario.id] as string + const reference: ProducedFile[] = [ + { path: scenario.fixture.path, content: scenario.fixture.content }, + ] + const verdict = realnessGate( + [{ path: scenario.solutionPath, content: cheat }], + scenario.realnessSignals, + reference, + ) + expect(verdict.gated).toBe(true) + }) + it('reps do NOT fake independent n — identical reps leave the CI unchanged', () => { // Two harnesses, two scenarios, identical scores. 
Build records for reps=1 and // reps=3 (the extra reps are exact duplicates → zero new information). The honest diff --git a/examples/coding-benchmark/dispatch.ts b/examples/coding-benchmark/dispatch.ts index 7143a13..aaa0737 100644 --- a/examples/coding-benchmark/dispatch.ts +++ b/examples/coding-benchmark/dispatch.ts @@ -14,13 +14,18 @@ * guard sees a real run. * * ┌─────────────────────────────────────────────────────────────────────────┐ - * │ THE NO-CHEAT FIREWALL LIVES HERE. │ - * │ The ONLY scenario field that ever reaches the box is `scenario.prompt` │ - * │ (the `taskToPrompt` below, and `nextPrompt` built ONLY from validator │ - * │ output). The hidden test is SEEDED into the box but never described to │ - * │ the agent; the rubric, the realness signals, and the grading note are │ - * │ read later by eval.ts — never written into the box. The agent literally │ - * │ cannot read the answer key. │ + * │ THE GRADING-CRITERIA FIREWALL LIVES HERE. │ + * │ The ONLY scenario field that reaches the agent's CONTEXT is │ + * │ `scenario.prompt` (the `taskToPrompt` below, and `nextPrompt` built ONLY │ + * │ from check output). The LLM-judge rubric, the grading note, and the │ + * │ realness signals are read later by eval.ts — they are never written into │ + * │ the box, so the agent cannot steer toward the criteria it is scored on. │ + * │ │ + * │ The deterministic test fixture is a different case: it is SEEDED into the │ + * │ box workspace (so `node --test` has a file to run) and a multi-round │ + * │ agent with native file tools CAN read it — intentional, the same as real │ + * │ TDD. The test is a SPEC the agent is asked to satisfy, not a hidden │ + * │ rubric. So: the rubric/realness are firewalled; the test is visible. 
│ * └─────────────────────────────────────────────────────────────────────────┘ */ @@ -33,6 +38,7 @@ import { openSandboxRun, type SandboxClient, } from '@tangle-network/agent-runtime/loops' +import type { SandboxEvent } from '@tangle-network/sandbox' import { type CheckBox, layerOutput, type RunArtifact, realnessGate, runChecks } from './eval' import { harnessOf, type ToolPreset, withTools } from './profiles' import { type CodingScenario, checkCmds } from './scenarios' @@ -43,14 +49,29 @@ const maxRounds = 3 /** Build the next-round prompt from the checks the AGENT is allowed to see — the * pass/fail + output of the deterministic layers. NEVER from the rubric, realness, * or judge. This is the firewall in action: the agent steers on objective check - * failures, nothing else. */ + * failures, nothing else. + * + * typecheck/test are gating (a failure blocks `allPass`); lint is advisory (it never + * gates) but its warnings are still surfaced here so the agent can fix style — visible + * to the agent is decoupled from gates-allPass. Advisory warnings ride along as a + * separate, clearly-labeled section. */ function nextPrompt(report: RunArtifact['checks']): string { const fails: string[] = [] - for (const layer of ['typecheck', 'test', 'lint'] as const) { + const advisories: string[] = [] + for (const layer of ['typecheck', 'test'] as const) { const c = layerOutput(report, layer) if (!c.passed) fails.push(`${layer} failed:\n${c.output.slice(0, 1200)}`) } - return `Your solution did not pass these checks. Fix the file and try again.\n\n${fails.join('\n\n')}` + // lint is advisory: report its warnings (not "clean") without treating them as a + // gating failure, so style issues can actually be refined. + const lint = layerOutput(report, 'lint') + if (!lint.clean && lint.output) advisories.push(`lint warnings:\n${lint.output.slice(0, 1200)}`) + + const sections = [`Your solution did not pass these checks. 
Fix the file and try again.`] + if (fails.length > 0) sections.push(fails.join('\n\n')) + if (advisories.length > 0) + sections.push(`Advisory (does not block, but improve if you can):\n${advisories.join('\n\n')}`) + return sections.join('\n\n') } /** @@ -117,13 +138,20 @@ export function codingDispatch( // Deterministic checks, IN THE BOX, this round. These (and only these) steer // the next round — the firewall keeps the rubric/realness out of the loop. - checks = await runChecks(run.box as unknown as CheckBox, scenario, cmds) + // `run.box` is a `SandboxInstance`; `CheckBox` is the minimal `exec`(+optional + // `fs.write`) subset the checks actually use — a structural narrowing, no widening. + checks = await runChecks(run.box as CheckBox, scenario, cmds) if (checks.allPass) break // stop on worker-observable green only } // The realness anchor runs AFTER the loop — never inside it, so it can never - // steer the agent. Its verdict is recorded for honesty AND gates the judge. - const realness = realnessGate(files, scenario.realnessSignals) + // steer the agent. Its verdict is recorded for honesty AND gates the judge. The + // seeded fixture is passed as a non-scored REFERENCE so the scan sees the + // solution IS imported (no spurious DEAD_ARTIFACT on a real solution); a cheat + // still gates regardless of what references it. + const realness = realnessGate(files, scenario.realnessSignals, [ + { path: scenario.fixture.path, content: scenario.fixture.content }, + ]) await ctx.artifacts.writeJson(`realness/${ctx.cellId}.json`, realness) return { files, solution, finalText, checks, realness } @@ -152,20 +180,22 @@ function blankReport(): RunArtifact['checks'] { } } -/** Pull the agent's text out of a stream event (best-effort, for judge context). */ -function eventText(ev: unknown): string { +/** Pull the agent's text out of a stream event (best-effort, for judge context). The + * text payload isn't on `SandboxEvent`'s typed surface, so we read `data` defensively. 
*/ +function eventText(ev: SandboxEvent): string { const e = ev as { data?: { finalText?: string; text?: string; delta?: string } } return e.data?.finalText ?? e.data?.text ?? e.data?.delta ?? '' } /** Sum token usage across the turn's events into the `{ input, output }` shape * `ctx.cost.observeTokens` expects, using the runtime's own metering extractor so - * EVERY backend event shape (`done`/`result`/`llm_call`/`usage`) is counted. */ -function sumTokens(events: unknown[]): { input: number; output: number } { + * EVERY backend event shape (`done`/`result`/`llm_call`/`usage`) is counted. + * `events` is the turn's real `SandboxEvent[]` — `extractLlmCallEvent` takes it directly. */ +function sumTokens(events: SandboxEvent[]): { input: number; output: number } { let input = 0 let output = 0 for (const ev of events) { - const call = extractLlmCallEvent(ev as never, 'agent') + const call = extractLlmCallEvent(ev, 'agent') if (call) { input += call.tokensIn ?? 0 output += call.tokensOut ?? 0 diff --git a/examples/coding-benchmark/eval.ts b/examples/coding-benchmark/eval.ts index 7899212..869cdc9 100644 --- a/examples/coding-benchmark/eval.ts +++ b/examples/coding-benchmark/eval.ts @@ -21,7 +21,6 @@ import { type ChatClient, ensembleJudge, type Layer, - llmJudge, MultiLayerVerifier, type VerificationReport, } from '@tangle-network/agent-eval' @@ -31,7 +30,10 @@ import { type ProducedFile, scoreAuthenticity, } from '@tangle-network/agent-eval/authenticity' -import type { JudgeConfig, JudgeScore } from '@tangle-network/agent-eval/campaign' +// `llmJudge` is imported from the `/campaign` subpath, not the main index: it is +// exported from `/campaign` across the entire declared peer range (>=0.97), whereas the +// main-index re-export is newer — so a consumer pinned to the peer floor still compiles. 
+import { type JudgeConfig, type JudgeScore, llmJudge } from '@tangle-network/agent-eval/campaign' import type { CodingScenario, Fixture } from './scenarios' // ── the rubric (4 weighted dimensions, total 1.0) ───────────────────────────── @@ -91,18 +93,28 @@ export interface RealnessVerdict { // ── layer 1: the deterministic check pipeline ───────────────────────────────── /** The minimal box surface the checks need — a subset of the real `SandboxInstance`. - * The live sandbox satisfies it; the offline in-process box implements it too. */ + * The live sandbox satisfies it; the offline in-process box implements it too. `fs.write` + * is the structured write seam (both boxes expose it); we prefer it over a shell write so + * seeding never interpolates a path into a command string. */ export interface CheckBox { exec(command: string): Promise<{ exitCode: number; stdout: string; stderr: string }> + fs?: { write(path: string, content: string): Promise } } -/** Seed an eval-only file into the box via `exec` (base64 → file). Works on the - * `exec`-only surface, offline and live. The fixture's CONTENT is never described - * to the agent — this is write-only scaffold, not part of the prompt (the firewall). */ +/** Seed an eval-only file into the box. Prefers the structured `fs.write` seam so the + * fixture path/content is never interpolated into a shell command (no injection + * surface for partners who later load scenario paths from config). Falls back to a + * base64 shell write with SINGLE-QUOTED path words on a box that only exposes `exec`. + * The fixture's CONTENT is never described to the agent — this is write-only scaffold, + * not part of the prompt (the firewall). */ async function seedFile(box: CheckBox, file: Fixture): Promise { + if (box.fs) { + await box.fs.write(file.path, file.content) + return + } const b64 = Buffer.from(file.content, 'utf8').toString('base64') const dir = file.path.includes('/') ? file.path.slice(0, file.path.lastIndexOf('/')) : '.' 
- await box.exec(`mkdir -p ${dir} && printf %s '${b64}' | base64 -d > ${file.path}`) + await box.exec(`mkdir -p '${dir}' && printf %s '${b64}' | base64 -d > '${file.path}'`) } /** One check command → a `Layer`. Pass/fail comes from the exit code. `advisory` @@ -175,14 +187,18 @@ export async function runChecks( return verifier.run({ env: box, overallCapMs: 120_000 }) } -/** Pull one check layer's captured output (for the refine prompt). */ +/** Pull one check layer's captured output (for the refine prompt). `passed` is the + * gating status (advisory layers always report `pass`); `clean` is the layer's real + * cleanliness (score === 1) — so the refine prompt can surface advisory lint warnings + * (clean === false) without those warnings gating `allPass`. */ export function layerOutput( report: VerificationReport, layer: string, -): { passed: boolean; output: string } { +): { passed: boolean; clean: boolean; output: string } { const r = report.layers.find((l) => l.layer === layer) return { passed: r?.status === 'pass', + clean: r ? r.score === 1 : false, output: typeof r?.detail?.output === 'string' ? r.detail.output : '', } } @@ -194,9 +210,19 @@ export function layerOutput( * (required artifact present? hard part implemented? or a fake shim?), and * `gateRealness` caps anything that faked or omitted the required artifact. The * verdict is recorded AND read by the judge — a gated artifact cannot earn a score. + * + * `reference` files (e.g. the seeded test fixture) are passed to the scan as non-scored + * context: they let `scoreAuthenticity` observe that the required artifact IS imported, + * so a real solution does not get a spurious `DEAD_ARTIFACT` flag just because the + * dispatch scores the solution file in isolation. A reference cannot rescue a cheat — + * the gate still fires on `fakeShim && !realImpl` regardless of what imports it. 
*/ -export function realnessGate(files: ProducedFile[], signals: AuthenticitySignals): RealnessVerdict { - const result = scoreAuthenticity(files, signals) +export function realnessGate( + files: ProducedFile[], + signals: AuthenticitySignals, + reference: ProducedFile[] = [], +): RealnessVerdict { + const result = scoreAuthenticity([...files, ...reference], signals) const gate = gateRealness(result, { requireArtifact: true }) const flags = result.flags.length > 0 ? ` — flags: ${result.flags.join(', ')}` : '' return { @@ -237,13 +263,9 @@ function renderForJudge(artifact: RunArtifact, scenario: CodingScenario): string * against the rubric and reduces it to a canonical `{ dimensions, composite, notes }`. * We wrap it so a realness-gated artifact short-circuits to composite 0 WITHOUT a * model call — the realness gate genuinely gates the judge. */ -export function singleCodeJudge( - chat: ChatClient, - model?: string, -): JudgeConfig { +export function singleCodeJudge(chat: ChatClient): JudgeConfig { const base = llmJudge('code-quality', judgePrompt, { chat, - ...(model ? { model } : {}), dimensions, weights, scale: 'unit', @@ -264,7 +286,13 @@ export function ensembleCodeJudge( const base = ensembleJudge({ name: 'code-quality-ensemble', dimensions: dimKeys, - models: ['deepseek-chat', 'gpt-4o-mini', 'gemini-flash'], + // Snapshot-dated, cross-family panel — the SAME reproducibility rule profiles.ts + // enforces on harness models (a bare alias isn't reproducible: "which gpt-4o-mini?"). 
+ models: [ + 'deepseek/deepseek-chat-2025-08-21', + 'openai/gpt-4o-mini-2024-07-18', + 'google/gemini-2.0-flash-2025-02-05', + ], crossFamily: true, weights, scoreWith: async (model, input) => { diff --git a/examples/coding-benchmark/offline-box.ts b/examples/coding-benchmark/offline-box.ts index 785bc53..fe4c55b 100644 --- a/examples/coding-benchmark/offline-box.ts +++ b/examples/coding-benchmark/offline-box.ts @@ -53,7 +53,9 @@ function instanceMethods(workdir: string, script: OfflineScript) { await mkdir(dirname(abs), { recursive: true }) await writeFile(abs, content, 'utf8') // The real sandbox terminal event shape: `done` with `data.tokenUsage` + - // top-level `totalCostUsd`. `extractLlmCallEvent` reads exactly this. + // top-level `totalCostUsd`. `extractLlmCallEvent` reads exactly this. The cast is + // structural: this is one member of the wide `SandboxEvent` union, written out + // literally; we don't reconstruct the whole union just to emit one done event. yield { type: 'done', data: { @@ -97,6 +99,10 @@ export function offlineSandboxClient(script: OfflineScript): SandboxClient { return { async create(_options?: CreateSandboxOptions): Promise { const workdir = mkdtempSync(join(tmpdir(), 'coding-bench-')) + // The offline box implements only the members `openSandboxRun` actually calls + // (streamPrompt / fs / exec / delete), not the full `SandboxInstance`. The cast is + // a deliberate subset-as-superset for the offline seam; the live path uses the + // real SDK client. We don't stub the ~40 unused members to satisfy the type. return instanceMethods(workdir, script) as unknown as SandboxInstance }, } diff --git a/examples/coding-benchmark/scenarios.ts b/examples/coding-benchmark/scenarios.ts index 338e32e..7719633 100644 --- a/examples/coding-benchmark/scenarios.ts +++ b/examples/coding-benchmark/scenarios.ts @@ -1,22 +1,27 @@ /** - * The held-out coding-task corpus — and the NO-CHEAT FIREWALL, expressed as a type. 
+ * The held-out coding-task corpus — and the GRADING-CRITERIA FIREWALL, expressed as + * a type. * - * Every scenario splits cleanly into two halves: - * - `prompt` — THE ONLY field the agent ever sees. The dispatch copies it - * (and nothing else) into the worker's context. - * - everything else — the deterministic test fixture, the realness signals, the - * rubric note — is EVAL-ONLY. It is read by eval.ts to score the - * result; the fixture is SEEDED into the box (so `node --test` - * has something to run) but its CONTENT is never described to the - * agent, and the rubric/realness signals are never written into - * the box at all. Because the two halves are different fields on - * one object, "the agent can read the answer key" becomes a thing - * you can SEE in one place: it would require dispatch.ts to put a - * non-`prompt` field into the profile. It does not. (See the - * `// FIREWALL` comment in dispatch.ts for the exact line.) + * Every scenario splits into three layers by where each field flows: + * - `prompt` — the only field copied into the agent's CONTEXT. The dispatch + * copies it (and next-round prompts built only from check output) + * into the worker; nothing else reaches the worker's context. + * - `fixture` — the deterministic test. It is SEEDED into the box workspace (so + * `node --test` has a file to run) and a multi-round agent with + * native file tools CAN read it — this is intentional, the same as + * real TDD: the test is a SPEC the agent is asked to satisfy, not + * a hidden rubric. Its assertions are never described in the + * prompt, but they are not hidden from the filesystem. + * - rubric/realness — the LLM-judge rubric note and the realness signals. These are + * never written into the box at all; eval.ts reads them AFTER the + * loop to score the result. THIS is what the firewall actually + * protects: the grading criteria the agent can't steer toward. 
* - * This is the structural defense the design calls for: the firewall is a property - * of which field flows where, not a runtime check you have to trust. + * The firewall is a property of which field flows where — you can SEE it in one place + * (it would require dispatch.ts to put a rubric/realness field into the profile, which + * it does not; see the `// FIREWALL` comment in dispatch.ts). The honest claim is the + * precise one: the rubric and realness signals never touch the box; the test fixture + * is deliberately visible to the agent. */ import type { AuthenticitySignals } from '@tangle-network/agent-eval/authenticity' @@ -68,16 +73,32 @@ const typecheckCmd = (path: string) => `tsc --noEmit --strict --skipLibCheck ${p * code — including constructor PARAMETER PROPERTIES (`constructor(private x: number)`), * the exact style the canonical token-bucket impl uses. Without the flag a CORRECT * solution would exit 1 and score as a test failure. The flag transforms (not just - * strips) the types so param properties run. */ + * strips) the types so param properties run. + * + * NODE FLOOR: `--experimental-transform-types` and `.ts`-import test execution need + * Node >= 22.6 (the package's `engines.node` floor covers the offline path, which + * degrades gracefully when the toolchain is absent, but the test LAYER itself — live + * or when copied — requires Node >= 22.6). On an older Node a correct solution would + * fail with no hint why. */ const testCmd = (fixturePath: string) => `node --experimental-transform-types --test ${fixturePath}` /** A lint shell command for one solution file. */ const lintCmd = (path: string) => `biome check ${path}` /** - * A 2-task corpus. Real benchmarks carry 20-50; two keeps the example readable. - * Both are self-contained "write one module that passes these checks" tasks — the + * A 3-task corpus. Real benchmarks carry 20-50; three keeps the example readable. 
+ * Each is a self-contained "write one module that passes these checks" task — the * shape that has a CORRECTABLE MIDDLE BAND (build-passes-but-quality-varies), which * is what makes a benchmark able to separate harnesses at all. + * + * The realness signals on each task are tuned so the NATURAL cheat gates, not just one + * strawman stub: a shim only reads as "real" when the actual hard-part work is present + * (refill math / quote-state tracking / capacity eviction), and the fake patterns catch + * the obvious shortcut regardless of decoy tokens (a `refill` param name, a stray + * `for (`, a passthrough `Map`). The smoke test asserts each natural cheat is gated. + * + * POWER CAVEAT: three scenarios is far below the n the significance machinery needs to + * separate harnesses — the paired tests demonstrate the WIRING, not a defensible claim. + * A real run wants 20-50 tasks. `renderStats` prints this caveat when n < 6. */ export const scenarios: CodingScenario[] = [ { @@ -119,12 +140,18 @@ test('rejects a second draw that exceeds the remaining bucket', () => { realnessSignals: { label: 'token-bucket', requiredArtifact: /rate-limiter\.ts$/, - // The hard part must be present: time-based refill math, not a hardcoded true. - realImpl: /Date\.now\(\)|performance\.now\(\)|elapsed|refill/, + // The hard part must be present: actual refill MATH — a clock read combined with + // refillPerSec, or refillPerSec used in an arithmetic expression. A bare `refill` + // identifier (e.g. a constructor param named `refillPerSec`) is NOT enough, so a + // hollow `return true` whose only `refill` is the param name does not read as real. + realImpl: + /(Date\.now\(\)|performance\.now\(\))[\s\S]*refillPerSec\s*[)*]|\*\s*(this\.)?refillPerSec/, realInfra: /class\s+RateLimiter/, - // The fake: a tryRemove whose ENTIRE body is `return true` (no refill math - // before it). 
Tightened so a real impl that legitimately ends in `return true` - // is NOT flagged — the shim is "returns true with no logic", not "returns true". + // The fake: a tryRemove whose body opens with `return true` (no refill math before + // it). A real impl that legitimately ENDS in `return true` after the math is not + // flagged — the shim is "returns true with no logic", not "returns true". Combined + // with the tightened realImpl above, the gate (fakeShim && !realImpl) now fires on + // a stub even when its constructor param is named `refillPerSec`. fakeShim: /tryRemove\([^)]*\)\s*:\s*boolean\s*{\s*return\s+true/, }, rubricNote: @@ -168,18 +195,88 @@ test('unescapes a doubled quote', () => { realnessSignals: { label: 'csv-rfc4180', requiredArtifact: /csv\.ts$/, - // Real parsers track quote state char-by-char; a naive split is the fake. - realImpl: /inQuotes|state|charAt|for\s*\(|while\s*\(/, + // Real parsers track quote state and walk the string char-by-char. We anchor to + // quote-state / per-char access (`inQuotes`, `charAt(`, `input[i]`), NOT a bare + // `for (` — a naive `for (line of input.split('\n'))` cheat has a loop but no + // quote state, so it must not read as a real impl. + realImpl: /inQuotes|charAt\(|input\[\s*i\s*\]|quote/i, realInfra: /function\s+parseCsv/, // The fake: splitting on comma or newline (naive parse) — the RFC-4180 cases - // (quoted comma, embedded newline) make `.split` wrong. Matches anywhere, not - // just line-end, so `input.split('\n').map(l => l.split(','))` is caught. + // (quoted comma, embedded newline) make `.split` wrong. Matches anywhere, so the + // naive `input.split('\n').map(l => l.split(','))` AND a `for (… input.split('\n'))` + // loop are both caught. Any such split is the shortcut, regardless of loops around it. 
fakeShim: /\.split\(\s*['"`](,|\\n)['"`]\s*\)/, }, rubricNote: 'Reward a single-pass state machine over naive splitting; correct handling of a quoted ' + 'field containing a comma, a literal newline, and an escaped quote.', }, + { + // The "only the real algorithm passes" task: a capacity-bounded LRU cache. There is + // no shortcut that satisfies the eviction tests — a bare `Map` (or `extends Map`) + // grows without bound and fails the at-capacity test, AND gates on realness. + id: 'lru-cache', + kind: 'coding', + tags: ['data-structures', 'eviction'], + prompt: [ + 'Implement a capacity-bounded LRU (least-recently-used) cache in TypeScript at', + '`src/lru.ts`. Export `class LruCache` with a constructor `(capacity: number)`,', + 'a `get(key: K): V | undefined`, and a `set(key: K, value: V): void`. On `set` past', + 'capacity, evict the least-recently-used entry; a `get` or a re-`set` counts as a use', + '(refreshes recency). No external dependencies.', + ].join(' '), + solutionPath: 'src/lru.ts', + fixture: { + path: 'test/lru.test.js', + content: `import { test } from 'node:test' +import assert from 'node:assert/strict' +import { LruCache } from '../src/lru.ts' + +test('evicts the least-recently-used entry at capacity', () => { + const c = new LruCache(2) + c.set('a', 1) + c.set('b', 2) + c.set('c', 3) + assert.equal(c.get('a'), undefined) + assert.equal(c.get('b'), 2) + assert.equal(c.get('c'), 3) +}) + +test('a get refreshes recency so the other key is evicted', () => { + const c = new LruCache(2) + c.set('a', 1) + c.set('b', 2) + assert.equal(c.get('a'), 1) + c.set('c', 3) + assert.equal(c.get('b'), undefined) + assert.equal(c.get('a'), 1) +}) + +test('returns undefined for a missing key', () => { + const c = new LruCache(2) + assert.equal(c.get('x'), undefined) +}) +`, + }, + realnessSignals: { + label: 'lru-cache', + requiredArtifact: /lru\.ts$/, + // The hard part is eviction: a delete that precedes a set (the recency move), the + // canonical 
`keys().next()` oldest-key eviction, or an explicit size>=capacity + // check. None of these appear in a no-eviction wrapper. + realImpl: + /\.delete\([^)]*\)[\s\S]*\.set\(|\.keys\(\)\.next\(\)|\.size\s*>=?\s*this\.capacity/, + realInfra: /class\s+LruCache/, + // The fake: a class that `extends Map` (no eviction override), or a `set` body that + // is a single passthrough `.set` with no delete/size logic — the bounded-cache + // shortcut that grows forever. + fakeShim: + /extends\s+Map\b|set\([^)]*\)[^{]*{\s*(this\.|return\s+)?\w+\.set\([^)]*\)\s*;?\s*}/, + }, + rubricNote: + 'Reward O(1) get/set with correct LRU eviction and recency refresh on read; an ' + + 'insertion-ordered Map with delete+re-set is the idiomatic dependency-free approach.', + }, ] /** The deterministic check commands for a scenario — derived from its paths, in the diff --git a/examples/coding-benchmark/stats.ts b/examples/coding-benchmark/stats.ts index e9528ee..b1f2d34 100644 --- a/examples/coding-benchmark/stats.ts +++ b/examples/coding-benchmark/stats.ts @@ -80,11 +80,21 @@ function byHarness(records: RunRecord[], nameOf: (id: string) => string): Map { const sums = new Map() for (const r of records) { - const id = r.scenarioId ?? '' + const id = r.scenarioId + if (!id) { + throw new Error( + `RunRecord (candidate ${r.candidateId ?? 'unknown'}) is missing scenarioId — ` + + 'cannot pair or average it. The matrix stamps scenarioId on every record; a ' + + 'missing one means an upstream bug, not something to silently merge.', + ) + } const acc = sums.get(id) ?? { total: 0, n: 0 } acc.total += score(r) acc.n += 1 @@ -190,5 +200,17 @@ export function renderStats(report: StatsReport): string { `p=${p.p.toFixed(3)} ${tag}`, ) } + // Power caveat: with a tiny scenario corpus the significance machinery is structurally + // underpowered — the Wilcoxon path returns p=1 for n<6 non-zero diffs, and the paired + // t-test has ~1 df. The tests show the WIRING; a real claim needs 20-50 tasks. 
+ const maxN = report.leaderboard.reduce((m, r) => Math.max(m, r.n), 0) + if (maxN < 6) { + lines.push('') + lines.push( + ` NOTE: n=${maxN} scenarios — below the power floor. The paired tests above cannot ` + + 'reach significance at this corpus size (they demonstrate the wiring). Use 20-50 ' + + 'tasks for a real harness comparison.', + ) + } return lines.join('\n') } From d5fa3a7f9fb701183512c948b3daeec59d201257 Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Wed, 24 Jun 2026 06:51:40 -0600 Subject: [PATCH 5/5] refactor(examples): coding-benchmark anti-cheat = held-out test execution, delete the realness regex gate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The realness regex gate could never prove code is real — it scans text, so a comment or dead code evades it. Replace it with held-out test execution (SWE-bench / HumanEval style): the agent develops against a few visible example tests, then is graded on a hidden suite it never saw and cannot hardcode. A real solution passes; a hardcode-the-visible cheat fails the held-out inputs. Deleted entirely: realnessGate, scoreAuthenticity / gateRealness usage, the gatedByRealness judge wrapper, realnessSignals (realImpl/fakeShim per scenario), the DEAD_ARTIFACT handling, and the AuthenticitySignals / ProducedFile imports. Built: per-scenario visibleTest (seeded during the turn) + heldoutTest (seeded ONLY at grading — the firewall). runHeldout copies the hidden suite into the box after the loop and runs `node --experimental-transform-types --test`; the held-out pass rate is the PRIMARY, ungameable correctness score. Composite = 0.7 * held-out + 0.3 * judge-quality (the LLM judge stays as a secondary code-quality signal; MultiLayerVerifier stays as advisory dev checks). stats: suppress the SIGNIFICANT tag below the power floor (n<6) and on a zero-variance pair, so small-n / no-variance never prints a bare SIGNIFICANT. 
Offline-proven: a hardcode-the-visible cheat scores held-out 2/4 -> composite 0.59; the real impl scores held-out 4/4 -> composite 0.94 (judge held at 0.80). Held-out tests are never seeded during the turn (firewall, asserted per scenario). README rewritten honestly; no realness/regex/authenticity claims. --- examples/README.md | 2 +- examples/coding-benchmark/README.md | 68 ++--- examples/coding-benchmark/benchmark.ts | 35 ++- .../coding-benchmark/coding-benchmark.test.ts | 256 ++++++++++------- examples/coding-benchmark/dispatch.ts | 67 +++-- examples/coding-benchmark/eval.ts | 250 ++++++++++------- examples/coding-benchmark/offline-box.ts | 16 +- examples/coding-benchmark/scenarios.ts | 262 +++++++++++------- examples/coding-benchmark/stats.ts | 26 +- 9 files changed, 589 insertions(+), 393 deletions(-) diff --git a/examples/README.md b/examples/README.md index 34931b0..0efd3bf 100644 --- a/examples/README.md +++ b/examples/README.md @@ -44,7 +44,7 @@ purpose — read [`driver-loop/`](./driver-loop/) for the contrast (a driver tha |---|---|---| | 8 | [`researcher-loop/`](./researcher-loop/) | You want the canonical `runLoop` + inline fanout driver, with a validator that hard-fails a namespace leak so the kernel prunes the bad candidate (needs the optional `@tangle-network/agent-knowledge` peer). | | 9 | [`ui-audit/`](./ui-audit/) | You want the smallest end-to-end `runLoop` over a real client (Playwright + stub judge), persisting findings. | -| 9b | [`coding-benchmark/`](./coding-benchmark/) | You want a scientifically-rigorous coding benchmark across harnesses: `runProfileMatrix` over harness × baseline-profile × scenario, a one-line tool knob (websearch / webfetch / MCP), validators-before-judge, a no-cheat firewall (the agent never sees the eval criteria), and paired-bootstrap + Wilson + BH stats (offline by default; `--live` for real harness boxes). 
| +| 9b | [`coding-benchmark/`](./coding-benchmark/) | You want a scientifically-rigorous coding benchmark across harnesses: `runProfileMatrix` over harness × baseline-profile × scenario, a one-line tool knob (websearch / webfetch / MCP), a held-out-test-execution anti-cheat (the agent is graded on hidden tests it never saw, so it can't hardcode), a secondary quality judge, and paired-bootstrap + Wilson + BH stats (offline by default; `--live` for real harness boxes). | ## Tier 3 — the production runtime, deeper diff --git a/examples/coding-benchmark/README.md b/examples/coding-benchmark/README.md index 01e3f23..7d8f152 100644 --- a/examples/coding-benchmark/README.md +++ b/examples/coding-benchmark/README.md @@ -1,10 +1,10 @@ # coding-benchmark -**Run the same coding task across coding agents — fairly, honestly, with real statistics — as thin composition over `agent-runtime` / `agent-eval` primitives.** The scorer, the stats, the verifier, and the realness gate are all substrate calls, not reimplemented. The glue this example owns is small and named (an in-process offline box, the per-round refine loop, the leaderboard render); the load-bearing scoring and statistics are not hand-rolled. +**Run the same coding task across coding agents — fairly, honestly, with real statistics — as thin composition over `agent-runtime` / `agent-eval` primitives.** The anti-cheat is **held-out test execution** (SWE-bench / HumanEval style): the agent develops against a few visible example tests, then is graded on a **hidden test suite it never saw and cannot hardcode**. A real solution passes; a cheat (memorize the visible examples, fake the hard part) fails. The verifier, the stats, and the judges are all substrate calls, not reimplemented. The glue this example owns is small and named (an in-process offline box, the per-round refine loop, the leaderboard render); the load-bearing scoring and statistics are not hand-rolled. ```bash # offline — no creds, no network. 
Runs the whole pipeline against an in-process box -# with a deterministic mock judge. +# with a deterministic mock judge. The held-out tests run for real (node --test). pnpm tsx examples/coding-benchmark/benchmark.ts # pick a tool surface, add the 3-model judge panel, run more reps @@ -23,30 +23,30 @@ One coding task, run across a **matrix** of three axes, scored, and compared wit | Axis | What varies | Where | |---|---|---| | **harness** | claude-code / opencode / codex / cli, each on its **baseline default profile** (no skills, no injected prompt — we measure the harness, not our scaffolding) | `profiles.ts` | -| **scenario** | the held-out coding tasks (a token-bucket rate limiter, an RFC-4180 CSV parser, an LRU cache whose only passing path is the real eviction algorithm) | `scenarios.ts` | +| **scenario** | the coding tasks (a token-bucket rate limiter, an RFC-4180 CSV parser, an LRU cache whose only passing path is the real eviction algorithm) — each carries a few **visible** example tests and a **held-out** grading suite | `scenarios.ts` | | **tool surface** | `none` / `web` / `search-mcp` — folded in as a one-line knob (`--tools`) | `profiles.ts` | -The agent gets up to **3 refine rounds** in **one persistent box**: round N+1's prompt is built from round N's *check failures* (and nothing else — see the firewall). It stops the moment the deterministic checks pass. +The agent gets up to **3 refine rounds** in **one persistent box**: round N+1's prompt is built from round N's *visible-test failures* (and nothing else — see the firewall). It stops the moment the dev checks pass. The output is a leaderboard with confidence bands and a significance matrix: ``` Harness leaderboard (mean composite, 95% CI; pass-rate, Wilson CI): - claude-code-baseline composite 0.813 [0.813, 0.813] pass 100% [44%, 100%] (n=3) + claude-code-baseline composite 0.944 [0.944, 0.944] pass 100% [44%, 100%] (n=3) ... 
Pairwise (paired delta + bootstrap CI; paired-test p, BH-corrected): - opencode-baseline − claude-code-baseline: Δ=0.000 [0.000, 0.000] p=1.000 n.s. + opencode-baseline − claude-code-baseline: Δ=0.000 [0.000, 0.000] p=1.000 n.s. (underpowered) - NOTE: n=3 scenarios — below the power floor. The paired tests above cannot reach - significance at this corpus size (they demonstrate the wiring). Use 20-50 tasks for - a real harness comparison. + NOTE: n=3 scenarios — below the power floor (6). The paired tests above cannot defensibly + reach significance at this corpus size, so the SIGNIFICANT tag is suppressed (they + demonstrate the wiring). Use 20-50 tasks for a real harness comparison. ``` -> **Offline, every harness writes the same scripted solution and is scored by the same deterministic mock judge, so all deltas are 0.000** — the honest no-variance result, not a bug. The whole pipeline (matrix, verifier, realness gate, judge wiring, stats, firewall) runs for real; only the agent and the judge model are stubbed offline. **Offline the `--ensemble` panel is degenerate too: all three cross-family models share the one mock transport and return the identical verdict — cross-family independence is a live-only property.** `--live` swaps in real harness boxes, a real judge model, and (with `--ensemble`) three genuinely independent models, and the harnesses separate. +> **Offline, every harness writes the same scripted solution and is scored by the same deterministic mock judge, so all deltas are 0.000** — the honest no-variance result, not a bug. The whole pipeline (matrix, verifier, held-out test execution, judge wiring, stats, firewall) runs for real; only the agent and the judge model are stubbed offline. 
**Offline the `--ensemble` panel is degenerate too: all three cross-family models share the one mock transport and return the identical verdict — cross-family independence is a live-only property.** `--live` swaps in real harness boxes, a real judge model, and (with `--ensemble`) three genuinely independent models, and the harnesses separate. ### The offline "agent" is a scripted stand-in -Offline there is no model, so each scenario's box writes a **canned solution** instead of calling a coding agent — a deterministic stand-in so the example runs with no creds. The scripts are honest: `rate-limiter` **improves across rounds** — round 0 is a genuinely hollow `return true` stub (no refill math, the constructor args unused) and round 1+ is the real token-bucket. The smoke test asserts the realness gate **gates that exact round-0 stub to composite 0** (the anti-cheat demo fires on the dispatch's own content, not a hand-built strawman). In the offline *run*, the refine loop then advances past round 0 to the real impl, so **no leaderboard cell ends up gated** — the gate-to-0 is proven against the dispatch's round-0 content, not produced as a gated row in the scored output. Offline the toolchain (`tsc` / `biome` / `node --test`) isn't on PATH, so the checks fail fast and all 3 rounds run — which is exactly when you want to see refinement. +Offline there is no model, so each scenario's box writes a **canned solution** instead of calling a coding agent — a deterministic stand-in so the example runs with no creds. The scripts are honest: `rate-limiter` **improves across rounds** — round 0 is a **hardcode-the-visible cheat** (it memorizes the visible example answers, no bucket math) and round 1+ is the real token-bucket. The smoke test runs both against the real held-out suite and asserts the cheat **passes the visible test but fails the held-out** (it never saw those inputs), while the real impl passes the held-out outright. 
Offline `node` is present, so the held-out execution is genuine; `tsc`/`biome` usually aren't, so the typecheck-gated dev checks never fully pass and all 3 rounds run — which is exactly when refinement shows. ## How a tool swap works (one line) @@ -60,26 +60,30 @@ withTools(profile, 'search-mcp') // mount a search MCP instead On the CLI it's `--tools none|web|search-mcp`. **Honesty caveat:** a preset only takes effect for a `(harness, lever)` pair the sandbox actually materializes — if a harness has no native `webfetch`, `--tools web` is a no-op *there*. That's a substrate fact, not something this example papers over. Check `@tangle-network/sandbox` for the materialization matrix before trusting a tool swap on a given harness. -## How it stays honest (the grading-criteria firewall) +## The anti-cheat: held-out test execution (the firewall) -**The LLM-judge rubric, the grading note, and the realness signals never reach the box** — so the agent cannot steer toward the criteria it is scored on. The test fixture is a different case, and the example is precise about it: the test is *seeded* into the box and a multi-round agent **can** read it, exactly as in real TDD. +**The agent cannot game tests it never saw.** That is the whole anti-cheat, and it is *execution truth*, not a text scan: -- A `CodingScenario` (`scenarios.ts`) splits by where each field flows: `prompt` (the only field copied into the agent's **context**), the `fixture` (the deterministic test — **seeded into the workspace**, so `node --test` has a file to run), and the rubric note + realness signals (read **after** the loop by `eval.ts`, **never written into the box**). -- The firewall is one labeled block in **`dispatch.ts`** (`THE GRADING-CRITERIA FIREWALL LIVES HERE`): the only thing the agent's context gets is `scenario.prompt`, plus next-round prompts built **only** from check pass/fail + output. 
Because the criteria are different fields that the dispatch never writes into the profile, you can **see in one place** that the rubric/realness can't reach the agent. -- **What this protects, precisely:** the agent cannot read the LLM-judge rubric or the realness signals (the metric it would otherwise game). It **can** read the seeded test fixture — that is intentional. The test is a *spec the agent is asked to satisfy*, not a hidden answer key; a coding harness has native file-read tools, and across the 3 refine rounds the agent (and the next-round prompt, which includes the test runner's failure output) sees the assertions. That is the same contract as TDD and is honest for a benchmark: the protected secret is the *grading rubric*, not the tests. -- The realness gate runs **after** the loop and is recorded on the run — the agent can't steer toward a metric it can't read. +- Each scenario carries two test files. The **visible** test (a few example cases) is *seeded into the box during the turn* — the agent develops against it, exactly like real TDD. The **held-out** test (the same behavior, with **different inputs and extra edge cases** the visible examples don't cover) is **never seeded during the turn**. +- During the turn, the box has only: the task prompt + the visible example test. The held-out suite never enters the box while the agent is working — **that is the firewall**. +- At grading (after the refine loop), the harness copies the held-out suite into the box and runs it (`node --experimental-transform-types --test`). The **held-out pass rate is the PRIMARY, ungameable correctness score.** +- A solution that hardcoded the visible examples' exact values passes the visible test but **fails the held-out inputs** (e.g. the rate-limiter held-out uses capacities `7/6/5/2`, not the visible `10/3/10`). A solution that faked the hard part fails them too. Only real behavior passes both. 
-## How it scores (validators before judge) +You can **see the firewall in one place** in `dispatch.ts` (`THE FIREWALL LIVES HERE`): the only thing the agent's context gets is `scenario.prompt`, the only test seeded during the loop is `scenario.visibleTest`, and `runHeldout` (the held-out seed + run) is called *after* the loop closes. The LLM-judge rubric note is read later by `eval.ts` and is never written into the box. + +## How it scores (held-out correctness first, judge second) Scoring runs in strict order, cheapest and most objective first — an `agent-eval` primitive at each layer: -1. **Deterministic checks (first, in the box, ~$0).** An ordered **`MultiLayerVerifier`** pipeline: `typecheck → test → lint`, with dependency-based skip (test never runs on a type error) and a blended score. typecheck + test gate `allPass` (and the refine loop); lint is advisory. These pass/fail booleans are the only thing that steers the next round. The test layer runs `node --experimental-transform-types --test`, not plain `node --test`: the fixture imports the solution as a `.ts` file, and Node's default type-*stripping* throws on constructor parameter properties (`constructor(private x: number)`) — the exact style the canonical impl uses — so a correct solution would otherwise score as a test failure. (`eval.ts` · `runChecks`) -2. **Realness gate (no LLM, and it GATES).** `scoreAuthenticity` + `gateRealness` — a pure structural scan that catches the stub shapes each task's signals encode. It is not just recorded: a **gated** artifact short-circuits the judge to composite **0 with no model call**. 
The gate fires on `fakeShim && !realImpl`, so each task's `realImpl` is anchored to the actual hard-part work (refill *math*, quote-state tracking, capacity eviction) and its `fakeShim` to the natural shortcut — tuned so the **natural cheat gates, not just one strawman**: a `return true` rate-limiter whose only "refill" is a constructor param name, a `for (… input.split('\n'))` CSV split, and a no-eviction `Map` wrapper all gate. It is **not** a general "any fake is caught" guarantee — it catches the specific shapes listed (the smoke test asserts each natural cheat is gated, on the dispatch's own content). On the sample tasks a real impl scores realness **85** and each cheat is **gated → composite 0**. (`eval.ts` · `realnessGate`, `scenarios.ts` · `realnessSignals`) -3. **LLM judge (last, only on the band the checks can't resolve).** A 4-dimension weighted rubric — correctness 0.40 · completeness 0.25 · code_quality 0.20 · robustness 0.15. The rubric text + anchors live **with the judge**, never in the workdir. (`eval.ts`) +1. **Dev checks (first, in the box, ~$0, advisory for the grade).** An ordered **`MultiLayerVerifier`** pipeline: `typecheck → test(visible) → lint`, with dependency-based skip (test never runs on a type error) and a blended score. These pass/fail booleans are the only thing that steers the next refine round — they tell the agent it's on track, but passing the visible examples does **not** prove correctness. The test layer runs `node --experimental-transform-types --test`, not plain `node --test`: the test imports the solution as a `.ts` file, and Node's default type-*stripping* throws on constructor parameter properties (`constructor(private x: number)`) — the exact style the canonical impl uses — so a correct solution would otherwise score as a test failure. (`eval.ts` · `runChecks`) +2. 
**Held-out test execution (the PRIMARY anti-cheat).** After the loop, the hidden suite is seeded and run in the same box; the **held-out pass rate** is the primary correctness number. It is ungameable: the agent never saw these inputs, so a hardcode-the-visible cheat or a faked impl fails. (`eval.ts` · `runHeldout`, `scenarios.ts` · `heldoutTest`) +3. **LLM judge (last, SECONDARY quality signal).** A 4-dimension weighted rubric — correctness 0.40 · completeness 0.25 · code_quality 0.20 · robustness 0.15. The rubric text + anchors live **with the judge**, never in the workdir. The judge scores code *quality*; it does not decide correctness. (`eval.ts`) + +**The composite** the leaderboard ranks on is **`0.7 · held-out-pass-rate + 0.3 · judge-quality`** — held-out correctness is load-bearing, the judge is a tie-breaker on style. On the rate-limiter task the round-0 hardcode-the-visible cheat scores held-out 2/4 → composite **0.59**; the real token-bucket scores held-out 4/4 → composite **0.94** (with the judge held equal at 0.80). (`eval.ts` · `composeScore`) **How many judges:** - **Default: 1** — `singleCodeJudge`, built from `llmJudge` (one model call). Cheap, for the leaderboard sweep. -- **`--ensemble`: 3** — `ensembleCodeJudge`, built from `ensembleJudge`, three **cross-family**, snapshot-dated models (deepseek + openai + google). `crossFamily: true` rejects a same-family panel at construction, so the ensemble is genuinely independent **live**. The panel sees the **same full context** (code + check results + rubric note) the single judge does. Use it only for a ship/no-ship claim. (Offline, all three share the mock transport — see the offline note above.) +- **`--ensemble`: 3** — `ensembleCodeJudge`, built from `ensembleJudge`, three **cross-family**, snapshot-dated models (deepseek + openai + google). `crossFamily: true` rejects a same-family panel at construction, so the ensemble is genuinely independent **live**. 
The panel sees the **same full context** (code + check results + held-out pass rate + rubric note) the single judge does. Use it only for a ship/no-ship claim. (Offline, all three share the mock transport — see the offline note above.) ## How the stats are real (`stats.ts`) @@ -91,7 +95,7 @@ Every number is one `agent-eval` primitive call — **no hand-rolled statistics - **Reps don't fake independent n — anywhere.** The paired unit is the *scenario*, and **the leaderboard uses the same unit**: with `--reps > 1`, a harness produces several records per scenario, so BOTH the leaderboard CI/Wilson AND the pairing collapse reps to **one mean per (harness, scenario)** before computing anything. Reps tighten the per-cell *estimate*; they are not independent samples, so they never narrow the interval out of zero new information. The reported `n` is the number of distinct scenarios, not the record count. (A regression test asserts identical reps leave the CI unchanged.) - A record missing its `scenarioId` is a **loud throw**, not a silent merge — averaging distinct scenarios into one `''` bucket would corrupt the pairing, so it fails fast instead. -> **Power caveat.** The example corpus is **3 tasks** — far below what these tests need to separate harnesses. The Wilcoxon path returns `p=1` for fewer than 6 non-zero diffs, and the paired t-test has ~1 degree of freedom, so at this corpus size the significance machinery is structurally **non-significant**; it demonstrates the *wiring*, not a defensible claim. `renderStats` prints this caveat whenever `n < 6`. A real harness comparison wants **20-50 tasks**. +> **Power caveat.** The example corpus is **3 tasks** — far below what these tests need to separate harnesses. The Wilcoxon path returns `p=1` for fewer than 6 non-zero diffs, and the paired t-test has only 2 degrees of freedom (n − 1 with n = 3 scenarios).
Below the power floor (`n < 6`) `renderStats` **suppresses the `SIGNIFICANT` tag entirely** (a near-constant gap on a few scenarios can return `p<0.05` and still mean nothing — the small-n mirage), and a zero-variance pair (a collapsed bootstrap CI) never reads as a real effect either. At this corpus size the example demonstrates the *wiring*, not a defensible claim. A real harness comparison wants **20-50 tasks**. The leaderboard labels are the readable harness names, not the matrix's internal profile hashes. @@ -99,22 +103,22 @@ The leaderboard labels are the readable harness names, not the matrix's internal | File | What it owns | |---|---| -| `scenarios.ts` | the 3-task held-out corpus + the firewall-as-a-type (`prompt` vs rubric/realness vs the seeded fixture) + the seeded test fixtures + the check commands + the realness signals (tuned so the natural cheat gates) | +| `scenarios.ts` | the 3-task corpus + the firewall-as-a-type (`prompt` vs `visibleTest` vs the held-out `heldoutTest` vs the judge rubric) + the seeded visible tests + the held-out grading suites + the check commands | | `profiles.ts` | the harness axis (one bare baseline `AgentProfile` per harness) **and** the one-line tool knob (`withTools` + presets) | -| `eval.ts` | the scoring stack: `runChecks` (`MultiLayerVerifier`) + `realnessGate` + `singleCodeJudge` (`llmJudge`) / `ensembleCodeJudge` (`ensembleJudge`) | -| `dispatch.ts` | renders one matrix cell: persistent box + multi-round refine + token metering. **The firewall lives here.** | +| `eval.ts` | the scoring stack: `runChecks` (`MultiLayerVerifier`) + `runHeldout` (the held-out execution) + `composeScore` (held-out × judge blend) + `singleCodeJudge` (`llmJudge`) / `ensembleCodeJudge` (`ensembleJudge`) | +| `dispatch.ts` | renders one matrix cell: persistent box + multi-round refine + held-out grading + token metering. 
**The firewall lives here.** | | `offline-box.ts` | an in-process `SandboxClient` so the whole thing runs with no creds | -| `stats.ts` | leaderboard + `pairedTTest` / `pairedBootstrap` / `benjaminiHochberg` / `confidenceInterval` / `wilson` | +| `stats.ts` | leaderboard + `pairedTTest` / `pairedBootstrap` / `benjaminiHochberg` / `confidenceInterval` / `wilson`, with the small-n SIGNIFICANT-suppression guard | | `benchmark.ts` | the entrypoint: build the axes, hand the matrix the dispatch + judges, run, print stats | -| `coding-benchmark.test.ts` | offline smoke — the matrix produces `harnesses × scenarios × reps` records; the realness gate gates the dispatch's round-0 stub AND each natural cheat (per task); reps don't narrow the CI | +| `coding-benchmark.test.ts` | offline smoke — the matrix produces `harnesses × scenarios × reps` records; a hardcode-the-visible cheat fails the held-out suite while the real solution passes (by execution); the held-out test is never seeded during the turn (firewall); reps don't narrow the CI | ## Primitives composed - **matrix:** `runProfileMatrix({ profiles, scenarios, dispatch, judges, reps, integrity, costCeiling })` (`@tangle-network/agent-eval/campaign`) with a `ProfileDispatchFn` rendering each cell - **box + multi-round:** `openSandboxRun(client, opts, deliverable)` → `.start()` / `.resume()` over one persistent, resumable session (`@tangle-network/agent-runtime/loops`) -- **deterministic layer:** `MultiLayerVerifier` — ordered `typecheck → test → lint` with dependency-based skip and a blended score (`@tangle-network/agent-eval`) +- **dev layer:** `MultiLayerVerifier` — ordered `typecheck → test → lint` with dependency-based skip and a blended score (`@tangle-network/agent-eval`) +- **held-out execution:** the hidden suite is seeded after the loop and run with `node --experimental-transform-types --test`; the pass rate is the primary score (`eval.ts` · `runHeldout`) - **token metering:** `extractLlmCallEvent` 
(`@tangle-network/agent-runtime/loops`) — reads usage off **every** backend event shape (`done` / `result` / `llm_call` / `usage`) so the integrity guard sees a real run -- **realness:** `scoreAuthenticity` + `gateRealness` (`@tangle-network/agent-eval/authenticity`) - **judges:** `llmJudge` (single model call → canonical `JudgeConfig`, imported from `@tangle-network/agent-eval/campaign` so it resolves across the whole peer range) and `ensembleJudge` for the cross-family panel (`@tangle-network/agent-eval`); the judge transport is a `ChatClient` (`createChatClient` — a `mock` handler offline, the `router` live) - **integrity:** `integrity: 'assert'` on the matrix proves a real backend ran (no stubbed cell) — `'off'` only for the offline mock - **stats:** `pairedTTest`, `wilcoxonSignedRank`, `pairedBootstrap`, `benjaminiHochberg`, `confidenceInterval`, `wilson` @@ -126,8 +130,8 @@ The leaderboard labels are the readable harness names, not the matrix's internal 1. **`TANGLE_API_KEY` + `SANDBOX_BASE_URL`** — the dispatch lazily `import()`s `@tangle-network/sandbox` (behind the live flag, so the offline path never needs the SDK) and creates a real harness box per cell. 2. **A real judge model** — the judge's `ChatClient` becomes `createChatClient({ transport: 'router', apiKey })`; set `JUDGE_MODEL` (and optionally `TANGLE_ROUTER_URL`) to point it at your router. `--ensemble` then calls three real cross-family models. 3. The matrix runs with `integrity: 'assert'`, so a cell that produced no real token usage fails loudly instead of reporting a clean stub leaderboard. -4. **The harness box image must provide the toolchain on `PATH`** — the deterministic checks invoke bare `tsc`, `biome`, and `node --experimental-transform-types`. The test layer needs **Node >= 22.6** (for `--experimental-transform-types` and `.ts`-import test execution); on an older Node a correct param-property solution would fail with no hint why. 
A missing **advisory** tool (`biome`) folds to 0.5 and doesn't gate; a missing **`tsc`** gates the cell — so sanity-check your box image before trusting a live leaderboard. (Offline, a missing tool reads as a fail-fast, which is the honest no-toolchain signal.) +4. **The harness box image must provide the toolchain on `PATH`** — the checks invoke bare `tsc`, `biome`, and `node --experimental-transform-types`. The test layer **and the held-out grading** need **Node >= 22.6** (for `--experimental-transform-types` and `.ts`-import test execution); on an older Node a correct param-property solution would fail with no hint why. A missing **advisory** tool (`biome`) folds to 0.5 and doesn't gate; a missing **`tsc`** fails the dev checks — so sanity-check your box image before trusting a live leaderboard. (Offline, `tsc`/`biome` are absent so the dev checks fail fast, but `node` is present so the held-out grading still runs for real.) -Everything else — the dispatch, the verifier, the realness gate, the stats — is identical between offline and live. That's the point: only the agent and the judge model change. +Everything else — the dispatch, the verifier, the held-out execution, the stats — is identical between offline and live. That's the point: only the agent and the judge model change. **Note on codex:** codex emits no structured tool calls, so per-tool progress is unavailable there. It still runs and scores; that's a harness property, not a gap in this example. diff --git a/examples/coding-benchmark/benchmark.ts b/examples/coding-benchmark/benchmark.ts index 2942d1f..5987220 100644 --- a/examples/coding-benchmark/benchmark.ts +++ b/examples/coding-benchmark/benchmark.ts @@ -73,24 +73,31 @@ function parseArgs(argv: string[]): BenchmarkOptions { // ── the offline "agent": a scripted, REFINING solution per scenario ─────────── // Offline we don't have a model, so each scenario's box writes a canned solution. 
-// `rate-limiter` IMPROVES across rounds (round 0 = a genuinely hollow `return true` -// stub the realness gate GATES to composite 0; round 1+ = the real token-bucket) — -// a real refine demo that fires the anti-cheat gate on the benchmark's OWN data. +// `rate-limiter` IMPROVES across rounds (round 0 = a HARDCODE-THE-VISIBLE cheat that +// only answers the visible example inputs; round 1+ = the real token-bucket). The cheat +// passes the visible tests but FAILS the held-out suite (different inputs it never saw) — +// the anti-cheat demo fires on the benchmark's OWN data, by execution, not a regex. // `csv-parser` writes its real implementation from round 0. export const offlineSolutions: Record = { 'rate-limiter': { path: 'src/rate-limiter.ts', solutionFor: (round) => round === 0 - ? // round 0 — a genuinely HOLLOW stub: it accepts the constructor args (so the - // hidden test instantiates it) but the body is pure cheat — `tryRemove` is a - // hardcoded `return true` with NO refill math and NO use of the args. The - // realness gate's `realImpl` signal does not fire (no `Date.now`/`refill`), - // `fakeShim` does, so gateRealness GATES it → composite 0. The param names are - // intentionally inert (no `refill` token) so the stub is hollow, not a real - // impl that merely "happens to return true". (Verified by the smoke test.) - `export class RateLimiter {\n constructor(_capacity: number, _ratePerSec: number) {}\n` + - ` tryRemove(n: number): boolean { return true }\n}\n` + ? // round 0 — a HARDCODE-THE-VISIBLE cheat: it replays the exact visible example + // calls (cap 10/3/10, the specific draws + their call order) and returns canned + // answers, with NO bucket math. It PASSES the visible tests but FAILS the + // held-out suite (cap 7/6/5/2, different draws + edge cases it never saw), + // caught by EXECUTION on inputs the cheat never memorized. 
+ `export class RateLimiter {\n` + + ` private cap: number\n private refill: number\n private call = 0\n` + + ` constructor(capacity: number, refillPerSec: number) { this.cap = capacity; this.refill = refillPerSec }\n` + + ` tryRemove(_n: number): boolean {\n` + + ` // hardcoded to the visible examples only — keyed on the exact (cap, refill)\n` + + ` // pairs the visible tests use; no real bucket math.\n` + + ` this.call++\n` + + ` if (this.cap === 3) return false // visible (3,1): draw 4 -> false\n` + + ` if (this.cap === 10 && this.refill === 0) return this.call === 1 // visible (10,0): T,F\n` + + ` return true // visible (10,1): T,T\n }\n}\n` : // round 1+ — the real token-bucket with continuous time-based refill. `export class RateLimiter {\n private tokens: number\n private last = Date.now()\n` + ` constructor(private capacity: number, private refillPerSec: number) { this.tokens = capacity }\n` + @@ -113,8 +120,8 @@ export const offlineSolutions: Record = { 'lru-cache': { path: 'src/lru.ts', // Writes the real insertion-ordered-Map LRU from round 0 (the eviction logic is the - // whole point; there is no honest hollow stub for this task). passes realness (85) - // and the hidden eviction tests. + // whole point; there is no honest hollow stub for this task). Passes both the visible + // and the held-out eviction suites. solutionFor: () => `export class LruCache {\n private map = new Map()\n` + ` constructor(private capacity: number) {}\n` + diff --git a/examples/coding-benchmark/coding-benchmark.test.ts b/examples/coding-benchmark/coding-benchmark.test.ts index 93b21d1..3034010 100644 --- a/examples/coding-benchmark/coding-benchmark.test.ts +++ b/examples/coding-benchmark/coding-benchmark.test.ts @@ -1,47 +1,99 @@ /** * Offline smoke test — proves the whole pipeline runs with no creds and that the - * load-bearing honesty claims hold: + * load-bearing honesty claims hold, BY EXECUTION (not a text scan): * 1. 
the matrix produces exactly `harnesses × scenarios × reps` records and a * defined leaderboard (the wiring is real, not a stub that returns nothing); - * 2. the realness gate gates the ACTUAL round-0 stub the dispatch writes (not a - * separate strawman) to composite 0 — the anti-cheat demo fires on the - * benchmark's own data, and passes the real refined impl; - * 3. the gate also fires on the NATURAL cheat for EVERY task (not just one stub - * shape) — the README's "catches the listed stub shapes" claim, ground-truthed; - * 4. a real impl scored WITH its seeded fixture as a reference carries no spurious - * DEAD_ARTIFACT flag (the realness scan sees the artifact is imported); - * 5. reps tighten the per-cell estimate HONESTLY — identical reps do NOT narrow - * the leaderboard CI vs reps=1 (reps are not independent samples). + * 2. THE ANTI-CHEAT, end-to-end: a hardcode-the-visible CHEAT passes the VISIBLE + * tests but FAILS the HELD-OUT suite (low pass rate) → LOW composite; the REAL + * solution PASSES the held-out suite → HIGH composite. Run for real against an + * in-process box (`node --test`), no creds. This is the whole point of the example. + * 3. the held-out test is NEVER seeded into the box during the agent turn — the + * firewall — only at grading; + * 4. reps tighten the per-cell estimate HONESTLY — identical reps do NOT narrow the + * leaderboard CI vs reps=1 (reps are not independent samples). 
*/ +import { exec as execCb } from 'node:child_process' +import { mkdtempSync, rmSync } from 'node:fs' +import { mkdir, writeFile } from 'node:fs/promises' +import { tmpdir } from 'node:os' +import { dirname, join } from 'node:path' +import { promisify } from 'node:util' import type { RunRecord } from '@tangle-network/agent-eval' -import type { ProducedFile } from '@tangle-network/agent-eval/authenticity' import { describe, expect, it } from 'vitest' import { main, offlineSolutions } from './benchmark' -import { realnessGate } from './eval' +import { type CheckBox, composeScore, runChecks, runHeldout } from './eval' import { harnessProfiles } from './profiles' -import { type CodingScenario, scenarios } from './scenarios' +import { type CodingScenario, checkCmds, scenarios } from './scenarios' import { pairwiseStats } from './stats' -/** The natural cheat per scenario — the shortcut a real agent would actually reach for, - * NOT a hand-built strawman: the gate must fire on each of these. */ -const naturalCheats: Record = { - // hollow body, but the constructor param is named `refillPerSec` (the prompt's own - // name) — the decoy that defeated a `/refill/` realImpl. - 'rate-limiter': - 'export class RateLimiter {\n constructor(_capacity: number, refillPerSec: number) {}\n tryRemove(n: number): boolean { return true }\n}\n', - // a for-loop is present, but it splits on newline+comma — the naive parse the - // RFC-4180 cases break. The loop must NOT read as a real impl. - 'csv-parser': - 'export function parseCsv(input: string): string[][] {\n const out: string[][] = []\n for (const line of input.split("\\n")) out.push(line.split(","))\n return out\n}\n', - // a Map wrapper that never evicts — grows without bound, fails the at-capacity test. 
- 'lru-cache': - 'export class LruCache {\n private store = new Map()\n constructor(_capacity: number) {}\n get(k: K) { return this.store.get(k) }\n set(k: K, v: V) { this.store.set(k, v) }\n}\n', +const execAsync = promisify(execCb) + +/** A real in-process `CheckBox` over a fresh temp dir — `fs.write` + `exec` only, the + * exact surface `runChecks` / `runHeldout` use. `node --test` runs for real here, so the + * held-out execution is genuine (no creds, no network). */ +function tempBox(): { box: CheckBox; dir: string } { + const dir = mkdtempSync(join(tmpdir(), 'coding-bench-test-')) + const box: CheckBox = { + fs: { + async write(path: string, content: string) { + const abs = join(dir, path) + await mkdir(dirname(abs), { recursive: true }) + await writeFile(abs, content, 'utf8') + }, + }, + async exec(command: string) { + try { + const { stdout, stderr } = await execAsync(command, { cwd: dir, timeout: 30_000 }) + return { exitCode: 0, stdout, stderr } + } catch (err) { + const e = err as { code?: number; stdout?: string; stderr?: string; message?: string } + return { + exitCode: e.code ?? 1, + stdout: e.stdout ?? '', + stderr: e.stderr ?? e.message ?? '', + } + } + }, + } + return { box, dir } +} + +/** Write a solution, run the VISIBLE example test + the HELD-OUT suite against it in one + * box, and return both pass rates. This is exactly what the dispatch does (minus the + * agent turn): seed the visible test during "the turn", then the held-out suite at + * grading. We run each test command directly (not the typecheck-gated `runChecks` + * pipeline) so the result reflects the TESTS, not the absence of `tsc` offline — the + * whole point is to compare visible-pass vs held-out-pass by execution. 
*/ +async function gradeSolution( + scenario: CodingScenario, + solution: string, +): Promise<{ visiblePassRate: number; heldoutPassRate: number; heldoutNotes: string }> { + const { box, dir } = tempBox() + try { + const cmds = checkCmds(scenario) + await box.fs?.write(scenario.solutionPath, solution) + // "During the turn": the visible example test is seeded + run. + const visible = await runHeldout( + box, + { ...scenario, heldoutTest: scenario.visibleTest }, + cmds.dev, + ) + // "At grading": the held-out suite is seeded + run (the real anti-cheat). + const heldout = await runHeldout(box, scenario, cmds.heldout) + return { + visiblePassRate: visible.passRate, + heldoutPassRate: heldout.passRate, + heldoutNotes: heldout.notes, + } + } finally { + rmSync(dir, { recursive: true, force: true }) + } } describe('coding-benchmark (offline)', () => { // Integration smoke: runs the real matrix end-to-end (real box.exec on the offline - // toolchain, all refine rounds since the checks can't pass without the toolchain). + // toolchain, all refine rounds since the dev checks can't pass without tsc). it('runs the full matrix and returns a defined leaderboard', async () => { const reps = 1 const summary = await main(['--reps', String(reps)]) @@ -49,96 +101,88 @@ describe('coding-benchmark (offline)', () => { expect(summary.leaderboard).toBe(harnessProfiles.length) }, 180_000) - it("gates the dispatch's OWN round-0 stub to composite 0 (the demo fires on real data)", () => { - const rl = scenarios.find((s) => s.id === 'rate-limiter') - expect(rl).toBeDefined() + // THE ANTI-CHEAT, proven by execution: the round-0 hardcode-the-visible cheat PASSES + // the visible test but FAILS the held-out suite (it never saw those inputs), and the + // refined real impl PASSES the held-out suite. Composite ranks the real one far above. 
+ it('a hardcode-the-visible cheat FAILS the held-out tests; the real solution PASSES', async () => { + const rl = scenarios.find((s) => s.id === 'rate-limiter') as CodingScenario const script = offlineSolutions['rate-limiter'] expect(script).toBeDefined() - // The EXACT content the offline dispatch writes on round 0 — not a hand-built - // strawman. If a future edit makes this stub look real, this test goes red. - const round0 = (script as NonNullable).solutionFor(0) - const verdict = realnessGate( - [{ path: 'src/rate-limiter.ts', content: round0 }], - (rl as NonNullable).realnessSignals, - ) - expect(verdict.gated).toBe(true) - expect(verdict.score).toBe(0) - }) + const cheat = (script as NonNullable).solutionFor(0) // round-0 cheat + const real = (script as NonNullable).solutionFor(1) // refined real impl - it("passes the dispatch's refined round-1 token-bucket implementation", () => { - const rl = scenarios.find((s) => s.id === 'rate-limiter') - const script = offlineSolutions['rate-limiter'] - expect(script).toBeDefined() - const round1 = (script as NonNullable).solutionFor(1) - const verdict = realnessGate( - [{ path: 'src/rate-limiter.ts', content: round1 }], - (rl as NonNullable).realnessSignals, - ) - expect(verdict.gated).toBe(false) - expect(verdict.score).toBeGreaterThan(0) - }) + const cheatGrade = await gradeSolution(rl, cheat) + const realGrade = await gradeSolution(rl, real) - // The HIGH-severity claim: the gate fires on the NATURAL cheat for every task, not - // only on one strawman stub shape. Each cheat below is the realistic shortcut with a - // decoy token that defeated a looser realImpl — all must gate. 
- it.each(scenarios)('gates the natural cheat for $id', (scenario: CodingScenario) => { - const cheat = naturalCheats[scenario.id] - expect(cheat, `no natural-cheat fixture for ${scenario.id}`).toBeDefined() - const verdict = realnessGate( - [{ path: scenario.solutionPath, content: cheat as string }], - scenario.realnessSignals, - ) - expect(verdict.gated, `natural cheat for ${scenario.id} slipped past the gate`).toBe(true) - expect(verdict.score).toBe(0) - }) + // The cheat memorizes the visible example answers, so it passes the visible test... + expect( + cheatGrade.visiblePassRate, + `cheat should pass the visible test: ${cheatGrade.heldoutNotes}`, + ).toBe(1) + // ...but it FAILS the held-out suite (different inputs it never saw). + expect( + cheatGrade.heldoutPassRate, + `cheat should NOT fully pass held-out: ${cheatGrade.heldoutNotes}`, + ).toBeLessThan(1) - // The real offline solution for every task scores real (not gated). Confirms each - // scenario's tightened realImpl still accepts the genuine implementation. - it.each(scenarios)('passes the real offline solution for $id', (scenario: CodingScenario) => { - const script = offlineSolutions[scenario.id] - expect(script, `no offline solution for ${scenario.id}`).toBeDefined() - const content = (script as NonNullable).solutionFor(99) // settled round - const verdict = realnessGate( - [{ path: scenario.solutionPath, content }], - scenario.realnessSignals, - ) - expect(verdict.gated, `real solution for ${scenario.id} was wrongly gated`).toBe(false) - expect(verdict.score).toBeGreaterThan(0) - }) + // The real implementation passes the held-out suite outright. + expect( + realGrade.heldoutPassRate, + `real solution should pass held-out: ${realGrade.heldoutNotes}`, + ).toBe(1) - // The runtime DEAD_ARTIFACT fix: a real solution scored WITH its seeded fixture as a - // non-scored reference carries no DEAD_ARTIFACT flag (the scan sees it IS imported). 
+ // Composite ranks the real solution strictly above the cheat (held-out is primary). + // Hold the secondary judge score equal so the gap is purely the held-out term. + const judgeQuality = 0.8 + const cheatComposite = composeScore(cheatGrade.heldoutPassRate, judgeQuality) + const realComposite = composeScore(realGrade.heldoutPassRate, judgeQuality) + expect(realComposite).toBeGreaterThan(cheatComposite) + }, 60_000) + + // Every scenario's REAL offline solution passes its held-out suite (the suites are not + // accidentally impossible) — run for real against the in-process box. it.each( scenarios, - )('does not flag DEAD_ARTIFACT on the real $id solution', (scenario: CodingScenario) => { + )('the real offline solution passes the held-out suite for $id', async (scenario: CodingScenario) => { const script = offlineSolutions[scenario.id] - const content = (script as NonNullable).solutionFor(99) - const reference: ProducedFile[] = [ - { path: scenario.fixture.path, content: scenario.fixture.content }, - ] - const verdict = realnessGate( - [{ path: scenario.solutionPath, content }], - scenario.realnessSignals, - reference, - ) - expect(verdict.notes).not.toContain('DEAD_ARTIFACT') - }) + expect(script, `no offline solution for ${scenario.id}`).toBeDefined() + const solution = (script as NonNullable).solutionFor(99) // settled round + const grade = await gradeSolution(scenario, solution) + expect( + grade.heldoutPassRate, + `real ${scenario.id} failed held-out: ${grade.heldoutNotes}`, + ).toBe(1) + }, 60_000) - // A reference cannot rescue a cheat: the gate still fires with the fixture present. + // FIREWALL: the held-out test is never seeded into the box during the agent turn — + // only the visible test is. After running the dev checks (which seed the visible test), + // the held-out file must NOT exist in the box; it appears only after `runHeldout`. 
it.each( scenarios, - )('a fixture reference does not rescue the $id cheat', (scenario: CodingScenario) => { - const cheat = naturalCheats[scenario.id] as string - const reference: ProducedFile[] = [ - { path: scenario.fixture.path, content: scenario.fixture.content }, - ] - const verdict = realnessGate( - [{ path: scenario.solutionPath, content: cheat }], - scenario.realnessSignals, - reference, - ) - expect(verdict.gated).toBe(true) - }) + )('does NOT seed the held-out test during the turn for $id', async (scenario: CodingScenario) => { + const { box, dir } = tempBox() + try { + const cmds = checkCmds(scenario) + const script = offlineSolutions[scenario.id] as NonNullable<(typeof offlineSolutions)[string]> + await box.fs?.write(scenario.solutionPath, script.solutionFor(99)) + // "The turn": run the dev checks, which seed ONLY the visible test. + await runChecks(box, scenario, cmds) + // The visible test exists; the held-out test must NOT (the firewall). + const visibleExists = await box.exec(`test -f '${scenario.visibleTest.path}'`) + const heldoutExists = await box.exec(`test -f '${scenario.heldoutTest.path}'`) + expect(visibleExists.exitCode, 'visible test should be seeded during the turn').toBe(0) + expect( + heldoutExists.exitCode, + `held-out test for ${scenario.id} leaked into the box during the turn`, + ).not.toBe(0) + // Only AFTER grading does the held-out file appear. + await runHeldout(box, scenario, cmds.heldout) + const afterGrading = await box.exec(`test -f '${scenario.heldoutTest.path}'`) + expect(afterGrading.exitCode, 'held-out should be seeded at grading').toBe(0) + } finally { + rmSync(dir, { recursive: true, force: true }) + } + }, 60_000) it('reps do NOT fake independent n — identical reps leave the CI unchanged', () => { // Two harnesses, two scenarios, identical scores. 
Build records for reps=1 and diff --git a/examples/coding-benchmark/dispatch.ts b/examples/coding-benchmark/dispatch.ts index aaa0737..9ef243e 100644 --- a/examples/coding-benchmark/dispatch.ts +++ b/examples/coding-benchmark/dispatch.ts @@ -14,22 +14,24 @@ * guard sees a real run. * * ┌─────────────────────────────────────────────────────────────────────────┐ - * │ THE GRADING-CRITERIA FIREWALL LIVES HERE. │ + * │ THE FIREWALL LIVES HERE — and it is EXECUTION-BASED, not a text scan. │ * │ The ONLY scenario field that reaches the agent's CONTEXT is │ * │ `scenario.prompt` (the `taskToPrompt` below, and `nextPrompt` built ONLY │ - * │ from check output). The LLM-judge rubric, the grading note, and the │ - * │ realness signals are read later by eval.ts — they are never written into │ - * │ the box, so the agent cannot steer toward the criteria it is scored on. │ + * │ from check output). The LLM-judge rubric note is read later by eval.ts — │ + * │ never written into the box. │ * │ │ - * │ The deterministic test fixture is a different case: it is SEEDED into the │ - * │ box workspace (so `node --test` has a file to run) and a multi-round │ - * │ agent with native file tools CAN read it — intentional, the same as real │ - * │ TDD. The test is a SPEC the agent is asked to satisfy, not a hidden │ - * │ rubric. So: the rubric/realness are firewalled; the test is visible. │ + * │ The VISIBLE example test is seeded into the box DURING the turn (so │ + * │ `node --test` has a file to run) and a multi-round agent with native file │ + * │ tools CAN read it — intentional, the same as real TDD. │ + * │ │ + * │ The HELD-OUT grading suite is NEVER seeded during the turn. It is copied │ + * │ in ONLY at grading (after the loop, `runHeldout` below) and run; the │ + * │ score is its pass rate. The agent cannot game tests it never saw — a │ + * │ hardcode-the-visible cheat fails the held-out inputs. THAT is the │ + * │ anti-cheat: execution truth on hidden tests. 
│ * └─────────────────────────────────────────────────────────────────────────┘ */ -import type { ProducedFile } from '@tangle-network/agent-eval/authenticity' import type { DispatchContext, ProfileDispatchFn } from '@tangle-network/agent-eval/campaign' import type { AgentProfile } from '@tangle-network/agent-interface' import { @@ -39,7 +41,7 @@ import { type SandboxClient, } from '@tangle-network/agent-runtime/loops' import type { SandboxEvent } from '@tangle-network/sandbox' -import { type CheckBox, layerOutput, type RunArtifact, realnessGate, runChecks } from './eval' +import { type CheckBox, layerOutput, type RunArtifact, runChecks, runHeldout } from './eval' import { harnessOf, type ToolPreset, withTools } from './profiles' import { type CodingScenario, checkCmds } from './scenarios' @@ -47,9 +49,9 @@ import { type CodingScenario, checkCmds } from './scenarios' const maxRounds = 3 /** Build the next-round prompt from the checks the AGENT is allowed to see — the - * pass/fail + output of the deterministic layers. NEVER from the rubric, realness, - * or judge. This is the firewall in action: the agent steers on objective check - * failures, nothing else. + * pass/fail + output of the VISIBLE example tests. NEVER from the held-out suite, the + * rubric, or the judge. This is the firewall in action: the agent steers on the visible + * example failures, nothing else, and is GRADED on the held-out suite it never saw. * * typecheck/test are gating (a failure blocks `allPass`); lint is advisory (it never * gates) but its warnings are still surfaced here so the agent can fix style — visible @@ -104,30 +106,25 @@ export function codingDispatch( } // Read the produced solution file off the box after each turn (the deliverable). 
- const run = await openSandboxRun<{ solution: string; files: ProducedFile[] }>( + const run = await openSandboxRun<{ solution: string }>( clientFor(profile), { agentRun, signal: ctx.signal, runId: ctx.cellId, scenarioId: scenario.id }, { kind: 'artifact', path: scenario.solutionPath, - fromArtifact: (raw: string) => ({ - solution: raw, - files: [{ path: scenario.solutionPath, content: raw }], - }), + fromArtifact: (raw: string) => ({ solution: raw }), }, ) try { let checks = blankReport() let solution = '' - let files: ProducedFile[] = [] let finalText = '' for (let round = 0; round < maxRounds; round += 1) { const prompt = round === 0 ? scenario.prompt : nextPrompt(checks) const turn = round === 0 ? await run.start(prompt) : await run.resume(prompt) solution = turn.out.solution - files = turn.out.files finalText = turn.events.map(eventText).filter(Boolean).join(' ').slice(0, 2000) // Report usage so the integrity guard sees a real backend (not a stub). @@ -136,25 +133,23 @@ export function codingDispatch( const usage = sumTokens(turn.events) if (usage.input || usage.output) ctx.cost.observeTokens(usage) - // Deterministic checks, IN THE BOX, this round. These (and only these) steer - // the next round — the firewall keeps the rubric/realness out of the loop. - // `run.box` is a `SandboxInstance`; `CheckBox` is the minimal `exec`(+optional - // `fs.write`) subset the checks actually use — a structural narrowing, no widening. + // Dev checks (visible example tests), IN THE BOX, this round. These (and only + // these) steer the next round — the firewall keeps the held-out suite + rubric + // out of the loop. `run.box` is a `SandboxInstance`; `CheckBox` is the minimal + // `exec`(+optional `fs.write`) subset the checks use — a structural narrowing. 
checks = await runChecks(run.box as CheckBox, scenario, cmds) if (checks.allPass) break // stop on worker-observable green only } - // The realness anchor runs AFTER the loop — never inside it, so it can never - // steer the agent. Its verdict is recorded for honesty AND gates the judge. The - // seeded fixture is passed as a non-scored REFERENCE so the scan sees the - // solution IS imported (no spurious DEAD_ARTIFACT on a real solution); a cheat - // still gates regardless of what references it. - const realness = realnessGate(files, scenario.realnessSignals, [ - { path: scenario.fixture.path, content: scenario.fixture.content }, - ]) - await ctx.artifacts.writeJson(`realness/${ctx.cellId}.json`, realness) - - return { files, solution, finalText, checks, realness } + // HELD-OUT TEST EXECUTION — the anti-cheat. Runs AFTER the loop in the SAME box: + // the held-out suite (never seeded during the turn) is copied in and run against + // the agent's real solution. Its pass rate is the PRIMARY correctness score (the + // judge blends it as the recorded composite). A solution that hardcoded the visible + // examples fails the held-out inputs it never saw — execution truth, not a regex. + const heldout = await runHeldout(run.box as CheckBox, scenario, cmds.heldout) + await ctx.artifacts.writeJson(`heldout/${ctx.cellId}.json`, heldout) + + return { solution, finalText, checks, heldout } } finally { await run.close() } diff --git a/examples/coding-benchmark/eval.ts b/examples/coding-benchmark/eval.ts index 869cdc9..34cda1c 100644 --- a/examples/coding-benchmark/eval.ts +++ b/examples/coding-benchmark/eval.ts @@ -1,20 +1,28 @@ /** * The SCORING stack, in the order it runs — cheapest and most objective first. * - * 1. DETERMINISTIC CHECKS (in the box, ~$0) — an ordered `MultiLayerVerifier` - * pipeline: typecheck → test → lint, with dependency-based skip (test never - * runs on a type error) and a blended score. 
These pass/fail booleans steer the - * refine loop (see the firewall in dispatch.ts). - * 2. REALNESS GATE (no LLM) — `scoreAuthenticity` + `gateRealness`. Catches a stub - * that compiles but fakes the hard part. It does not just record a verdict — it - * GATES: a gated artifact short-circuits the judge to composite 0. - * 3. LLM JUDGE (last, only on the band the checks can't resolve) — one `llmJudge` - * model call for the leaderboard, or a cross-family `ensembleJudge` panel for a - * ship/no-ship claim. Both see the SAME full context (code + rubric + check - * results); the rubric anchors live HERE, never in the agent's workdir. + * 1. DEV CHECKS (in the box, ~$0) — an ordered `MultiLayerVerifier` pipeline: + * typecheck → test(visible) → lint, with dependency-based skip (test never runs on + * a type error) and a blended score. These pass/fail booleans steer the refine loop + * (see the firewall in dispatch.ts). They are ADVISORY for the final grade: passing + * the visible examples does not prove correctness, it just tells the agent it's on + * track. + * 2. HELD-OUT TEST EXECUTION (in the box, after the loop, ~$0) — the PRIMARY, + * ungameable correctness score. The hidden test suite (never seeded during the turn) + * is copied in and run with `node --experimental-transform-types --test`; the score + * is the held-out PASS RATE. A real solution passes; a cheat that hardcoded the + * visible examples or faked the hard part FAILS (it never saw these inputs). This is + * execution truth, not a text scan. + * 3. LLM JUDGE (last) — a SECONDARY code-QUALITY signal. One `llmJudge` model call for + * the leaderboard, or a cross-family `ensembleJudge` panel for a ship/no-ship claim. + * Both see the SAME full context (code + rubric + check results); the rubric anchors + * live HERE, never in the agent's workdir. + * + * Composite = held-out correctness (PRIMARY) + judge quality (secondary). 
The anti-cheat + * is the held-out execution — a hidden suite the agent never saw — not any text scan. * * Every layer is a published agent-eval primitive — `MultiLayerVerifier`, `llmJudge`, - * `ensembleJudge`, `scoreAuthenticity`/`gateRealness`. No hand-rolled scorer. + * `ensembleJudge`. No hand-rolled scorer. */ import { @@ -24,19 +32,19 @@ import { MultiLayerVerifier, type VerificationReport, } from '@tangle-network/agent-eval' -import { - type AuthenticitySignals, - gateRealness, - type ProducedFile, - scoreAuthenticity, -} from '@tangle-network/agent-eval/authenticity' // `llmJudge` is imported from the `/campaign` subpath, not the main index: it is // exported from `/campaign` across the entire declared peer range (>=0.97), whereas the // main-index re-export is newer — so a consumer pinned to the peer floor still compiles. import { type JudgeConfig, type JudgeScore, llmJudge } from '@tangle-network/agent-eval/campaign' -import type { CodingScenario, Fixture } from './scenarios' +import type { CodingScenario, TestFile } from './scenarios' + +// ── the composite weighting ─────────────────────────────────────────────────── +// Held-out correctness is the PRIMARY, ungameable score; the judge is a secondary +// quality signal. composite = heldoutWeight·heldout + judgeWeight·judge. +export const heldoutWeight = 0.7 +export const judgeWeight = 0.3 -// ── the rubric (4 weighted dimensions, total 1.0) ───────────────────────────── +// ── the judge rubric (4 weighted dimensions, total 1.0) ─────────────────────── // The rubric text + anchors live HERE, with the judge — never in the workdir. The // agent is graded against criteria it cannot read. 
export const rubric = { @@ -66,31 +74,32 @@ const weights = Object.fromEntries(dimKeys.map((k) => [k, rubric[k].weight])) as > const dimensions = dimKeys.map((k) => ({ key: k, description: rubric[k].description })) +// ── the held-out result ──────────────────────────────────────────────────────── +export interface HeldoutResult { + /** Held-out tests that passed. */ + passed: number + /** Total held-out tests run. */ + total: number + /** Pass rate (0..1) — the PRIMARY correctness score. 0 when the suite errored + * (typecheck failure, import failure, or no tests ran). */ + passRate: number + /** Captured runner output (record only). */ + notes: string +} + // ── the artifact the dispatch produces and the judges score ─────────────────── export interface RunArtifact { - /** Files the agent produced, as `{ path, content }` — the realness currency. */ - files: ProducedFile[] - /** The solution file's content (convenience; also present in `files`). */ + /** The solution file's content. */ solution: string /** The agent's final chat text for the round (judge context). */ finalText: string - /** The deterministic verifier report from the LAST round. */ + /** The deterministic dev-check report from the LAST round (visible tests). */ checks: VerificationReport - /** The realness gate verdict, computed AFTER the loop. Recorded for honesty AND - * read by the judge: a gated artifact short-circuits the judge to composite 0. */ - realness: RealnessVerdict + /** The held-out test execution result, run AFTER the loop. The PRIMARY score. */ + heldout: HeldoutResult } -export interface RealnessVerdict { - /** 0..1 deterministic realness (0 when gated). */ - score: number - /** True when the artifact faked or omitted the required deliverable. */ - gated: boolean - /** Human-readable flags + gate reason for the record. 
*/ - notes: string -} - -// ── layer 1: the deterministic check pipeline ───────────────────────────────── +// ── layer 1: the deterministic check pipeline (visible tests) ────────────────── /** The minimal box surface the checks need — a subset of the real `SandboxInstance`. * The live sandbox satisfies it; the offline in-process box implements it too. `fs.write` @@ -101,13 +110,12 @@ export interface CheckBox { fs?: { write(path: string, content: string): Promise } } -/** Seed an eval-only file into the box. Prefers the structured `fs.write` seam so the - * fixture path/content is never interpolated into a shell command (no injection - * surface for partners who later load scenario paths from config). Falls back to a - * base64 shell write with SINGLE-QUOTED path words on a box that only exposes `exec`. - * The fixture's CONTENT is never described to the agent — this is write-only scaffold, - * not part of the prompt (the firewall). */ -async function seedFile(box: CheckBox, file: Fixture): Promise { +/** Seed a test file into the box. Prefers the structured `fs.write` seam so the path/ + * content is never interpolated into a shell command (no injection surface for partners + * who later load scenario paths from config). Falls back to a base64 shell write with + * SINGLE-QUOTED path words on a box that only exposes `exec`. The file's CONTENT is never + * described to the agent in the prompt — this is write-only scaffold (the firewall). */ +async function seedFile(box: CheckBox, file: TestFile): Promise { if (box.fs) { await box.fs.write(file.path, file.content) return @@ -168,20 +176,21 @@ function checkLayer( } /** - * Run the scenario's deterministic checks in the box as an ordered pipeline. Seeds - * the hidden test first, then typecheck → test → lint. `report.allPass` is true only - * when typecheck AND test pass (lint is advisory). The `report.layers[*].detail.output` - * is what the refine loop reads to build the next prompt. 
+ * Run the scenario's dev checks in the box as an ordered pipeline. Seeds the VISIBLE + * example test first (the agent may read it, TDD-style), then typecheck → test → lint. + * `report.allPass` is true only when typecheck AND test pass (lint is advisory). The + * `report.layers[*].detail.output` is what the refine loop reads to build the next + * prompt. The HELD-OUT test is NOT seeded here — that is the firewall. */ export async function runChecks( box: CheckBox, scenario: CodingScenario, - cmds: { typecheck: string; test: string; lint: string }, + cmds: { typecheck: string; dev: string; lint: string }, ): Promise { - await seedFile(box, scenario.fixture) + await seedFile(box, scenario.visibleTest) const verifier = new MultiLayerVerifier([ checkLayer('typecheck', cmds.typecheck, {}), - checkLayer('test', cmds.test, { dependsOn: ['typecheck'] }), + checkLayer('test', cmds.dev, { dependsOn: ['typecheck'] }), checkLayer('lint', cmds.lint, { dependsOn: ['typecheck'], advisory: true }), ]) return verifier.run({ env: box, overallCapMs: 120_000 }) @@ -203,36 +212,64 @@ export function layerOutput( } } -// ── layer 2: the realness gate (no LLM) ─────────────────────────────────────── +// ── layer 2: held-out test execution (the PRIMARY anti-cheat) ────────────────── /** - * Deterministic realness scan. `scoreAuthenticity` is a pure structural scan - * (required artifact present? hard part implemented? or a fake shim?), and - * `gateRealness` caps anything that faked or omitted the required artifact. The - * verdict is recorded AND read by the judge — a gated artifact cannot earn a score. + * Seed the held-out suite into the box AFTER the loop and run it. The score is the + * held-out PASS RATE — the primary, ungameable correctness number. The agent never saw + * these tests during the turn (the firewall), so a solution that hardcoded the visible + * examples or faked the hard part fails them; only real behavior passes. * - * `reference` files (e.g. 
the seeded test fixture) are passed to the scan as non-scored - * context: they let `scoreAuthenticity` observe that the required artifact IS imported, - * so a real solution does not get a spurious `DEAD_ARTIFACT` flag just because the - * dispatch scores the solution file in isolation. A reference cannot rescue a cheat — - * the gate still fires on `fakeShim && !realImpl` regardless of what imports it. + * `node --test` prints a TAP-ish summary (`# tests N`, `# pass N`, `# fail N`). We parse + * those counts. A non-zero exit with no parseable counts (a typecheck/import error before + * any test ran) is a 0/0 → passRate 0 — the honest "did not even run" signal, never a + * spurious pass. This runs in the SAME box, so it sees the agent's real solution file. */ -export function realnessGate( - files: ProducedFile[], - signals: AuthenticitySignals, - reference: ProducedFile[] = [], -): RealnessVerdict { - const result = scoreAuthenticity([...files, ...reference], signals) - const gate = gateRealness(result, { requireArtifact: true }) - const flags = result.flags.length > 0 ? ` — flags: ${result.flags.join(', ')}` : '' +export async function runHeldout( + box: CheckBox, + scenario: CodingScenario, + heldoutCmd: string, +): Promise { + await seedFile(box, scenario.heldoutTest) + const r = await box.exec(heldoutCmd) + const output = `${r.stdout}\n${r.stderr}`.trim() + const counts = parseTestCounts(output) + // No parseable counts means the suite never ran (e.g. the solution didn't typecheck or + // import) — that is a 0 pass rate, the honest "did not even run" result. + const total = counts.total + const passed = counts.pass + const passRate = total > 0 ? passed / total : 0 return { - score: gate.gated ? 0 : result.realness / 100, - gated: gate.gated, - notes: `${gate.gated ? `GATED (${gate.reason ?? 'fake/missing artifact'})` : 'real'}${flags}`, + passed, + total, + passRate, + notes: + total > 0 + ? 
`held-out ${passed}/${total} pass` + : `held-out suite did not run (exit ${r.exitCode})`, } } -// ── layer 3: the LLM judge(s) ───────────────────────────────────────────────── +/** Parse `node --test`'s summary counts from its output. Reads the `tests`, `pass`, and + * `fail` summary lines, which `node --test` prefixes with either `ℹ` (its default + * reporter) or `#` (the TAP reporter) and may wrap in ANSI colour. We strip ANSI and + * accept both markers. When `tests` is absent we fall back to pass+fail. Returns + * {total:0,pass:0} when nothing parseable (the suite never ran) — never guesses a pass. */ +function parseTestCounts(output: string): { total: number; pass: number } { + // biome-ignore lint/suspicious/noControlCharactersInRegex: stripping terminal ANSI escapes + const clean = output.replace(/\[[0-9;]*m/g, '') + const read = (label: string): number | undefined => { + const m = clean.match(new RegExp(`(?:#|\\u2139)\\s*${label}\\s+(\\d+)`)) + return m ? Number(m[1]) : undefined + } + const pass = read('pass') ?? 0 + const fail = read('fail') ?? 0 + const tests = read('tests') + const total = tests ?? pass + fail + return { total, pass } +} + +// ── layer 3: the LLM judge(s) — SECONDARY quality signal ─────────────────────── /** The judge instructions — the rubric anchors, kept with the judge ONLY. */ const judgePrompt = [ @@ -241,15 +278,17 @@ const judgePrompt = [ ].join(' ') /** The full context every judge sees: the code + the deterministic check results + - * the eval-only rubric note. Shared by the single judge AND the ensemble so the - * panel never grades on less information than the leaderboard judge. */ + * the held-out pass rate + the eval-only rubric note. Shared by the single judge AND + * the ensemble so the panel never grades on less information than the leaderboard judge. 
*/ function renderForJudge(artifact: RunArtifact, scenario: CodingScenario): string { return [ `Task intent: ${scenario.prompt}`, `Grading note: ${scenario.rubricNote}`, - `Deterministic checks — typecheck:${layerOutput(artifact.checks, 'typecheck').passed} ` + - `test:${layerOutput(artifact.checks, 'test').passed} lint:${layerOutput(artifact.checks, 'lint').passed}`, - `Realness: ${artifact.realness.notes}`, + `Dev checks — typecheck:${layerOutput(artifact.checks, 'typecheck').passed} ` + + `visible-test:${layerOutput(artifact.checks, 'test').passed} ` + + `lint:${layerOutput(artifact.checks, 'lint').passed}`, + `Held-out correctness: ${artifact.heldout.passed}/${artifact.heldout.total} ` + + `(${(artifact.heldout.passRate * 100).toFixed(0)}%)`, '', 'Candidate solution:', '```ts', @@ -261,8 +300,9 @@ function renderForJudge(artifact: RunArtifact, scenario: CodingScenario): string /** ── ONE judge ────────────────────────────────────────────────────────────── * `llmJudge` builds a campaign `JudgeConfig` whose `score()` makes ONE model call * against the rubric and reduces it to a canonical `{ dimensions, composite, notes }`. - * We wrap it so a realness-gated artifact short-circuits to composite 0 WITHOUT a - * model call — the realness gate genuinely gates the judge. */ + * The judge's composite is the SECONDARY quality signal; we wrap it with `blendHeldout` + * so the composite the matrix RECORDS is the PRIMARY-weighted blend (held-out pass rate + * + judge quality). 
*/ export function singleCodeJudge(chat: ChatClient): JudgeConfig { const base = llmJudge('code-quality', judgePrompt, { chat, @@ -272,7 +312,7 @@ export function singleCodeJudge(chat: ChatClient): JudgeConfig s.kind === 'coding', renderUser: ({ artifact, scenario }) => renderForJudge(artifact, scenario), }) - return gatedByRealness(base) + return blendHeldout(base) } /** ── THREE judges ──────────────────────────────────────────────────────────── @@ -302,30 +342,50 @@ export function ensembleCodeJudge( return { model, perDimension } }, }) as JudgeConfig - return gatedByRealness(base) + return blendHeldout(base) } -/** Wrap a judge so a realness-gated artifact short-circuits to composite 0 with no - * model call. This is the gate ACTUALLY gating: a stub that faked the hard part - * cannot earn a judge score, however confident the model would have been. */ -function gatedByRealness( +// ── the composite: held-out correctness (PRIMARY) + judge quality (secondary) ── + +/** + * Blend the PRIMARY held-out pass rate with the SECONDARY judge composite into the final + * score the leaderboard ranks on. This is what makes held-out execution the load-bearing + * grade: a solution that fails the held-out suite is capped low no matter how the judge + * felt about its style, and a stylistically-mediocre but CORRECT solution still scores + * the bulk of the points. + */ +export function composeScore(heldoutPassRate: number, judgeComposite: number): number { + return heldoutWeight * heldoutPassRate + judgeWeight * judgeComposite +} + +/** Wrap a judge so the composite it REPORTS is the held-out-weighted blend. The judge + * still scores its quality dimensions (recorded, secondary), but the composite the + * matrix stamps as the run's score is `composeScore(heldoutPassRate, judgeComposite)` — + * so the leaderboard ranks on execution truth first, style second. 
The artifact is in + * scope at score time, so the held-out pass rate (computed before the judge runs) is + * read directly off it; no separate stats-side blend is needed. */ +function blendHeldout( judge: JudgeConfig, ): JudgeConfig { return { ...judge, - score(input: { + async score(input: { artifact: RunArtifact scenario: CodingScenario signal: AbortSignal - }): JudgeScore | Promise { - if (input.artifact.realness.gated) { - return { - dimensions: Object.fromEntries(dimKeys.map((k) => [k, 0])), - composite: 0, - notes: `realness-gated: ${input.artifact.realness.notes}`, - } + }): Promise { + const base = await judge.score(input) + const heldout = input.artifact.heldout + const composite = composeScore(heldout.passRate, base.composite) + return { + ...base, + composite, + notes: + `composite=${composite.toFixed(3)} ` + + `(held-out ${(heldout.passRate * 100).toFixed(0)}% × ${heldoutWeight} + ` + + `quality ${base.composite.toFixed(3)} × ${judgeWeight})` + + (base.notes ? ` — ${base.notes}` : ''), } - return judge.score(input) }, } } diff --git a/examples/coding-benchmark/offline-box.ts b/examples/coding-benchmark/offline-box.ts index fe4c55b..8956ba6 100644 --- a/examples/coding-benchmark/offline-box.ts +++ b/examples/coding-benchmark/offline-box.ts @@ -4,8 +4,8 @@ * * The offline "agent" is a SCRIPTED STAND-IN for a real coding agent: it writes a * canned solution per round instead of calling a model. That is the only thing - * stubbed — the matrix, the verifier, the realness gate, the judge wiring, and the - * stats all run for real. `--live` swaps this client for `new SandboxClient(...)` + * stubbed — the matrix, the verifier, the held-out test execution, the judge wiring, + * and the stats all run for real. `--live` swaps this client for `new SandboxClient(...)` * and the same dispatch runs each round in a real harness box. 
* * It implements only what `openSandboxRun` actually calls on a box: @@ -14,11 +14,13 @@ * SAME shape a live box emits, carrying `tokenUsage` so the run meters honestly * and `extractLlmCallEvent` reads it. * - `fs.read` / `fs.write` — over the temp workspace (the `artifact` deliverable + - * the seeded fixture live here). - * - `exec(cmd)` — runs the deterministic check + fixture-seed commands. Offline the - * toolchain (tsc / biome / node --test) usually isn't installed, so a missing tool - * reads as a FAIL — the honest offline signal, not a fake pass. (The checks never - * pass offline, so all `maxRounds` run — which is exactly when refinement shows.) + * the seeded test files live here). + * - `exec(cmd)` — runs the check + seed commands. Offline, `node` IS present so the + * test commands (`node --experimental-transform-types --test`) run for real — which + * is what lets the HELD-OUT execution genuinely grade the solution with no creds. But + * `tsc`/`biome` usually aren't installed, so the typecheck (and the test layer that + * `dependsOn` it) read as a FAIL — the honest offline signal. The dev checks never + * fully pass offline, so all `maxRounds` run, which is when refinement shows. * - `delete()` — tears the temp dir down. */ diff --git a/examples/coding-benchmark/scenarios.ts b/examples/coding-benchmark/scenarios.ts index 7719633..46698ce 100644 --- a/examples/coding-benchmark/scenarios.ts +++ b/examples/coding-benchmark/scenarios.ts @@ -2,34 +2,36 @@ * The held-out coding-task corpus — and the GRADING-CRITERIA FIREWALL, expressed as * a type. * - * Every scenario splits into three layers by where each field flows: - * - `prompt` — the only field copied into the agent's CONTEXT. The dispatch - * copies it (and next-round prompts built only from check output) - * into the worker; nothing else reaches the worker's context. - * - `fixture` — the deterministic test. 
It is SEEDED into the box workspace (so - * `node --test` has a file to run) and a multi-round agent with - * native file tools CAN read it — this is intentional, the same as - * real TDD: the test is a SPEC the agent is asked to satisfy, not - * a hidden rubric. Its assertions are never described in the - * prompt, but they are not hidden from the filesystem. - * - rubric/realness — the LLM-judge rubric note and the realness signals. These are - * never written into the box at all; eval.ts reads them AFTER the - * loop to score the result. THIS is what the firewall actually - * protects: the grading criteria the agent can't steer toward. + * Every scenario splits into four layers by where each field flows: + * - `prompt` — the only field copied into the agent's CONTEXT. The dispatch + * copies it (and next-round prompts built only from check output) + * into the worker; nothing else reaches the worker's context. + * - `visibleTest` — the example tests, SEEDED into the box workspace during the turn + * (so `node --test` has a file to run) and readable by a multi-round + * agent with native file tools — this is intentional, the same as + * real TDD: a few example cases the agent develops against. + * - `heldoutTest` — the HIDDEN grading suite. Same behavior, MORE cases and DIFFERENT + * inputs/edge cases the visible examples don't cover. It is NEVER + * seeded into the box during the turn — that is the anti-cheat + * firewall. At grading (after the loop) the harness copies it in and + * runs it; the score is the held-out pass rate. A solution that + * hardcoded the visible examples FAILS these; only real behavior + * passes. This is execution truth, not a text scan. + * - rubricNote — the LLM-judge rubric note. Never written into the box at all; + * eval.ts reads it AFTER the loop to score CODE QUALITY (secondary). 
* * The firewall is a property of which field flows where — you can SEE it in one place - * (it would require dispatch.ts to put a rubric/realness field into the profile, which - * it does not; see the `// FIREWALL` comment in dispatch.ts). The honest claim is the - * precise one: the rubric and realness signals never touch the box; the test fixture - * is deliberately visible to the agent. + * (dispatch.ts seeds `visibleTest` into the box but never `heldoutTest`; see the + * `// FIREWALL` comment there). The honest claim is the precise one: the held-out test + * suite and the rubric never touch the box during the turn; the visible example tests are + * deliberately readable by the agent. */ -import type { AuthenticitySignals } from '@tangle-network/agent-eval/authenticity' import type { Scenario } from '@tangle-network/agent-eval/campaign' -/** A file the harness seeds into the box workspace before the run — the test the - * deterministic check executes. EVAL-ONLY: its content is never shown to the agent. */ -export interface Fixture { +/** A test file the harness writes into the box. `visibleTest` is seeded DURING the turn + * (the agent may read it); `heldoutTest` is seeded ONLY at grading, after the turn. */ +export interface TestFile { path: string content: string } @@ -41,23 +43,22 @@ export interface CodingScenario extends Scenario { * This is the WHOLE of what reaches the worker's context. */ prompt: string - /** ── EVAL-ONLY (the agent never reads these) ──────────────────────────── */ - /** Path (relative to the workspace root) the agent is asked to produce. The * checks read this file off the box AFTER the turn; the judge scores it. */ solutionPath: string - /** The hidden test, seeded into the box so `node --test` has a real file to run. - * Seeded write-only — the agent is told WHAT to build (the prompt), never the - * assertions it is graded against. 
*/ - fixture: Fixture + /** ── DEVELOP-AGAINST (seeded during the turn, TDD-style) ───────────────── + * A few example tests, seeded into the box so the agent can run/read them. */ + visibleTest: TestFile - /** Realness anchor input for `scoreAuthenticity` — catches a stub that compiles - * but fakes the hard part. Write-only to the record; never reaches the box. */ - realnessSignals: AuthenticitySignals + /** ── GRADING-ONLY (the agent NEVER sees this during the turn) ──────────── + * The held-out suite — same behavior, different inputs + edge cases the visible + * examples don't cover. Seeded ONLY at grading; the held-out pass rate is the + * PRIMARY, ungameable correctness score. Catches a hardcode-the-visible cheat. */ + heldoutTest: TestFile /** Extra grading context for the JUDGE only (design intent, edge cases to - * reward). Lives with the judge, never in the workdir. */ + * reward). Lives with the judge, never in the workdir. Secondary signal. */ rubricNote: string } @@ -67,7 +68,7 @@ export interface CodingScenario extends Scenario { // non-zero exit (the honest offline signal), not a 20s network stall. /** A typecheck shell command for one solution file. */ const typecheckCmd = (path: string) => `tsc --noEmit --strict --skipLibCheck ${path}` -/** A `node --test` command for one fixture. The fixture imports the solution as a `.ts` +/** A `node --test` command for one test file. The test imports the solution as a `.ts` * file, so we run with `--experimental-transform-types`: Node's DEFAULT type-stripping * is strip-only and throws `ERR_UNSUPPORTED_TYPESCRIPT_SYNTAX` on TS that emits runtime * code — including constructor PARAMETER PROPERTIES (`constructor(private x: number)`), @@ -80,7 +81,7 @@ const typecheckCmd = (path: string) => `tsc --noEmit --strict --skipLibCheck ${p * degrades gracefully when the toolchain is absent, but the test LAYER itself — live * or when copied — requires Node >= 22.6). 
On an older Node a correct solution would * fail with no hint why. */ -const testCmd = (fixturePath: string) => `node --experimental-transform-types --test ${fixturePath}` +const testCmd = (testPath: string) => `node --experimental-transform-types --test ${testPath}` /** A lint shell command for one solution file. */ const lintCmd = (path: string) => `biome check ${path}` @@ -90,15 +91,16 @@ const lintCmd = (path: string) => `biome check ${path}` * shape that has a CORRECTABLE MIDDLE BAND (build-passes-but-quality-varies), which * is what makes a benchmark able to separate harnesses at all. * - * The realness signals on each task are tuned so the NATURAL cheat gates, not just one - * strawman stub: a shim only reads as "real" when the actual hard-part work is present - * (refill math / quote-state tracking / capacity eviction), and the fake patterns catch - * the obvious shortcut regardless of decoy tokens (a `refill` param name, a stray - * `for (`, a passthrough `Map`). The smoke test asserts each natural cheat is gated. + * THE ANTI-CHEAT is the held-out suite, not a text scan. Each `heldoutTest` covers the + * SAME behavior as its `visibleTest` with DIFFERENT inputs and extra edge cases, so a + * solution that hardcoded the visible examples' exact values passes `visibleTest` but + * FAILS `heldoutTest`. Execution truth: a real implementation passes both; a cheat that + * fakes the hard part or memorizes the visible cases fails the held-out one (exit 1). * * POWER CAVEAT: three scenarios is far below the n the significance machinery needs to * separate harnesses — the paired tests demonstrate the WIRING, not a defensible claim. - * A real run wants 20-50 tasks. `renderStats` prints this caveat when n < 6. + * A real run wants 20-50 tasks. At this n a near-constant gap can SHOW significance (the + * small-n mirage); `renderStats` flags that and prints the caveat when n < 6. 
*/ export const scenarios: CodingScenario[] = [ { @@ -113,8 +115,8 @@ export const scenarios: CodingScenario[] = [ 'and false otherwise. No external dependencies.', ].join(' '), solutionPath: 'src/rate-limiter.ts', - fixture: { - path: 'test/rate-limiter.test.js', + visibleTest: { + path: 'test/rate-limiter.test.ts', content: `import { test } from 'node:test' import assert from 'node:assert/strict' import { RateLimiter } from '../src/rate-limiter.ts' @@ -137,22 +139,38 @@ test('rejects a second draw that exceeds the remaining bucket', () => { }) `, }, - realnessSignals: { - label: 'token-bucket', - requiredArtifact: /rate-limiter\.ts$/, - // The hard part must be present: actual refill MATH — a clock read combined with - // refillPerSec, or refillPerSec used in an arithmetic expression. A bare `refill` - // identifier (e.g. a constructor param named `refillPerSec`) is NOT enough, so a - // hollow `return true` whose only `refill` is the param name does not read as real. - realImpl: - /(Date\.now\(\)|performance\.now\(\))[\s\S]*refillPerSec\s*[)*]|\*\s*(this\.)?refillPerSec/, - realInfra: /class\s+RateLimiter/, - // The fake: a tryRemove whose body opens with `return true` (no refill math before - // it). A real impl that legitimately ENDS in `return true` after the math is not - // flagged — the shim is "returns true with no logic", not "returns true". Combined - // with the tightened realImpl above, the gate (fakeShim && !realImpl) now fires on - // a stub even when its constructor param is named `refillPerSec`. - fakeShim: /tryRemove\([^)]*\)\s*:\s*boolean\s*{\s*return\s+true/, + // HELD-OUT: same token-bucket behavior, DIFFERENT capacities/draws + extra edge + // cases (exact-capacity draw, zero-token draw). A solution that hardcoded the + // visible numbers (cap 10/3/10, draws 5/4/8) cannot satisfy these. 
+ heldoutTest: { + path: 'test/rate-limiter.heldout.test.ts', + content: `import { test } from 'node:test' +import assert from 'node:assert/strict' +import { RateLimiter } from '../src/rate-limiter.ts' + +test('consumes within a different capacity', () => { + const rl = new RateLimiter(7, 1) + assert.equal(rl.tryRemove(4), true) + assert.equal(rl.tryRemove(3), true) +}) + +test('allows a draw exactly equal to the remaining bucket', () => { + const rl = new RateLimiter(6, 0) + assert.equal(rl.tryRemove(6), true) + assert.equal(rl.tryRemove(1), false) +}) + +test('rejects a draw over a different capacity', () => { + const rl = new RateLimiter(5, 1) + assert.equal(rl.tryRemove(6), false) +}) + +test('a zero-token draw always succeeds without consuming', () => { + const rl = new RateLimiter(2, 0) + assert.equal(rl.tryRemove(0), true) + assert.equal(rl.tryRemove(2), true) +}) +`, }, rubricNote: 'Reward continuous (not discrete-tick) refill, integer-safe token accounting, and ' + @@ -169,8 +187,8 @@ test('rejects a second draw that exceeds the remaining bucket', () => { 'No external dependencies.', ].join(' '), solutionPath: 'src/csv.ts', - fixture: { - path: 'test/csv.test.js', + visibleTest: { + path: 'test/csv.test.ts', content: `import { test } from 'node:test' import assert from 'node:assert/strict' import { parseCsv } from '../src/csv.ts' @@ -192,20 +210,39 @@ test('unescapes a doubled quote', () => { }) `, }, - realnessSignals: { - label: 'csv-rfc4180', - requiredArtifact: /csv\.ts$/, - // Real parsers track quote state and walk the string char-by-char. We anchor to - // quote-state / per-char access (`inQuotes`, `charAt(`, `input[i]`), NOT a bare - // `for (` — a naive `for (line of input.split('\n'))` cheat has a loop but no - // quote state, so it must not read as a real impl. 
- realImpl: /inQuotes|charAt\(|input\[\s*i\s*\]|quote/i, - realInfra: /function\s+parseCsv/, - // The fake: splitting on comma or newline (naive parse) — the RFC-4180 cases - // (quoted comma, embedded newline) make `.split` wrong. Matches anywhere, so the - // naive `input.split('\n').map(l => l.split(','))` AND a `for (… input.split('\n'))` - // loop are both caught. Any such split is the shortcut, regardless of loops around it. - fakeShim: /\.split\(\s*['"`](,|\\n)['"`]\s*\)/, + // HELD-OUT: same RFC-4180 behaviors, DIFFERENT strings + extra edge cases (a multi-row + // input with two records, an empty field). A parser that hardcoded the visible inputs + // cannot satisfy these. + heldoutTest: { + path: 'test/csv.heldout.test.ts', + content: `import { test } from 'node:test' +import assert from 'node:assert/strict' +import { parseCsv } from '../src/csv.ts' + +test('parses a different plain row', () => { + assert.deepEqual(parseCsv('x,y,z,w'), [['x', 'y', 'z', 'w']]) +}) + +test('keeps a comma inside a different quoted field', () => { + assert.deepEqual(parseCsv('p,"q,r,s"'), [['p', 'q,r,s']]) +}) + +test('parses two records separated by a newline', () => { + assert.deepEqual(parseCsv('a,b\\nc,d'), [['a', 'b'], ['c', 'd']]) +}) + +test('keeps a newline inside a different quoted field', () => { + assert.deepEqual(parseCsv('"alpha\\nbeta",gamma'), [['alpha\\nbeta', 'gamma']]) +}) + +test('unescapes a different doubled quote', () => { + assert.deepEqual(parseCsv('"a ""b"" c"'), [['a "b" c']]) +}) + +test('keeps an empty field between commas', () => { + assert.deepEqual(parseCsv('a,,c'), [['a', '', 'c']]) +}) +`, }, rubricNote: 'Reward a single-pass state machine over naive splitting; correct handling of a quoted ' + @@ -214,7 +251,7 @@ test('unescapes a doubled quote', () => { { // The "only the real algorithm passes" task: a capacity-bounded LRU cache. 
There is // no shortcut that satisfies the eviction tests — a bare `Map` (or `extends Map`) - // grows without bound and fails the at-capacity test, AND gates on realness. + // grows without bound and fails the at-capacity held-out test. id: 'lru-cache', kind: 'coding', tags: ['data-structures', 'eviction'], @@ -226,8 +263,8 @@ test('unescapes a doubled quote', () => { '(refreshes recency). No external dependencies.', ].join(' '), solutionPath: 'src/lru.ts', - fixture: { - path: 'test/lru.test.js', + visibleTest: { + path: 'test/lru.test.ts', content: `import { test } from 'node:test' import assert from 'node:assert/strict' import { LruCache } from '../src/lru.ts' @@ -258,20 +295,47 @@ test('returns undefined for a missing key', () => { }) `, }, - realnessSignals: { - label: 'lru-cache', - requiredArtifact: /lru\.ts$/, - // The hard part is eviction: a delete that precedes a set (the recency move), the - // canonical `keys().next()` oldest-key eviction, or an explicit size>=capacity - // check. None of these appear in a no-eviction wrapper. - realImpl: - /\.delete\([^)]*\)[\s\S]*\.set\(|\.keys\(\)\.next\(\)|\.size\s*>=?\s*this\.capacity/, - realInfra: /class\s+LruCache/, - // The fake: a class that `extends Map` (no eviction override), or a `set` body that - // is a single passthrough `.set` with no delete/size logic — the bounded-cache - // shortcut that grows forever. - fakeShim: - /extends\s+Map\b|set\([^)]*\)[^{]*{\s*(this\.|return\s+)?\w+\.set\([^)]*\)\s*;?\s*}/, + // HELD-OUT: same eviction behavior, DIFFERENT capacity + key sequence + extra edge + // cases (a re-set updates value AND refreshes recency). A cache that hardcoded the + // visible keys/order cannot satisfy these. 
+ heldoutTest: { + path: 'test/lru.heldout.test.ts', + content: `import { test } from 'node:test' +import assert from 'node:assert/strict' +import { LruCache } from '../src/lru.ts' + +test('evicts the LRU entry at a different capacity', () => { + const c = new LruCache(3) + c.set('p', 1) + c.set('q', 2) + c.set('r', 3) + c.set('s', 4) + assert.equal(c.get('p'), undefined) + assert.equal(c.get('q'), 2) + assert.equal(c.get('s'), 4) +}) + +test('a get refreshes recency for a different sequence', () => { + const c = new LruCache(2) + c.set('m', 1) + c.set('n', 2) + assert.equal(c.get('m'), 1) + c.set('o', 3) + assert.equal(c.get('n'), undefined) + assert.equal(c.get('m'), 1) +}) + +test('a re-set updates the value and refreshes recency', () => { + const c = new LruCache(2) + c.set('a', 1) + c.set('b', 2) + c.set('a', 9) + c.set('c', 3) + assert.equal(c.get('b'), undefined) + assert.equal(c.get('a'), 9) + assert.equal(c.get('c'), 3) +}) +`, }, rubricNote: 'Reward O(1) get/set with correct LRU eviction and recency refresh on read; an ' + @@ -279,17 +343,23 @@ test('returns undefined for a missing key', () => { }, ] -/** The deterministic check commands for a scenario — derived from its paths, in the - * ordered pipeline the verifier runs (typecheck → test → lint). Eval config: the - * agent is told WHAT to build, never the commands it is graded by. */ +/** The deterministic check commands for a scenario — derived from its paths. + * + * `dev` runs the VISIBLE example tests (seeded during the turn, what steers the refine + * loop). `heldout` runs the HIDDEN grading suite (seeded only at grading, never during + * the turn — the firewall). Eval config: the agent is told WHAT to build (the prompt) + * and develops against the visible tests, but is GRADED on the held-out suite it never + * saw, so it cannot fit the grade. 
*/ export function checkCmds(scenario: CodingScenario): { typecheck: string - test: string + dev: string + heldout: string lint: string } { return { typecheck: typecheckCmd(scenario.solutionPath), - test: testCmd(scenario.fixture.path), + dev: testCmd(scenario.visibleTest.path), + heldout: testCmd(scenario.heldoutTest.path), lint: lintCmd(scenario.solutionPath), } } diff --git a/examples/coding-benchmark/stats.ts b/examples/coding-benchmark/stats.ts index b1f2d34..9ec73c8 100644 --- a/examples/coding-benchmark/stats.ts +++ b/examples/coding-benchmark/stats.ts @@ -179,6 +179,11 @@ export function pairwiseStats( return { leaderboard, pairs } } +/** The power floor below which we never print a bare `SIGNIFICANT` claim — a paired + * test on fewer scenarios than this cannot defensibly separate harnesses, so the tag + * is suppressed regardless of the p-value (small-n mirage protection). */ +const powerFloor = 6 + /** Render the report as a plain leaderboard + significance lines. */ export function renderStats(report: StatsReport): string { const lines: string[] = [] @@ -191,10 +196,19 @@ export function renderStats(report: StatsReport): string { `[${(row.passCi.lower * 100).toFixed(0)}%, ${(row.passCi.upper * 100).toFixed(0)}%] (n=${row.n})`, ) } + // The honest n for the significance tests is the number of MATCHED scenarios — the + // paired unit. Below the power floor we suppress the SIGNIFICANT tag entirely (a + // near-constant gap on a few scenarios can return p<0.05 yet mean nothing — the + // small-n mirage), and a zero-variance pair (delta CI collapsed to a point) likewise + // never reads as a real effect. + const maxN = report.leaderboard.reduce((m, r) => Math.max(m, r.n), 0) + const underpowered = maxN < powerFloor lines.push('') lines.push('Pairwise (paired delta + bootstrap CI; paired-test p, BH-corrected):') for (const p of report.pairs) { - const tag = p.significant ? 'SIGNIFICANT' : 'n.s.' 
+ const degenerate = p.low === p.high // bootstrap CI collapsed → no variance to test + const claimSignificant = p.significant && !underpowered && !degenerate + const tag = claimSignificant ? 'SIGNIFICANT' : underpowered ? 'n.s. (underpowered)' : 'n.s.' lines.push( ` ${p.b} − ${p.a}: Δ=${p.delta.toFixed(3)} [${p.low.toFixed(3)}, ${p.high.toFixed(3)}] ` + `p=${p.p.toFixed(3)} ${tag}`, @@ -203,13 +217,13 @@ export function renderStats(report: StatsReport): string { // Power caveat: with a tiny scenario corpus the significance machinery is structurally // underpowered — the Wilcoxon path returns p=1 for n<6 non-zero diffs, and the paired // t-test has ~1 df. The tests show the WIRING; a real claim needs 20-50 tasks. - const maxN = report.leaderboard.reduce((m, r) => Math.max(m, r.n), 0) - if (maxN < 6) { + if (underpowered) { lines.push('') lines.push( - ` NOTE: n=${maxN} scenarios — below the power floor. The paired tests above cannot ` + - 'reach significance at this corpus size (they demonstrate the wiring). Use 20-50 ' + - 'tasks for a real harness comparison.', + ` NOTE: n=${maxN} scenarios — below the power floor (${powerFloor}). The paired tests ` + + 'above cannot defensibly reach significance at this corpus size, so the SIGNIFICANT ' + + 'tag is suppressed (they demonstrate the wiring). Use 20-50 tasks for a real ' + + 'harness comparison.', ) } return lines.join('\n')