From 113f6efc52720f2709568c136fab3755d8221bf9 Mon Sep 17 00:00:00 2001
From: Drew Stone
Date: Wed, 24 Jun 2026 16:16:01 -0600
Subject: [PATCH] feat(research): real web-research worker + genuinely-live A/B arm
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the live A/B arm's skeleton (which ran the same offline naive
proposer over a hardcoded pool with a junk/-prefix verifier) with a real,
any-topic web-research worker and a real LLM verifying driver, so
AGENT_KNOWLEDGE_LIVE=1 runs a genuine experiment.

src/web-research-worker.ts (general, no hardcoded corpus):
- createWebResearchWorker — glm-5.2 turns open gaps into search queries,
  runs real web search over the router (/v1/search, the endpoint tcloud
  mcp's web_search forwards to), fetches with politeFetch, reduces with
  htmlToText, proposes citing pages via buildPages. Conforms to the loop's
  ResearchWorker contract.
- createVerifyingResearchDriver — a glm-5.2 verifySource pass judging each
  fetched source's on-topic relevance + near-duplicate against the round,
  fail-closed on parse/router failure.
- createTangleRouterClient — dependency-free router client over fetch
  (search + chat), so it works with or without the tcloud CLI installed.

Reasoning-model floor: glm-5.2 calls get max_tokens >= 1200 so visible
content isn't starved by reasoning_content.

Live arm: injects the real worker + verifier + topic-relevant readiness
specs, runs both arms at equal agent-pass budget, cost-gates with a cheap
glm-5.2 smoke first, asserts the worker actually web-searched, and reports
admitted-source count + coverage per arm with agent-eval's pairedBootstrap.
The offline arm is unchanged (CI, $0, deterministic) — the arm-runners gain
defaulted parameters only.
--- src/index.ts | 1 + src/web-research-worker.ts | 497 ++++++++++++++++++ .../loops/research-loop-equal-compute.test.ts | 277 ++++++++-- 3 files changed, 735 insertions(+), 40 deletions(-) create mode 100644 src/web-research-worker.ts diff --git a/src/index.ts b/src/index.ts index 2ce4f8c..9487362 100644 --- a/src/index.ts +++ b/src/index.ts @@ -26,5 +26,6 @@ export * from './store' export * from './two-agent-research-loop' export * from './types' export * from './validate' +export * from './web-research-worker' export * from './wikilinks' export * from './write-protocol' diff --git a/src/web-research-worker.ts b/src/web-research-worker.ts new file mode 100644 index 0000000..ff147a1 --- /dev/null +++ b/src/web-research-worker.ts @@ -0,0 +1,497 @@ +/** + * Real web-research worker + verifying driver for `runTwoAgentResearchLoop`. + * + * This is the GENERAL, any-topic implementation behind the two-agent research + * loop's live arm. Given the open knowledge gaps the readiness gate surfaces, + * the worker: + * + * 1. asks an LLM (glm-5.2 by default) to turn each gap into focused web + * search queries, + * 2. runs a REAL web search over the Tangle router (`POST /v1/search` — the + * same endpoint `tcloud mcp`'s `web_search` tool forwards to), so there is + * no hardcoded corpus, + * 3. fetches the top results with the repo's polite, cached `politeFetch` and + * reduces each page to text with `htmlToText`, + * 4. proposes the readable, verifiable pages as `ResearchSourceProposal`s plus + * a `buildPages` that writes citing `knowledge/*.md` pages from the sources + * the driver accepts. + * + * The verifying DRIVER is the differentiated role from the two-agent loop: a + * second LLM pass that judges each fetched source's on-topic relevance to the + * goal + open gaps and rejects off-topic / spam / already-covered material. The + * worker ADDS; the driver GATES. Together they build a cleaner knowledge base + * than a single agent at the same compute budget. 
+ * + * Dependency-free on purpose: it talks to the router over `fetch` directly with + * the published OpenAI-compatible chat shape and the `/v1/search` shape, so it + * works whether or not the `tcloud` CLI is installed. Point it at any router by + * passing `baseUrl`; supply the key via `apiKey` or `TANGLE_API_KEY`. + */ + +import { htmlToText } from './sources/html' +import { politeFetch } from './sources/http' +import type { + KnowledgeGap, + ResearchContribution, + ResearchDriver, + ResearchSourceProposal, + ResearchWorker, + SourceVerdict, + SourceVerificationContext, + WorkerResearchContext, +} from './two-agent-research-loop' +import type { SourceRecord } from './types' + +/** Default router model. Plain id (no namespace) — see CLAUDE/creds notes. */ +const DEFAULT_MODEL = 'glm-5.2' +/** Default router base. */ +const DEFAULT_BASE_URL = 'https://router.tangle.tools/v1' +/** + * glm-5.2 spends its first tokens on hidden `reasoning_content`; below ~1200 + * output tokens it returns EMPTY visible content. Floor every call so a + * reasoning model never silently yields nothing. (Verified creds note.) + */ +const MIN_MAX_TOKENS = 1200 + +/** One live web result, as the router's `/v1/search` returns it. */ +export interface WebSearchHit { + title: string + url: string + snippet?: string +} + +/** + * The two router capabilities the worker/driver need. Injectable so tests can + * stub the network; the default talks to the live Tangle router over `fetch`. + */ +export interface RouterClient { + /** Live web search — returns title/url/snippet hits. */ + search(query: string, opts?: { maxResults?: number }): Promise + /** Chat completion — returns the assistant message's visible text. */ + chat( + messages: { role: 'system' | 'user'; content: string }[], + maxTokens?: number, + ): Promise +} + +export interface TangleRouterOptions { + /** Router base URL. Defaults to `https://router.tangle.tools/v1`. */ + baseUrl?: string + /** Bearer key. 
Defaults to `process.env.TANGLE_API_KEY`. */ + apiKey?: string + /** Chat model id. Defaults to `glm-5.2`. */ + model?: string + /** Optional preferred search provider (exa | you | perplexity | …). */ + searchProvider?: string + signal?: AbortSignal +} + +/** A small error so a failed router call fails loud rather than returning junk. */ +export class RouterError extends Error { + constructor( + public readonly status: number, + message: string, + ) { + super(`router ${status}: ${message}`) + this.name = 'RouterError' + } +} + +/** + * Build a dependency-free Tangle router client over `fetch`. This is the same + * wire surface the `tcloud` SDK + `tcloud mcp` use (`/v1/search` for web search, + * `/v1/chat/completions` for chat) so it needs no CLI installed. + */ +export function createTangleRouterClient(options: TangleRouterOptions = {}): RouterClient { + const baseUrl = (options.baseUrl ?? DEFAULT_BASE_URL).replace(/\/$/, '') + const apiKey = options.apiKey ?? process.env.TANGLE_API_KEY + if (!apiKey) { + throw new RouterError(401, 'no TANGLE_API_KEY (pass apiKey or set the env var)') + } + const model = options.model ?? DEFAULT_MODEL + const headers = { + 'Content-Type': 'application/json', + Authorization: `Bearer ${apiKey}`, + } + + return { + async search(query, opts) { + const res = await fetch(`${baseUrl}/search`, { + method: 'POST', + headers, + signal: options.signal, + body: JSON.stringify({ + query, + ...(options.searchProvider ? { provider: options.searchProvider } : {}), + ...(opts?.maxResults != null ? { maxResults: opts.maxResults } : {}), + }), + }) + if (!res.ok) { + throw new RouterError(res.status, await res.text().catch(() => res.statusText)) + } + const body = (await res.json()) as { data?: WebSearchHit[] } + return (body.data ?? []) + .filter((hit) => typeof hit?.url === 'string' && hit.url.length > 0) + .map((hit) => ({ title: hit.title ?? 
hit.url, url: hit.url, snippet: hit.snippet })) + }, + async chat(messages, maxTokens) { + // Reasoning-model floor: never let glm-5.2 spend the whole budget on + // hidden reasoning and return empty visible content. + const max_tokens = Math.max(MIN_MAX_TOKENS, maxTokens ?? MIN_MAX_TOKENS) + const res = await fetch(`${baseUrl}/chat/completions`, { + method: 'POST', + headers, + signal: options.signal, + body: JSON.stringify({ model, messages, max_tokens, temperature: 0.2, stream: false }), + }) + if (!res.ok) { + throw new RouterError(res.status, await res.text().catch(() => res.statusText)) + } + const body = (await res.json()) as { + choices?: { message?: { content?: string } }[] + } + return body.choices?.[0]?.message?.content ?? '' + }, + } +} + +export interface WebResearchWorkerOptions { + /** Router client. Defaults to a live Tangle router client from env creds. */ + router?: RouterClient + router_options?: TangleRouterOptions + /** Max search queries the LLM may form per gap. Default 2. */ + queriesPerGap?: number + /** Max web results fetched per query. Default 3. */ + resultsPerQuery?: number + /** Hard cap on sources proposed per round (across all gaps). Default 6. */ + maxSourcesPerRound?: number + /** Disk cache dir for `politeFetch`. Optional; speeds repeat runs. */ + cacheDir?: string + /** Minimum readable text length to keep a fetched page. Default 200. */ + minTextChars?: number + /** Max chars of page text stored per source (keeps pages bounded). Default 4000. */ + maxTextChars?: number +} + +/** Resolve the router client lazily so a worker with an injected client never reads env. */ +function resolveRouter(opts: { + router?: RouterClient + router_options?: TangleRouterOptions +}): RouterClient { + return opts.router ?? createTangleRouterClient(opts.router_options) +} + +/** + * The real web-research worker. 
Conforms to the loop's `ResearchWorker` + * contract: given the open gaps, it returns the sources it found plus a + * `buildPages` that emits citing pages from the sources the driver accepted. + */ +export function createWebResearchWorker(options: WebResearchWorkerOptions = {}): ResearchWorker { + const queriesPerGap = Math.max(1, options.queriesPerGap ?? 2) + const resultsPerQuery = Math.max(1, options.resultsPerQuery ?? 3) + const maxSourcesPerRound = Math.max(1, options.maxSourcesPerRound ?? 6) + const minTextChars = Math.max(1, options.minTextChars ?? 200) + const maxTextChars = Math.max(minTextChars, options.maxTextChars ?? 4000) + + return async (ctx: WorkerResearchContext): Promise => { + const router = resolveRouter(options) + // Target the BLOCKING gaps first; fall back to all gaps if none are blocking. + const targetGaps = ctx.gaps.filter((gap) => gap.blocking) + const gaps = targetGaps.length > 0 ? targetGaps : ctx.gaps + if (gaps.length === 0) { + return { sources: [], notes: 'no open gaps to research' } + } + + const queries = await formSearchQueries(router, ctx, gaps, queriesPerGap) + const proposals: ResearchSourceProposal[] = [] + const seenUris = new Set() + + for (const query of queries) { + if (proposals.length >= maxSourcesPerRound) break + if (ctx.signal?.aborted) break + let hits: WebSearchHit[] + try { + hits = await router.search(query, { maxResults: resultsPerQuery }) + } catch (error) { + // A single failed query must not sink the round — record nothing, move on. 
+ if ((error as { name?: string }).name === 'AbortError') break + continue + } + for (const hit of hits) { + if (proposals.length >= maxSourcesPerRound) break + if (seenUris.has(hit.url)) continue + const fetched = await politeFetch(hit.url, { + signal: ctx.signal, + cacheDir: options.cacheDir, + }) + if (!fetched.verifiable) continue + const text = htmlToText(fetched.body).slice(0, maxTextChars) + if (text.length < minTextChars) continue + seenUris.add(hit.url) + proposals.push({ + uri: hit.url, + title: hit.title || hit.url, + text, + // We just fetched + verified this page, so stamp `lastVerifiedAt` with + // fetch time. Do NOT set `validUntil`: a live page has no inherent + // future expiry, and the readiness freshness check treats any + // `validUntil <= now` as EXPIRED (score 0). The page's `Last-Modified` + // is a PAST date, so writing it here would mark every real source stale + // and zero out coverage. Record it as provenance metadata instead. + lastVerifiedAt: fetched.fetchedAt, + metadata: { + discoveredVia: query, + snippet: hit.snippet ?? '', + goal: ctx.goal, + sourceUpdatedAt: fetched.sourceUpdatedAt, + }, + }) + } + } + + return { + sources: proposals, + buildPages: buildCitingPages(proposals), + notes: `web-research worker: ${queries.length} queries → ${proposals.length} fetched sources`, + } + } +} + +/** + * Ask the LLM to turn the open gaps into focused web search queries. Falls back + * to the gap's own readiness query if the model returns nothing parseable, so a + * model hiccup degrades to a sane search rather than an empty round. + */ +async function formSearchQueries( + router: RouterClient, + ctx: WorkerResearchContext, + gaps: KnowledgeGap[], + queriesPerGap: number, +): Promise { + const gapLines = gaps + .map((gap, i) => `${i + 1}. ${gap.description} (readiness query: "${gap.query}")`) + .join('\n') + const want = gaps.length * queriesPerGap + const system = + 'You are a research librarian. 
Turn knowledge gaps into precise web search queries that will ' + + 'surface authoritative primary sources (papers, docs, standards, official pages). ' + + 'Return ONLY a JSON array of query strings, no prose.' + const user = [ + `Research goal: ${ctx.goal}`, + ctx.steer ? `Steer from the coordinator:\n${ctx.steer}` : '', + `Open knowledge gaps:\n${gapLines}`, + `Return up to ${want} search query strings as a JSON array (e.g. ["query one","query two"]).`, + ] + .filter(Boolean) + .join('\n\n') + + let raw = '' + try { + raw = await router.chat( + [ + { role: 'system', content: system }, + { role: 'user', content: user }, + ], + MIN_MAX_TOKENS, + ) + } catch { + raw = '' + } + const parsed = parseQueryList(raw) + const fromLlm = parsed.slice(0, want) + if (fromLlm.length > 0) return dedupeStrings(fromLlm) + // Degrade to the readiness queries themselves — still a real search. + return dedupeStrings(gaps.map((gap) => gap.query || gap.description)) +} + +/** Parse a JSON array of query strings, tolerant of code fences / surrounding prose. */ +function parseQueryList(raw: string): string[] { + const text = raw.trim() + if (!text) return [] + const candidates: string[] = [] + // Prefer a fenced or bare JSON array. 
+ const arrayMatch = text.match(/\[[\s\S]*\]/) + if (arrayMatch) { + try { + const arr = JSON.parse(arrayMatch[0]) as unknown[] + for (const item of arr) + if (typeof item === 'string' && item.trim()) candidates.push(item.trim()) + } catch { + /* fall through to line parsing */ + } + } + if (candidates.length === 0) { + for (const line of text.split('\n')) { + const cleaned = line + .replace(/^\s*(?:[-*]|\d+[.)])\s*/, '') + .replace(/^["']|["']$/g, '') + .trim() + if (cleaned && cleaned.length > 2 && !cleaned.startsWith('{')) candidates.push(cleaned) + } + } + return candidates +} + +function dedupeStrings(values: string[]): string[] { + const seen = new Set() + const out: string[] = [] + for (const value of values) { + const key = value.toLowerCase().trim() + if (key && !seen.has(key)) { + seen.add(key) + out.push(value.trim()) + } + } + return out +} + +/** + * Build the curated `knowledge/*.md` pages from the sources the driver accepted. + * Each page cites the registered source by its assigned `record.id` (matched back + * to the proposal via `metadata.originalUri`, which `addSourceText` stashes). + */ +function buildCitingPages( + proposals: ResearchSourceProposal[], +): (acceptedSources: SourceRecord[]) => string | undefined { + return (acceptedSources) => { + if (acceptedSources.length === 0) return undefined + const blocks = acceptedSources.map((record) => { + const proposal = proposals.find((p) => p.uri === record.metadata?.originalUri) + const uri = proposal?.uri ?? record.id + const slug = + uri + .replace(/^https?:\/\//, '') + .replace(/[^a-z0-9]+/gi, '-') + .replace(/^-+|-+$/g, '') + .slice(0, 120) || record.id + const title = proposal?.title ?? record.id + const body = proposal?.text ?? 
'' + return [ + `---FILE: knowledge/${slug}.md---`, + '---', + `title: ${escapeYaml(title)}`, + `sources: ["${record.id}"]`, + `source_url: ${uri}`, + '---', + `# ${title}`, + '', + body, + '', + `Source: ${uri}`, + '---END FILE---', + ].join('\n') + }) + return blocks.join('\n') + } +} + +function escapeYaml(value: string): string { + // Keep the frontmatter line single-valued and safe: collapse newlines, strip + // quotes that would break the scalar. + return value + .replace(/[\r\n]+/g, ' ') + .replace(/"/g, "'") + .trim() +} + +export interface VerifyingDriverOptions { + router?: RouterClient + router_options?: TangleRouterOptions + /** + * When the LLM verdict can't be parsed, default to REJECT (fail-closed) so a + * model hiccup never poisons the KB with an unverified source. Set `true` to + * accept-on-parse-failure only if you have a reason to. Default false. + */ + acceptOnParseFailure?: boolean +} + +/** + * The verifying driver: a real LLM pass that judges each candidate source's + * on-topic relevance to the goal + open gaps and whether it duplicates material + * already accepted this round. This is the differentiated coordinator role — it + * GATES the worker's additions; it adds nothing itself. + * + * The loop already dedups exact-uri duplicates and only calls this on genuinely + * new candidates, so the verifier focuses on relevance + near-duplicate + * judgement, not bookkeeping. + */ +export function createVerifyingResearchDriver( + options: VerifyingDriverOptions = {}, +): ResearchDriver { + const acceptOnParseFailure = options.acceptOnParseFailure ?? false + return { + async verifySource( + source: ResearchSourceProposal, + ctx: SourceVerificationContext, + ): Promise { + const router = resolveRouter(options) + const gapLines = ctx.gaps + .map((gap) => `- ${gap.description} (query: "${gap.query}")`) + .join('\n') + const acceptedTitles = ctx.acceptedThisRound + .map((accepted) => `- ${accepted.title ?? 
accepted.uri}`) + .join('\n') + const excerpt = source.text.slice(0, 1500) + const system = + 'You verify whether a fetched web source belongs in a curated knowledge base. ' + + 'Accept a source ONLY if it is genuinely on-topic for the research goal and helps close ' + + 'one of the open gaps, AND it is not a near-duplicate of an already-accepted source. ' + + 'Reject spam, listicles, off-topic pages, marketing, and near-duplicates. ' + + 'Respond with ONLY a JSON object: {"accept": true|false, "reason": ""}.' + const user = [ + `Research goal: ${ctx.goal}`, + `Open gaps:\n${gapLines || '(none specified)'}`, + acceptedTitles + ? `Already accepted this round:\n${acceptedTitles}` + : 'Nothing accepted yet this round.', + `Candidate source:\nURL: ${source.uri}\nTitle: ${source.title ?? '(none)'}\nExcerpt:\n${excerpt}`, + 'Verdict as JSON {"accept": boolean, "reason": string}:', + ].join('\n\n') + + let raw = '' + try { + raw = await router.chat( + [ + { role: 'system', content: system }, + { role: 'user', content: user }, + ], + MIN_MAX_TOKENS, + ) + } catch (error) { + if ((error as { name?: string }).name === 'AbortError') throw error + // Router failure: fail-closed (reject) so an unverified source can't slip in. + return acceptOnParseFailure + ? { accept: true } + : { accept: false, reason: `verifier unavailable: ${(error as Error).message}` } + } + + const verdict = parseVerdict(raw) + if (verdict) return verdict + return acceptOnParseFailure + ? { accept: true } + : { accept: false, reason: 'verifier returned an unparseable verdict' } + }, + } +} + +/** Parse the verifier's `{accept, reason}` JSON, tolerant of fences / prose. 
*/ +function parseVerdict(raw: string): SourceVerdict | null { + const text = raw.trim() + if (!text) return null + const objMatch = text.match(/\{[\s\S]*\}/) + if (!objMatch) return null + try { + const parsed = JSON.parse(objMatch[0]) as { accept?: unknown; reason?: unknown } + if (typeof parsed.accept !== 'boolean') return null + if (parsed.accept) return { accept: true } + return { + accept: false, + reason: + typeof parsed.reason === 'string' && parsed.reason.trim() + ? parsed.reason.trim() + : 'rejected by verifier', + } + } catch { + return null + } +} diff --git a/tests/loops/research-loop-equal-compute.test.ts b/tests/loops/research-loop-equal-compute.test.ts index 74ee214..8ae6544 100644 --- a/tests/loops/research-loop-equal-compute.test.ts +++ b/tests/loops/research-loop-equal-compute.test.ts @@ -31,6 +31,12 @@ import { type SourceVerdict, type WorkerResearchContext, } from '../../src/two-agent-research-loop' +import { + createTangleRouterClient, + createVerifyingResearchDriver, + createWebResearchWorker, + type RouterClient, +} from '../../src/web-research-worker' // =========================================================================== // THE VALUE A/B: does the verifying-driver (two-agent) loop build a CLEANER @@ -289,17 +295,33 @@ async function junkAdmitted(root: string): Promise { * the loops gate: build the readiness report over the current index, count how * many blocking specs are NOT in `blockingMissingRequirements`. */ -async function coverage(root: string, goal: string): Promise { +async function coverage( + root: string, + goal: string, + specs: KnowledgeReadinessSpec[] = blockingSpecs, +): Promise { const index = await buildKnowledgeIndex(root) // `buildEvalKnowledgeBundle` searches each spec over the index's PAGES and // runs the substrate `scoreKnowledgeReadiness` — the SAME scorer both loops // gate on. Coverage = fraction of blocking specs NOT in the missing set. 
- const { report } = buildEvalKnowledgeBundle({ taskId: goal, index, specs: blockingSpecs }) - const blockingCount = blockingSpecs.filter((s) => s.importance === 'blocking').length + const { report } = buildEvalKnowledgeBundle({ taskId: goal, index, specs }) + const blockingCount = specs.filter((s) => s.importance === 'blocking').length const missing = report.blockingMissingRequirements.length return blockingCount === 0 ? 1 : (blockingCount - missing) / blockingCount } +/** + * Total sources that reached the KB. For REAL research there is no `junk/` + * prefix oracle — the live signal is that the verifying driver admits FEWER + * sources because it rejects the off-topic ones the single-agent loop keeps. So + * the live A/B compares admitted-source COUNTS (cleaner = fewer admitted at + * equal-or-higher coverage), the real-world analogue of the offline junk count. + */ +async function admittedSourceCount(root: string): Promise { + const index = await buildKnowledgeIndex(root) + return index.sources.length +} + /** * The source-record id `addSourceText` will assign — deterministic from the * text+uri (see src/sources.ts: `stableId('src', `${sha256(text)}:${uri}`)`). We @@ -321,17 +343,38 @@ function predictedSourceId(source: ResearchSourceProposal): string { * uses (so both arms converge by the same criterion, not at unequal effort). * Returns the actual passes spent so the test can compare compute. */ +/** + * A round of primary research, abstracted so the SAME arm-runners drive both the + * offline naive proposer AND a real web-research worker. Given the goal + open + * gaps + optional steer, return the sources found this pass. The offline default + * is `makeNaiveProposals` over the planted pool; the live arm injects the real + * `createWebResearchWorker`. 
+ */ +type ProposeSources = ( + ctx: { + goal: string + gaps: { description: string; query: string }[] + steer?: string + root: string + }, + onPass: () => void, +) => Promise + +const offlinePropose: ProposeSources = (ctx, onPass) => makeNaiveProposals(ctx, onPass) + async function runSingleAgentArm( root: string, goal: string, maxIterations: number, + propose: ProposeSources = offlinePropose, + specs: KnowledgeReadinessSpec[] = blockingSpecs, ): Promise<{ passes: number }> { let passes = 0 await runKnowledgeResearchLoop({ root, goal, maxIterations, - readinessSpecs: blockingSpecs, + readinessSpecs: specs, async step(context): Promise { passes += 1 const report = context.readiness?.report @@ -348,7 +391,7 @@ async function runSingleAgentArm( ? (req.metadata.query as string) : req.description, })) - const proposals = await makeNaiveProposals({ goal, gaps }, () => {}) + const proposals = await propose({ goal, gaps, root }, () => {}) if (proposals.length === 0) return { notes: 'no new proposals' } // Register sources AND write citing pages in one step — exactly like the // two-agent worker (sources + buildPages), but with NO driver verification @@ -370,16 +413,32 @@ async function runTwoAgentArm( root: string, goal: string, rounds: number, + arm?: { worker: ResearchWorker; driver: ResearchDriver }, + specs: KnowledgeReadinessSpec[] = blockingSpecs, ): Promise<{ passes: number }> { let workerPasses = 0 + // Default arm = the offline naive proposer + prefix-check verifier. The live + // arm injects the real web-research worker + the LLM verifying driver. Either + // way the worker pass is counted once per invocation via a thin wrapper, so + // the equal-compute accounting below holds for both. + const worker: ResearchWorker = + arm?.worker ?? + twoAgentWorker(() => { + workerPasses += 1 + }) + const countedWorker: ResearchWorker = arm + ? async (ctx) => { + workerPasses += 1 + return worker(ctx) + } + : worker + const driver: ResearchDriver = arm?.driver ?? 
verifyingDriver() await runTwoAgentResearchLoop({ root, goal, - worker: twoAgentWorker(() => { - workerPasses += 1 - }), - driver: verifyingDriver(), - readinessSpecs: blockingSpecs, + worker: countedWorker, + driver, + readinessSpecs: specs, maxRounds: rounds, }) // EQUAL-COMPUTE ACCOUNTING: every round that ran a worker pass also ran one @@ -475,57 +534,195 @@ describe('research loop A/B at equal compute (offline, controlled lower bound)', // =========================================================================== // LIVE A/B — the real evidence. Skipped offline (no creds), exactly like -// tests/sources-live.test.ts. Runs BOTH loops on a real research goal with a -// real worker at equal compute over N goals, and reports a PAIRED comparison -// of junk-admitted and coverage via agent-eval's pairedBootstrap (no hand- -// rolled significance). What this adds over the offline arm: the worker's junk -// is NOT prefix-detectable and the driver's verifier is a real LLM, so a win -// here is evidence the verifying driver cleans REAL research, not a planted -// floor. +// tests/sources-live.test.ts. Runs BOTH loops on a REAL research goal with the +// REAL web-research worker (glm-5.2 query-gen → live `/v1/search` → politeFetch +// → htmlToText) and a REAL LLM verifying driver (glm-5.2 on-topic judgement) at +// equal compute, then reports a PAIRED comparison via agent-eval's +// pairedBootstrap (no hand-rolled significance). +// +// What this adds over the offline arm: there is NO planted `junk/` pool and NO +// prefix-check verifier — the worker fetches whatever the web returns and the +// driver is an LLM. So the live cleanliness signal is admitted-source COUNT: +// the verifying driver rejects off-topic fetches, so the two-agent KB admits +// FEWER sources at equal-or-higher coverage. A win here is evidence the +// verifying driver cleans REAL research, not a planted floor. +// +// Gate: `AGENT_KNOWLEDGE_LIVE=1` + a TANGLE_API_KEY with glm-5.2 credits. 
+// AGENT_KNOWLEDGE_LIVE_GOALS — `|`-separated goals (default: self-speculative decoding) +// AGENT_KNOWLEDGE_LIVE_BUDGET — agent-pass ceiling B per arm (default 4) +// AGENT_KNOWLEDGE_LIVE_MODEL — router chat model (default glm-5.2) // =========================================================================== + +/** + * Topic-relevant blocking readiness specs for a live goal. General: the gaps + * are phrased so a real web search can close them, and the search/readiness + * query carries the goal so coverage scores against fetched pages. Override the + * default goal via `AGENT_KNOWLEDGE_LIVE_GOALS` (this builds matching specs). + */ +function liveSpecsForGoal(liveGoal: string): KnowledgeReadinessSpec[] { + return [ + defineReadinessSpec({ + id: 'topic/definition', + description: `what ${liveGoal} is and how it works`, + query: `${liveGoal} how it works method`, + requiredFor: ['ResearchAgent'], + importance: 'blocking', + minSources: 1, + minHits: 1, + }), + defineReadinessSpec({ + id: 'topic/results', + description: `reported results, speedups, or trade-offs for ${liveGoal}`, + query: `${liveGoal} speedup results benchmark`, + requiredFor: ['ResearchAgent'], + importance: 'blocking', + minSources: 1, + minHits: 1, + }), + ] +} + describe.skipIf(!process.env.AGENT_KNOWLEDGE_LIVE)( 'live: research loop A/B at equal compute', () => { - it('two-agent vs single-agent over N goals — paired comparison of junk-admitted', async () => { - // The set of real research goals to A/B over. A live harness would swap - // the offline naive proposer for a real researcher backend and the - // prefix-check verifier for an LLM verifySource. Both arms still run at - // the same agent-pass budget B; only the worker/verifier change. - const goals = (process.env.AGENT_KNOWLEDGE_LIVE_GOALS ?? goal).split('|').map((g) => g.trim()) - const budgetPasses = Number(process.env.AGENT_KNOWLEDGE_LIVE_BUDGET ?? 
6) - - const twoAgentJunkByGoal: number[] = [] - const singleAgentJunkByGoal: number[] = [] + it('two-agent (real worker + LLM verifier) vs single-agent — paired comparison', async () => { + const goals = (process.env.AGENT_KNOWLEDGE_LIVE_GOALS ?? 'self-speculative decoding') + .split('|') + .map((g) => g.trim()) + .filter(Boolean) + const budgetPasses = Number(process.env.AGENT_KNOWLEDGE_LIVE_BUDGET ?? 4) + const model = process.env.AGENT_KNOWLEDGE_LIVE_MODEL ?? 'glm-5.2' + + // ONE shared router client for the whole run (web search + chat). + const router: RouterClient = createTangleRouterClient({ model }) + + // COST GATE: a cheap glm-5.2 smoke BEFORE the multi-arm burn. Proves the + // key works + the reasoning-token floor returns visible content. If this + // returns empty or throws, the full A/B can't produce real numbers — fail + // fast instead of spending the whole budget to discover it. + const smoke = await router.chat( + [ + { role: 'system', content: 'Reply with exactly the word: OK' }, + { role: 'user', content: 'Say OK.' 
}, + ], + 1200, + ) + console.log(`[LIVE smoke] glm-5.2 visible content length=${smoke.trim().length}`) + expect(smoke.trim().length).toBeGreaterThan(0) + + const realWorker = createWebResearchWorker({ + router, + resultsPerQuery: 3, + queriesPerGap: 1, + maxSourcesPerRound: 6, + }) + const realDriver = createVerifyingResearchDriver({ router }) + + const twoAdmittedByGoal: number[] = [] + const singleAdmittedByGoal: number[] = [] + let anySourceFetched = false + for (const liveGoal of goals) { + const specs = liveSpecsForGoal(liveGoal) const twoRoot = await mkdtemp(join(tmpdir(), 'live-two-')) const singleRoot = await mkdtemp(join(tmpdir(), 'live-single-')) try { - await runTwoAgentArm(twoRoot, liveGoal, budgetPasses / 2) - await runSingleAgentArm(singleRoot, liveGoal, budgetPasses) - twoAgentJunkByGoal.push(await junkAdmitted(twoRoot)) - singleAgentJunkByGoal.push(await junkAdmitted(singleRoot)) + // TWO-AGENT arm: real worker proposes, real LLM driver verifies. + const two = await runTwoAgentArm( + twoRoot, + liveGoal, + budgetPasses / 2, + { worker: realWorker, driver: realDriver }, + specs, + ) + // SINGLE-AGENT arm: the SAME real worker, NO verifier gate, more iters + // to spend the same agent-pass budget the two-agent loop burns on + // verification. 
+ const single = await runSingleAgentArm( + singleRoot, + liveGoal, + budgetPasses, + (ctx, onPass) => realWorkerPropose(realWorker, ctx, onPass), + specs, + ) + + const twoAdmitted = await admittedSourceCount(twoRoot) + const singleAdmitted = await admittedSourceCount(singleRoot) + const twoCoverage = await coverage(twoRoot, liveGoal, specs) + const singleCoverage = await coverage(singleRoot, liveGoal, specs) + if (twoAdmitted > 0 || singleAdmitted > 0) anySourceFetched = true + + twoAdmittedByGoal.push(twoAdmitted) + singleAdmittedByGoal.push(singleAdmitted) + + console.log( + `[LIVE A/B ${JSON.stringify(liveGoal)} @ B<=${budgetPasses}] ` + + `two-agent: passes=${two.passes} admitted=${twoAdmitted} coverage=${twoCoverage.toFixed(2)} | ` + + `single-agent: passes=${single.passes} admitted=${singleAdmitted} coverage=${singleCoverage.toFixed(2)}`, + ) } finally { await rm(twoRoot, { recursive: true, force: true }) await rm(singleRoot, { recursive: true, force: true }) } } - // Paired bootstrap on (single − two) junk deltas: a POSITIVE delta means - // the single-agent loop admitted MORE junk, i.e. the two-agent loop is - // cleaner. `low > 0` is the gate — the cleanliness gain is real at the - // confidence level, not luck. Do NOT hand-roll this; reuse the substrate. - const result = pairedBootstrap(twoAgentJunkByGoal, singleAgentJunkByGoal, { + // The live arm is only evidence if the worker actually web-searched and + // fetched real pages. Zero sources across both arms = the worker never + // reached the web (creds/network) — that is a FALSE null, fail loud. + expect(anySourceFetched).toBe(true) + + // Paired bootstrap on (single − two) admitted-source deltas: a POSITIVE + // delta means the single-agent loop admitted MORE sources, i.e. the + // verifying driver kept the KB cleaner. `low > 0` is the significance + // gate. Reuse the substrate; do NOT hand-roll significance. 
+ const result = pairedBootstrap(twoAdmittedByGoal, singleAdmittedByGoal, { statistic: 'mean', seed: 1, }) console.log( - `[LIVE A/B] n=${result.n} mean(single-two junk)=${result.mean.toFixed(3)} ` + + `[LIVE A/B] n=${result.n} mean(single-two admitted)=${result.mean.toFixed(3)} ` + `CI=[${result.low.toFixed(3)}, ${result.high.toFixed(3)}] — ` + `two-agent cleaner iff low > 0`, ) - // At equal compute, the two-agent loop should admit no more junk on - // average; the bootstrap lower bound says whether that is significant. + // At equal compute the verifying driver should admit no MORE sources than + // the ungated single-agent loop on average; the bootstrap lower bound says + // whether the cleanliness gain is significant. expect(result.mean).toBeGreaterThanOrEqual(0) - }, 120_000) + }, 600_000) }, ) + +/** + * Drive the real web-research worker as a single-agent proposer: build a + * `WorkerResearchContext` from the loop context and return the sources it found. + * Charges one pass per invocation via `onPass`, matching the two-agent worker. + */ +async function realWorkerPropose( + worker: ResearchWorker, + ctx: { + goal: string + gaps: { description: string; query: string }[] + root: string + steer?: string + }, + onPass: () => void, +): Promise { + onPass() + const index = await buildKnowledgeIndex(ctx.root) + const readiness = buildEvalKnowledgeBundle({ taskId: ctx.goal, index, specs: [] }) + const contribution = await worker({ + root: ctx.root, + goal: ctx.goal, + round: 1, + index, + gaps: ctx.gaps.map((gap) => ({ + id: gap.description, + description: gap.description, + query: gap.query, + blocking: true, + })), + steer: ctx.steer, + readiness, + }) + return contribution.sources ?? [] +}