From 113f6efc52720f2709568c136fab3755d8221bf9 Mon Sep 17 00:00:00 2001
From: Drew Stone <drewstone329@gmail.com>
Date: Wed, 24 Jun 2026 16:16:01 -0600
Subject: [PATCH] feat(research): real web-research worker + genuinely-live A/B
 arm
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the live A/B arm's skeleton (which ran the same offline naive
proposer over a hardcoded pool with a junk/-prefix verifier) with a real,
any-topic web-research worker and a real LLM verifying driver, so
AGENT_KNOWLEDGE_LIVE=1 runs a genuine experiment.

src/web-research-worker.ts (general, no hardcoded corpus):
- createWebResearchWorker — glm-5.2 turns open gaps into search queries,
  runs real web search over the router (/v1/search, the endpoint tcloud
  mcp's web_search forwards to), fetches with politeFetch, reduces with
  htmlToText, proposes citing pages via buildPages. Conforms to the loop's
  ResearchWorker contract.
- createVerifyingResearchDriver — a glm-5.2 verifySource pass judging each
  fetched source's on-topic relevance + near-duplicate against the round,
  fail-closed on parse/router failure.
- createTangleRouterClient — dependency-free router client over fetch
  (search + chat), so it works with or without the tcloud CLI installed.
  Reasoning-model floor: every chat call gets max_tokens >= 1200 so glm-5.2's
  visible content isn't starved by reasoning_content.

Live arm: injects the real worker + verifier + topic-relevant readiness
specs, runs both arms at equal agent-pass budget, cost-gates with a cheap
glm-5.2 smoke first, asserts the worker actually web-searched, and reports
admitted-source count + coverage per arm with agent-eval's pairedBootstrap.
The offline arm is unchanged (CI, $0, deterministic) — the arm-runners gain
defaulted parameters only.
---
 src/index.ts                                  |   1 +
 src/web-research-worker.ts                    | 497 ++++++++++++++++++
 .../loops/research-loop-equal-compute.test.ts | 277 ++++++++--
 3 files changed, 735 insertions(+), 40 deletions(-)
 create mode 100644 src/web-research-worker.ts

diff --git a/src/index.ts b/src/index.ts
index 2ce4f8c..9487362 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -26,5 +26,6 @@ export * from './store'
 export * from './two-agent-research-loop'
 export * from './types'
 export * from './validate'
+export * from './web-research-worker'
 export * from './wikilinks'
 export * from './write-protocol'
diff --git a/src/web-research-worker.ts b/src/web-research-worker.ts
new file mode 100644
index 0000000..ff147a1
--- /dev/null
+++ b/src/web-research-worker.ts
@@ -0,0 +1,497 @@
+/**
+ * Real web-research worker + verifying driver for `runTwoAgentResearchLoop`.
+ *
+ * This is the GENERAL, any-topic implementation behind the two-agent research
+ * loop's live arm. Given the open knowledge gaps the readiness gate surfaces,
+ * the worker:
+ *
+ *   1. asks an LLM (glm-5.2 by default) to turn each gap into focused web
+ *      search queries,
+ *   2. runs a REAL web search over the Tangle router (`POST /v1/search` — the
+ *      same endpoint `tcloud mcp`'s `web_search` tool forwards to), so there is
+ *      no hardcoded corpus,
+ *   3. fetches the top results with the repo's polite, cached `politeFetch` and
+ *      reduces each page to text with `htmlToText`,
+ *   4. proposes the readable, verifiable pages as `ResearchSourceProposal`s plus
+ *      a `buildPages` that writes citing `knowledge/*.md` pages from the sources
+ *      the driver accepts.
+ *
+ * The verifying DRIVER is the differentiated role from the two-agent loop: a
+ * second LLM pass that judges each fetched source's on-topic relevance to the
+ * goal + open gaps and rejects off-topic / spam / already-covered material. The
+ * worker ADDS; the driver GATES. Together they build a cleaner knowledge base
+ * than a single agent at the same compute budget.
+ *
+ * Dependency-free on purpose: it talks to the router over `fetch` directly with
+ * the published OpenAI-compatible chat shape and the `/v1/search` shape, so it
+ * works whether or not the `tcloud` CLI is installed. Point it at any router by
+ * passing `baseUrl`; supply the key via `apiKey` or `TANGLE_API_KEY`.
+ */
+
+import { htmlToText } from './sources/html'
+import { politeFetch } from './sources/http'
+import type {
+  KnowledgeGap,
+  ResearchContribution,
+  ResearchDriver,
+  ResearchSourceProposal,
+  ResearchWorker,
+  SourceVerdict,
+  SourceVerificationContext,
+  WorkerResearchContext,
+} from './two-agent-research-loop'
+import type { SourceRecord } from './types'
+
+/** Default router model. Plain id (no namespace) — see CLAUDE/creds notes. */
+const DEFAULT_MODEL = 'glm-5.2'
+/** Default router base. */
+const DEFAULT_BASE_URL = 'https://router.tangle.tools/v1'
+/**
+ * glm-5.2 spends its first tokens on hidden `reasoning_content`; below ~1200
+ * output tokens it returns EMPTY visible content. Floor every call so a
+ * reasoning model never silently yields nothing. (Verified creds note.)
+ */
+const MIN_MAX_TOKENS = 1200
+
+/** One live web result, as the router's `/v1/search` returns it. */
+export interface WebSearchHit {
+  title: string
+  url: string
+  snippet?: string
+}
+
+/**
+ * The two router capabilities the worker/driver need. Injectable so tests can
+ * stub the network; the default talks to the live Tangle router over `fetch`.
+ */
+export interface RouterClient {
+  /** Live web search — returns title/url/snippet hits. */
+  search(query: string, opts?: { maxResults?: number }): Promise<WebSearchHit[]>
+  /** Chat completion — returns the assistant message's visible text. */
+  chat(
+    messages: { role: 'system' | 'user'; content: string }[],
+    maxTokens?: number,
+  ): Promise<string>
+}
+
+export interface TangleRouterOptions {
+  /** Router base URL. Defaults to `https://router.tangle.tools/v1`. */
+  baseUrl?: string
+  /** Bearer key. Defaults to `process.env.TANGLE_API_KEY`. */
+  apiKey?: string
+  /** Chat model id. Defaults to `glm-5.2`. */
+  model?: string
+  /** Optional preferred search provider (exa | you | perplexity | …). */
+  searchProvider?: string
+  signal?: AbortSignal
+}
+
+/** Error for a failed router call; carries the HTTP status so callers fail loud, not on junk. */
+export class RouterError extends Error {
+  public readonly status: number
+
+  constructor(status: number, message: string) {
+    super(`router ${status}: ${message}`)
+    this.status = status
+    this.name = 'RouterError'
+  }
+}
+
+/**
+ * Build a dependency-free Tangle router client over `fetch` (`/v1/search` for web
+ * search, `/v1/chat/completions` for chat — the `tcloud` wire surface, no CLI needed).
+ * Throws `RouterError` when no API key is available or the router answers non-2xx;
+ * a malformed success body degrades to no hits / empty text instead of a TypeError.
+ */
+export function createTangleRouterClient(options: TangleRouterOptions = {}): RouterClient {
+  const baseUrl = (options.baseUrl ?? DEFAULT_BASE_URL).replace(/\/+$/, '')
+  const apiKey = options.apiKey ?? process.env.TANGLE_API_KEY
+  if (!apiKey) {
+    throw new RouterError(401, 'no TANGLE_API_KEY (pass apiKey or set the env var)')
+  }
+  const model = options.model ?? DEFAULT_MODEL
+  const headers = {
+    'Content-Type': 'application/json',
+    Authorization: `Bearer ${apiKey}`,
+  }
+
+  return {
+    async search(query, opts) {
+      const res = await fetch(`${baseUrl}/search`, {
+        method: 'POST',
+        headers,
+        signal: options.signal,
+        body: JSON.stringify({
+          query,
+          ...(options.searchProvider ? { provider: options.searchProvider } : {}),
+          ...(opts?.maxResults != null ? { maxResults: opts.maxResults } : {}),
+        }),
+      })
+      if (!res.ok) {
+        throw new RouterError(res.status, await res.text().catch(() => res.statusText))
+      }
+      const { data } = ((await res.json()) ?? {}) as { data?: WebSearchHit[] }
+      return (Array.isArray(data) ? data : [])
+        .filter((hit) => typeof hit?.url === 'string' && hit.url.length > 0)
+        .map((hit) => ({ title: hit.title ?? hit.url, url: hit.url, snippet: hit.snippet }))
+    },
+    async chat(messages, maxTokens) {
+      // Reasoning-model floor: never let glm-5.2 spend the whole budget on
+      // hidden reasoning and return empty visible content.
+      const max_tokens = Math.max(MIN_MAX_TOKENS, maxTokens ?? MIN_MAX_TOKENS)
+      const res = await fetch(`${baseUrl}/chat/completions`, {
+        method: 'POST',
+        headers,
+        signal: options.signal,
+        body: JSON.stringify({ model, messages, max_tokens, temperature: 0.2, stream: false }),
+      })
+      if (!res.ok) {
+        throw new RouterError(res.status, await res.text().catch(() => res.statusText))
+      }
+      const body = (await res.json()) as { choices?: { message?: { content?: unknown } }[] } | null
+      const content = body?.choices?.[0]?.message?.content
+      // A null / non-string `content` (e.g. reasoning-only reply) is reported as empty text.
+      return typeof content === 'string' ? content : ''
+    },
+  }
+}
+
+export interface WebResearchWorkerOptions {
+  /** Router client. Defaults to a live Tangle router client from env creds. */
+  router?: RouterClient
+  router_options?: TangleRouterOptions
+  /** Max search queries the LLM may form per gap. Default 2. */
+  queriesPerGap?: number
+  /** Max web results fetched per query. Default 3. */
+  resultsPerQuery?: number
+  /** Hard cap on sources proposed per round (across all gaps). Default 6. */
+  maxSourcesPerRound?: number
+  /** Disk cache dir for `politeFetch`. Optional; speeds repeat runs. */
+  cacheDir?: string
+  /** Minimum readable text length to keep a fetched page. Default 200. */
+  minTextChars?: number
+  /** Max chars of page text stored per source (keeps pages bounded). Default 4000. */
+  maxTextChars?: number
+}
+
+/** Prefer the injected client; a live one (which reads env creds) is built only when none is given. */
+function resolveRouter(opts: {
+  router?: RouterClient
+  router_options?: TangleRouterOptions
+}): RouterClient {
+  return opts.router != null ? opts.router : createTangleRouterClient(opts.router_options)
+}
+
+/**
+ * The real web-research worker. Conforms to the loop's `ResearchWorker`
+ * contract: given the open gaps, it returns the sources it found plus a
+ * `buildPages` that emits citing pages from the sources the driver accepted.
+ */
+export function createWebResearchWorker(options: WebResearchWorkerOptions = {}): ResearchWorker {
+  const queriesPerGap = Math.max(1, options.queriesPerGap ?? 2)
+  const resultsPerQuery = Math.max(1, options.resultsPerQuery ?? 3)
+  const maxSourcesPerRound = Math.max(1, options.maxSourcesPerRound ?? 6)
+  const minTextChars = Math.max(1, options.minTextChars ?? 200)
+  const maxTextChars = Math.max(minTextChars, options.maxTextChars ?? 4000)
+
+  return async (ctx: WorkerResearchContext): Promise<ResearchContribution> => {
+    const router = resolveRouter(options)
+    // Target the BLOCKING gaps first; fall back to all gaps if none are blocking.
+    const targetGaps = ctx.gaps.filter((gap) => gap.blocking)
+    const gaps = targetGaps.length > 0 ? targetGaps : ctx.gaps
+    if (gaps.length === 0) {
+      return { sources: [], notes: 'no open gaps to research' }
+    }
+
+    const queries = await formSearchQueries(router, ctx, gaps, queriesPerGap)
+    const proposals: ResearchSourceProposal[] = []
+    const seenUris = new Set<string>()
+
+    for (const query of queries) {
+      if (proposals.length >= maxSourcesPerRound) break
+      if (ctx.signal?.aborted) break
+      let hits: WebSearchHit[]
+      try {
+        hits = await router.search(query, { maxResults: resultsPerQuery })
+      } catch (error) {
+        // A single failed query must not sink the round — record nothing, move on.
+        if ((error as { name?: string }).name === 'AbortError') break
+        continue
+      }
+      for (const hit of hits) {
+        if (proposals.length >= maxSourcesPerRound || ctx.signal?.aborted) break
+        if (seenUris.has(hit.url)) continue
+        seenUris.add(hit.url) // claim first: an unreadable page is fetched once per round
+        const fetched = await politeFetch(hit.url, {
+          signal: ctx.signal,
+          cacheDir: options.cacheDir,
+        })
+        if (!fetched.verifiable) continue
+        const text = htmlToText(fetched.body).slice(0, maxTextChars)
+        if (text.length < minTextChars) continue
+        proposals.push({
+          uri: hit.url,
+          title: hit.title || hit.url,
+          text,
+          // We just fetched + verified this page, so stamp `lastVerifiedAt` with
+          // fetch time. Do NOT set `validUntil`: a live page has no inherent
+          // future expiry, and the readiness freshness check treats any
+          // `validUntil <= now` as EXPIRED (score 0). The page's `Last-Modified`
+          // is a PAST date, so writing it here would mark every real source stale
+          // and zero out coverage. Record it as provenance metadata instead.
+          lastVerifiedAt: fetched.fetchedAt,
+          metadata: {
+            discoveredVia: query,
+            snippet: hit.snippet ?? '',
+            goal: ctx.goal,
+            sourceUpdatedAt: fetched.sourceUpdatedAt,
+          },
+        })
+      }
+    }
+
+    return {
+      sources: proposals,
+      buildPages: buildCitingPages(proposals),
+      notes: `web-research worker: ${queries.length} queries → ${proposals.length} fetched sources`,
+    }
+  }
+}
+
+/**
+ * Turn the open gaps into at most `gaps.length * queriesPerGap` de-duplicated web
+ * search queries via the LLM. If the chat call fails or yields nothing parseable,
+ * fall back to each gap's own readiness query so the round still runs a real search.
+ */
+async function formSearchQueries(
+  router: RouterClient,
+  ctx: WorkerResearchContext,
+  gaps: KnowledgeGap[],
+  queriesPerGap: number,
+): Promise<string[]> {
+  const gapLines = gaps
+    .map((gap, i) => `${i + 1}. ${gap.description} (readiness query: "${gap.query}")`)
+    .join('\n')
+  const want = gaps.length * queriesPerGap
+  const system =
+    'You are a research librarian. Turn knowledge gaps into precise web search queries that will ' +
+    'surface authoritative primary sources (papers, docs, standards, official pages). ' +
+    'Return ONLY a JSON array of query strings, no prose.'
+  const user = [
+    `Research goal: ${ctx.goal}`,
+    ctx.steer ? `Steer from the coordinator:\n${ctx.steer}` : '',
+    `Open knowledge gaps:\n${gapLines}`,
+    `Return up to ${want} search query strings as a JSON array (e.g. ["query one","query two"]).`,
+  ]
+    .filter(Boolean)
+    .join('\n\n')
+
+  let raw = ''
+  try {
+    raw = await router.chat(
+      [
+        { role: 'system', content: system },
+        { role: 'user', content: user },
+      ],
+      MIN_MAX_TOKENS,
+    )
+  } catch {
+    raw = ''
+  }
+  // Dedupe BEFORE truncating so repeated queries don't consume the `want` budget.
+  const fromLlm = dedupeStrings(parseQueryList(raw)).slice(0, want)
+  if (fromLlm.length > 0) return fromLlm
+  // Degrade to the readiness queries themselves — still a real search.
+  return dedupeStrings(gaps.map((gap) => gap.query || gap.description))
+}
+
+/** Parse a JSON array of query strings, tolerant of code fences / surrounding prose. */
+function parseQueryList(raw: string): string[] {
+  const text = raw.trim()
+  if (!text) return []
+  const candidates: string[] = []
+  // Prefer a fenced or bare JSON array.
+  const arrayMatch = text.match(/\[[\s\S]*\]/)
+  if (arrayMatch) {
+    try {
+      const arr = JSON.parse(arrayMatch[0]) as unknown[]
+      for (const item of arr)
+        if (typeof item === 'string' && item.trim()) candidates.push(item.trim())
+    } catch {
+      /* fall through to line parsing */
+    }
+  }
+  if (candidates.length === 0) {
+    for (const line of text.split('\n')) {
+      // Trim + drop a trailing comma BEFORE unquoting so indented `  "query",` rows are cleaned.
+      const bare = line.trim().replace(/^(?:[-*]|\d+[.)])\s*/, '').replace(/,$/, '')
+      const cleaned = bare.replace(/^["']|["']$/g, '').trim()
+      const noise = cleaned.length <= 2 || cleaned.startsWith('{') || cleaned.startsWith('```')
+      if (!noise) candidates.push(cleaned)
+    }
+  }
+  return candidates
+}
+
+function dedupeStrings(values: string[]): string[] {
+  // Case-insensitive, whitespace-trimmed de-duplication that keeps the first
+  // spelling of each value and preserves input order; blank values are dropped.
+  const firstByKey = new Map<string, string>()
+  for (const raw of values) {
+    const trimmed = raw.trim()
+    const key = trimmed.toLowerCase()
+    if (key.length === 0 || firstByKey.has(key)) continue
+    firstByKey.set(key, trimmed)
+  }
+  return Array.from(firstByKey.values())
+}
+
+/**
+ * Returns the `buildPages` callback: one `knowledge/<slug>.md` block per accepted
+ * source, citing it by `record.id`; proposals are matched via `metadata.originalUri`.
+ * NOTE(review): `title` is emitted as a plain frontmatter scalar — confirm the
+ * frontmatter parser tolerates titles containing `: ` or a leading quote.
+ */
+function buildCitingPages(
+  proposals: ResearchSourceProposal[],
+): (acceptedSources: SourceRecord[]) => string | undefined {
+  // URL → file-name slug: scheme dropped, non-alphanumeric runs → '-', max 120 chars.
+  const slugify = (uri: string): string =>
+    uri
+      .replace(/^https?:\/\//, '')
+      .replace(/[^a-z0-9]+/gi, '-')
+      .replace(/^-+|-+$/g, '')
+      .slice(0, 120)
+  const renderPage = (record: SourceRecord): string => {
+    const match = proposals.find((candidate) => candidate.uri === record.metadata?.originalUri)
+    const uri = match?.uri ?? record.id
+    const title = match?.title ?? record.id
+    const lines = [
+      `---FILE: knowledge/${slugify(uri) || record.id}.md---`,
+      '---',
+      `title: ${escapeYaml(title)}`,
+      `sources: ["${record.id}"]`,
+      `source_url: ${uri}`,
+      '---',
+      `# ${title}`,
+      '',
+      match?.text ?? '',
+      '',
+      `Source: ${uri}`,
+      '---END FILE---',
+    ]
+    return lines.join('\n')
+  }
+  return (acceptedSources) =>
+    acceptedSources.length === 0 ? undefined : acceptedSources.map(renderPage).join('\n')
+}
+
+function escapeYaml(value: string): string {
+  // Frontmatter values must stay on one line: every CR/LF run collapses to a
+  // single space, double quotes become single quotes, and the ends are
+  // trimmed.
+  const singleLine = value.replace(/[\r\n]+/g, ' ')
+  const requoted = singleLine.replace(/"/g, "'")
+  return requoted.trim()
+}
+
+export interface VerifyingDriverOptions {
+  router?: RouterClient
+  router_options?: TangleRouterOptions
+  /**
+   * When the LLM verdict can't be parsed, default to REJECT (fail-closed) so a
+   * model hiccup never poisons the KB with an unverified source. Set `true` to
+   * accept-on-parse-failure only if you have a reason to. Default false.
+   */
+  acceptOnParseFailure?: boolean
+}
+
+/**
+ * The verifying driver: a real LLM pass that judges each candidate source's
+ * on-topic relevance to the goal + open gaps and whether it duplicates material
+ * already accepted this round. This is the differentiated coordinator role — it
+ * GATES the worker's additions; it adds nothing itself.
+ *
+ * The loop already dedups exact-uri duplicates and only calls this on genuinely
+ * new candidates, so the verifier focuses on relevance + near-duplicate
+ * judgement, not bookkeeping.
+ */
+export function createVerifyingResearchDriver(
+  options: VerifyingDriverOptions = {},
+): ResearchDriver {
+  const acceptOnParseFailure = options.acceptOnParseFailure ?? false
+  return {
+    async verifySource(
+      source: ResearchSourceProposal,
+      ctx: SourceVerificationContext,
+    ): Promise<SourceVerdict> {
+      const router = resolveRouter(options)
+      const gapLines = ctx.gaps
+        .map((gap) => `- ${gap.description} (query: "${gap.query}")`)
+        .join('\n')
+      const acceptedTitles = ctx.acceptedThisRound
+        .map((accepted) => `- ${accepted.title ?? accepted.uri}`)
+        .join('\n')
+      const excerpt = source.text.slice(0, 1500)
+      const system =
+        'You verify whether a fetched web source belongs in a curated knowledge base. ' +
+        'Accept a source ONLY if it is genuinely on-topic for the research goal and helps close ' +
+        'one of the open gaps, AND it is not a near-duplicate of an already-accepted source. ' +
+        'Reject spam, listicles, off-topic pages, marketing, and near-duplicates. ' +
+        'Respond with ONLY a JSON object: {"accept": true|false, "reason": "<short reason>"}.'
+      const user = [
+        `Research goal: ${ctx.goal}`,
+        `Open gaps:\n${gapLines || '(none specified)'}`,
+        acceptedTitles
+          ? `Already accepted this round:\n${acceptedTitles}`
+          : 'Nothing accepted yet this round.',
+        `Candidate source:\nURL: ${source.uri}\nTitle: ${source.title ?? '(none)'}\nExcerpt:\n${excerpt}`,
+        'Verdict as JSON {"accept": boolean, "reason": string}:',
+      ].join('\n\n')
+
+      let raw = ''
+      try {
+        raw = await router.chat(
+          [
+            { role: 'system', content: system },
+            { role: 'user', content: user },
+          ],
+          MIN_MAX_TOKENS,
+        )
+      } catch (error) {
+        if ((error as { name?: string }).name === 'AbortError') throw error
+        // Router failure: rejected by default; `acceptOnParseFailure` also opts in to accepting here.
+        return acceptOnParseFailure
+          ? { accept: true }
+          : { accept: false, reason: `verifier unavailable: ${(error as Error).message}` }
+      }
+
+      const verdict = parseVerdict(raw)
+      if (verdict) return verdict
+      return acceptOnParseFailure
+        ? { accept: true }
+        : { accept: false, reason: 'verifier returned an unparseable verdict' }
+    },
+  }
+}
+
+/** Parse the verifier's `{accept, reason}` reply; `null` means "no usable verdict". */
+function parseVerdict(raw: string): SourceVerdict | null {
+  // The first `{` through the last `}` is taken as the JSON payload, which skips
+  // code fences and any prose around it.
+  const jsonSpan = raw.trim().match(/\{[\s\S]*\}/)?.[0]
+  if (jsonSpan === undefined) return null
+
+  let decoded: { accept?: unknown; reason?: unknown }
+  try {
+    decoded = JSON.parse(jsonSpan) as { accept?: unknown; reason?: unknown }
+  } catch {
+    return null
+  }
+
+  // `accept` must be a real boolean; anything else counts as unparseable.
+  if (typeof decoded.accept !== 'boolean') return null
+  if (decoded.accept) return { accept: true }
+  // A rejection always carries a reason: the model's (trimmed) or a generic one.
+  const stated = typeof decoded.reason === 'string' ? decoded.reason.trim() : ''
+  return { accept: false, reason: stated || 'rejected by verifier' }
+}
diff --git a/tests/loops/research-loop-equal-compute.test.ts b/tests/loops/research-loop-equal-compute.test.ts
index 74ee214..8ae6544 100644
--- a/tests/loops/research-loop-equal-compute.test.ts
+++ b/tests/loops/research-loop-equal-compute.test.ts
@@ -31,6 +31,12 @@ import {
   type SourceVerdict,
   type WorkerResearchContext,
 } from '../../src/two-agent-research-loop'
+import {
+  createTangleRouterClient,
+  createVerifyingResearchDriver,
+  createWebResearchWorker,
+  type RouterClient,
+} from '../../src/web-research-worker'
 
 // ===========================================================================
 // THE VALUE A/B: does the verifying-driver (two-agent) loop build a CLEANER
@@ -289,17 +295,33 @@ async function junkAdmitted(root: string): Promise<number> {
  * the loops gate: build the readiness report over the current index, count how
  * many blocking specs are NOT in `blockingMissingRequirements`.
  */
-async function coverage(root: string, goal: string): Promise<number> {
+async function coverage(
+  root: string,
+  goal: string,
+  specs: KnowledgeReadinessSpec[] = blockingSpecs,
+): Promise<number> {
   const index = await buildKnowledgeIndex(root)
   // `buildEvalKnowledgeBundle` searches each spec over the index's PAGES and
   // runs the substrate `scoreKnowledgeReadiness` — the SAME scorer both loops
   // gate on. Coverage = fraction of blocking specs NOT in the missing set.
-  const { report } = buildEvalKnowledgeBundle({ taskId: goal, index, specs: blockingSpecs })
-  const blockingCount = blockingSpecs.filter((s) => s.importance === 'blocking').length
+  const { report } = buildEvalKnowledgeBundle({ taskId: goal, index, specs })
+  const blockingCount = specs.filter((s) => s.importance === 'blocking').length
   const missing = report.blockingMissingRequirements.length
   return blockingCount === 0 ? 1 : (blockingCount - missing) / blockingCount
 }
 
+/**
+ * Total sources that reached the KB. For REAL research there is no `junk/`
+ * prefix oracle — the live signal is that the verifying driver admits FEWER
+ * sources because it rejects the off-topic ones the single-agent loop keeps. So
+ * the live A/B compares admitted-source COUNTS (cleaner = fewer admitted at
+ * equal-or-higher coverage), the real-world analogue of the offline junk count.
+ */
+async function admittedSourceCount(root: string): Promise<number> {
+  const { sources } = await buildKnowledgeIndex(root)
+  return sources.length
+}
+
 /**
  * The source-record id `addSourceText` will assign — deterministic from the
  * text+uri (see src/sources.ts: `stableId('src', `${sha256(text)}:${uri}`)`). We
@@ -321,17 +343,38 @@ function predictedSourceId(source: ResearchSourceProposal): string {
  * uses (so both arms converge by the same criterion, not at unequal effort).
  * Returns the actual passes spent so the test can compare compute.
  */
+/**
+ * A round of primary research, abstracted so the SAME arm-runners drive both the
+ * offline naive proposer AND a real web-research worker. Given the goal + open
+ * gaps + optional steer, return the sources found this pass. The offline default
+ * is `makeNaiveProposals` over the planted pool; the live arm injects the real
+ * `createWebResearchWorker`.
+ */
+type ProposeSources = (
+  ctx: {
+    goal: string
+    gaps: { description: string; query: string }[]
+    steer?: string
+    root: string
+  },
+  onPass: () => void,
+) => Promise<ResearchSourceProposal[]>
+
+const offlinePropose: ProposeSources = (ctx, onPass) => makeNaiveProposals(ctx, onPass)
+
 async function runSingleAgentArm(
   root: string,
   goal: string,
   maxIterations: number,
+  propose: ProposeSources = offlinePropose,
+  specs: KnowledgeReadinessSpec[] = blockingSpecs,
 ): Promise<{ passes: number }> {
   let passes = 0
   await runKnowledgeResearchLoop({
     root,
     goal,
     maxIterations,
-    readinessSpecs: blockingSpecs,
+    readinessSpecs: specs,
     async step(context): Promise<KnowledgeResearchLoopDecision> {
       passes += 1
       const report = context.readiness?.report
@@ -348,7 +391,7 @@ async function runSingleAgentArm(
             ? (req.metadata.query as string)
             : req.description,
       }))
-      const proposals = await makeNaiveProposals({ goal, gaps }, () => {})
+      const proposals = await propose({ goal, gaps, root }, () => {})
       if (proposals.length === 0) return { notes: 'no new proposals' }
       // Register sources AND write citing pages in one step — exactly like the
       // two-agent worker (sources + buildPages), but with NO driver verification
@@ -370,16 +413,32 @@ async function runTwoAgentArm(
   root: string,
   goal: string,
   rounds: number,
+  arm?: { worker: ResearchWorker; driver: ResearchDriver },
+  specs: KnowledgeReadinessSpec[] = blockingSpecs,
 ): Promise<{ passes: number }> {
   let workerPasses = 0
+  // Default arm = the offline naive proposer + prefix-check verifier. The live
+  // arm injects the real web-research worker + the LLM verifying driver. Either
+  // way the worker pass is counted once per invocation via a thin wrapper, so
+  // the equal-compute accounting below holds for both.
+  const worker: ResearchWorker =
+    arm?.worker ??
+    twoAgentWorker(() => {
+      workerPasses += 1
+    })
+  const countedWorker: ResearchWorker = arm
+    ? async (ctx) => {
+        workerPasses += 1
+        return worker(ctx)
+      }
+    : worker
+  const driver: ResearchDriver = arm?.driver ?? verifyingDriver()
   await runTwoAgentResearchLoop({
     root,
     goal,
-    worker: twoAgentWorker(() => {
-      workerPasses += 1
-    }),
-    driver: verifyingDriver(),
-    readinessSpecs: blockingSpecs,
+    worker: countedWorker,
+    driver,
+    readinessSpecs: specs,
     maxRounds: rounds,
   })
   // EQUAL-COMPUTE ACCOUNTING: every round that ran a worker pass also ran one
@@ -475,57 +534,195 @@ describe('research loop A/B at equal compute (offline, controlled lower bound)',
 
 // ===========================================================================
 // LIVE A/B — the real evidence. Skipped offline (no creds), exactly like
-// tests/sources-live.test.ts. Runs BOTH loops on a real research goal with a
-// real worker at equal compute over N goals, and reports a PAIRED comparison
-// of junk-admitted and coverage via agent-eval's pairedBootstrap (no hand-
-// rolled significance). What this adds over the offline arm: the worker's junk
-// is NOT prefix-detectable and the driver's verifier is a real LLM, so a win
-// here is evidence the verifying driver cleans REAL research, not a planted
-// floor.
+// tests/sources-live.test.ts. Runs BOTH loops on a REAL research goal with the
+// REAL web-research worker (glm-5.2 query-gen → live `/v1/search` → politeFetch
+// → htmlToText) and a REAL LLM verifying driver (glm-5.2 on-topic judgement) at
+// equal compute, then reports a PAIRED comparison via agent-eval's
+// pairedBootstrap (no hand-rolled significance).
+//
+// What this adds over the offline arm: there is NO planted `junk/` pool and NO
+// prefix-check verifier — the worker fetches whatever the web returns and the
+// driver is an LLM. The live cleanliness signal is therefore admitted-source
+// COUNT: if the verifying driver rejects off-topic fetches, the two-agent KB
+// should admit FEWER sources at equal-or-higher coverage. A win here is
+// evidence the verifying driver cleans REAL research, not a planted floor.
+//
+// Gate: `AGENT_KNOWLEDGE_LIVE=1` + a TANGLE_API_KEY with credits for the chat model.
+//   AGENT_KNOWLEDGE_LIVE_GOALS  — `|`-separated goals (default: self-speculative decoding)
+//   AGENT_KNOWLEDGE_LIVE_BUDGET — agent-pass ceiling B per arm (default 4)
+//   AGENT_KNOWLEDGE_LIVE_MODEL  — router chat model (default glm-5.2)
 // ===========================================================================
+
+/**
+ * Builds the two blocking readiness specs (`topic/definition`, `topic/results`)
+ * for one live goal. `liveGoal` is interpolated into every description and
+ * query, so the gaps are phrased for a real web search and coverage scores
+ * against fetched pages. The live test calls this once per goal it runs.
+ */
+function liveSpecsForGoal(liveGoal: string): KnowledgeReadinessSpec[] {
+  return [
+    defineReadinessSpec({
+      id: 'topic/definition',
+      description: `what ${liveGoal} is and how it works`,
+      query: `${liveGoal} how it works method`,
+      requiredFor: ['ResearchAgent'],
+      importance: 'blocking',
+      minSources: 1,
+      minHits: 1,
+    }),
+    defineReadinessSpec({
+      id: 'topic/results',
+      description: `reported results, speedups, or trade-offs for ${liveGoal}`,
+      query: `${liveGoal} speedup results benchmark`, // NOTE(review): wording presumes a performance topic — confirm
+      requiredFor: ['ResearchAgent'],
+      importance: 'blocking',
+      minSources: 1,
+      minHits: 1,
+    }),
+  ]
+}
+
 describe.skipIf(!process.env.AGENT_KNOWLEDGE_LIVE)(
   'live: research loop A/B at equal compute',
   () => {
-    it('two-agent vs single-agent over N goals — paired comparison of junk-admitted', async () => {
-      // The set of real research goals to A/B over. A live harness would swap
-      // the offline naive proposer for a real researcher backend and the
-      // prefix-check verifier for an LLM verifySource. Both arms still run at
-      // the same agent-pass budget B; only the worker/verifier change.
-      const goals = (process.env.AGENT_KNOWLEDGE_LIVE_GOALS ?? goal).split('|').map((g) => g.trim())
-      const budgetPasses = Number(process.env.AGENT_KNOWLEDGE_LIVE_BUDGET ?? 6)
-
-      const twoAgentJunkByGoal: number[] = []
-      const singleAgentJunkByGoal: number[] = []
+    it('two-agent (real worker + LLM verifier) vs single-agent — paired comparison', async () => {
+      const goals = (process.env.AGENT_KNOWLEDGE_LIVE_GOALS ?? 'self-speculative decoding')
+        .split('|')
+        .map((g) => g.trim())
+        .filter(Boolean)
+      const budgetPasses = Number(process.env.AGENT_KNOWLEDGE_LIVE_BUDGET ?? 4)
+      const model = process.env.AGENT_KNOWLEDGE_LIVE_MODEL ?? 'glm-5.2'
+
+      // ONE shared router client for the whole run (web search + chat).
+      const router: RouterClient = createTangleRouterClient({ model })
+
+      // COST GATE: one cheap chat call on `model` BEFORE the multi-arm burn. Proves
+      // the key works + the reasoning-token floor returns visible content. If this
+      // returns empty or throws, the full A/B can't produce real numbers — fail
+      // fast instead of spending the whole budget to discover it.
+      const smoke = await router.chat(
+        [
+          { role: 'system', content: 'Reply with exactly the word: OK' },
+          { role: 'user', content: 'Say OK.' },
+        ],
+        1200,
+      )
+      console.log(`[LIVE smoke] ${model} visible content length=${smoke.trim().length}`)
+      expect(smoke.trim().length).toBeGreaterThan(0)
+
+      const realWorker = createWebResearchWorker({
+        router,
+        resultsPerQuery: 3,
+        queriesPerGap: 1,
+        maxSourcesPerRound: 6,
+      })
+      const realDriver = createVerifyingResearchDriver({ router })
+
+      const twoAdmittedByGoal: number[] = []
+      const singleAdmittedByGoal: number[] = []
+      let anySourceFetched = false
+
       for (const liveGoal of goals) {
+        const specs = liveSpecsForGoal(liveGoal)
         const twoRoot = await mkdtemp(join(tmpdir(), 'live-two-'))
         const singleRoot = await mkdtemp(join(tmpdir(), 'live-single-'))
         try {
-          await runTwoAgentArm(twoRoot, liveGoal, budgetPasses / 2)
-          await runSingleAgentArm(singleRoot, liveGoal, budgetPasses)
-          twoAgentJunkByGoal.push(await junkAdmitted(twoRoot))
-          singleAgentJunkByGoal.push(await junkAdmitted(singleRoot))
+          // TWO-AGENT arm: real worker proposes, real LLM driver verifies.
+          const two = await runTwoAgentArm(
+            twoRoot,
+            liveGoal,
+            Math.floor(budgetPasses / 2), // whole rounds only: an odd B must not overshoot the ceiling
+            { worker: realWorker, driver: realDriver },
+            specs,
+          )
+          // SINGLE-AGENT arm: the SAME real worker, NO verifier gate, more iters
+          // to spend the same agent-pass budget the two-agent loop burns on
+          // verification.
+          const single = await runSingleAgentArm(
+            singleRoot,
+            liveGoal,
+            budgetPasses,
+            (ctx, onPass) => realWorkerPropose(realWorker, ctx, onPass),
+            specs,
+          )
+
+          const twoAdmitted = await admittedSourceCount(twoRoot)
+          const singleAdmitted = await admittedSourceCount(singleRoot)
+          const twoCoverage = await coverage(twoRoot, liveGoal, specs)
+          const singleCoverage = await coverage(singleRoot, liveGoal, specs)
+          if (twoAdmitted > 0 || singleAdmitted > 0) anySourceFetched = true
+
+          twoAdmittedByGoal.push(twoAdmitted)
+          singleAdmittedByGoal.push(singleAdmitted)
+
+          console.log(
+            `[LIVE A/B ${JSON.stringify(liveGoal)} @ B<=${budgetPasses}] ` +
+              `two-agent: passes=${two.passes} admitted=${twoAdmitted} coverage=${twoCoverage.toFixed(2)} | ` +
+              `single-agent: passes=${single.passes} admitted=${singleAdmitted} coverage=${singleCoverage.toFixed(2)}`,
+          )
         } finally {
           await rm(twoRoot, { recursive: true, force: true })
           await rm(singleRoot, { recursive: true, force: true })
         }
       }
 
-      // Paired bootstrap on (single − two) junk deltas: a POSITIVE delta means
-      // the single-agent loop admitted MORE junk, i.e. the two-agent loop is
-      // cleaner. `low > 0` is the gate — the cleanliness gain is real at the
-      // confidence level, not luck. Do NOT hand-roll this; reuse the substrate.
-      const result = pairedBootstrap(twoAgentJunkByGoal, singleAgentJunkByGoal, {
+      // The live arm is only evidence if the worker actually web-searched and
+      // fetched real pages. Zero sources across both arms = the worker never
+      // reached the web (creds/network) — that is a FALSE null, fail loud.
+      expect(anySourceFetched).toBe(true)
+
+      // Paired bootstrap on (single − two) admitted-source deltas: a POSITIVE
+      // delta means the single-agent loop admitted MORE sources, i.e. the
+      // verifying driver kept the KB cleaner. `low > 0` is the significance
+      // gate. Reuse the substrate; do NOT hand-roll significance.
+      const result = pairedBootstrap(twoAdmittedByGoal, singleAdmittedByGoal, {
         statistic: 'mean',
         seed: 1,
       })
       console.log(
-        `[LIVE A/B] n=${result.n} mean(single-two junk)=${result.mean.toFixed(3)} ` +
+        `[LIVE A/B] n=${result.n} mean(single-two admitted)=${result.mean.toFixed(3)} ` +
           `CI=[${result.low.toFixed(3)}, ${result.high.toFixed(3)}] — ` +
           `two-agent cleaner iff low > 0`,
       )
-      // At equal compute, the two-agent loop should admit no more junk on
-      // average; the bootstrap lower bound says whether that is significant.
+      // At equal compute the verifying driver should admit no MORE sources than
+      // the ungated single-agent loop on average; the bootstrap lower bound says
+      // whether the cleanliness gain is significant.
       expect(result.mean).toBeGreaterThanOrEqual(0)
-    }, 120_000)
+    }, 600_000)
   },
 )
+
+/**
+ * Adapts a `ResearchWorker` to the single-agent proposer shape: charges one pass
+ * via `onPass`, indexes `ctx.root`, calls the worker with every gap marked
+ * blocking, and resolves to `contribution.sources` (`[]` when absent).
+ */
+async function realWorkerPropose(
+  worker: ResearchWorker,
+  ctx: {
+    goal: string
+    gaps: { description: string; query: string }[]
+    root: string
+    steer?: string
+  },
+  onPass: () => void,
+): Promise<ResearchSourceProposal[]> {
+  onPass() // counted up front, so a pass is charged even if the worker later rejects
+  const index = await buildKnowledgeIndex(ctx.root)
+  const readiness = buildEvalKnowledgeBundle({ taskId: ctx.goal, index, specs: [] }) // NOTE(review): specs is empty here — confirm
+  const contribution = await worker({
+    root: ctx.root,
+    goal: ctx.goal,
+    round: 1, // NOTE(review): fixed at 1 on every call — confirm the worker does not depend on round
+    index,
+    gaps: ctx.gaps.map((gap) => ({
+      id: gap.description, // the description doubles as the gap id
+      description: gap.description,
+      query: gap.query,
+      blocking: true,
+    })),
+    steer: ctx.steer,
+    readiness,
+  })
+  return contribution.sources ?? []
+}