From 45688e9df41ed0473ae6ccaddcfa075608abd9a1 Mon Sep 17 00:00:00 2001 From: Rhys Sullivan Date: Thu, 25 Jun 2026 00:26:23 -0700 Subject: [PATCH 1/4] Add transparent MCP connection mode (?codemode=false) By default the MCP server runs in code mode: it advertises a single `execute` tool and the model writes TypeScript that calls `tools.search`, `tools.describe.tool`, and the connection tools. That keeps the tool list tiny, but clients that do lazy tool loading expect every tool enumerated directly so they can fetch schemas on demand. This adds a `?codemode=false` query parameter that switches the session into transparent mode: instead of `execute`, the server lists every directly-callable tool (connection tools plus the static core and plugin tools) with its own input schema, and routes `tools/call` straight to a single-tool invoke. Code mode stays the default. Threading: - New engine seams `listTools` / `invokeTool` / `invokeToolWithPause` alongside the existing code-execution methods, carried through every host (cloud, cloudflare, local, self-host) and the usage decorator. - The MCP host registers low-level `ListTools` / `CallTools` handlers in transparent mode and keeps the high-level `registerTool` path for code mode; the session reads the flag off the connection URL. A normalizer stamps `type: "object"` onto any advertised input schema whose root lacks one (a union-root tool such as add-server otherwise compiles to `anyOf` with no top-level type, which makes the MCP client reject the whole tools/list response). Covered by a cross-target e2e scenario that seeds an OpenAPI connection, opens a transparent session, asserts the tools are dumped directly, and makes a verifiable direct core-tool call. Green on self-host and on the workerd Durable Object path. 
--- apps/cloud/src/api/protected.test.ts | 7 + apps/cloud/src/engine/execution-usage.ts | 11 + apps/cloud/src/mcp/session-durable-object.ts | 2 + .../src/mcp/session-durable-object.ts | 2 + apps/local/src/mcp.ts | 2 + e2e/scenarios/mcp-codemode-off.test.ts | 214 +++++++++++ e2e/src/surfaces/mcp.ts | 26 +- e2e/vitest.config.ts | 1 + packages/core/execution/src/engine.ts | 280 +++++++++++---- packages/core/execution/src/index.ts | 1 + packages/core/execution/src/tool-invoker.ts | 2 +- packages/core/sdk/src/executor.ts | 72 +++- packages/core/sdk/src/tool.ts | 5 + .../hosts/cloudflare/src/mcp/do-headers.ts | 8 +- packages/hosts/cloudflare/src/mcp/seams.ts | 3 + .../src/mcp/session-durable-object.ts | 3 + .../hosts/cloudflare/src/mcp/session-store.ts | 2 + packages/hosts/mcp/src/browser-approval.ts | 12 + .../hosts/mcp/src/in-memory-session-store.ts | 13 +- packages/hosts/mcp/src/tool-server.test.ts | 8 + packages/hosts/mcp/src/tool-server.ts | 336 +++++++++++++++--- 21 files changed, 859 insertions(+), 151 deletions(-) create mode 100644 e2e/scenarios/mcp-codemode-off.test.ts diff --git a/apps/cloud/src/api/protected.test.ts b/apps/cloud/src/api/protected.test.ts index 5a1a02283..0f926fd16 100644 --- a/apps/cloud/src/api/protected.test.ts +++ b/apps/cloud/src/api/protected.test.ts @@ -18,6 +18,13 @@ const makeBaseEngine = (): ExecutionEngine => }), getPausedExecution: () => Effect.succeed(null), getDescription: Effect.succeed("desc"), + listTools: Effect.succeed([]), + invokeTool: () => Effect.succeed({ result: "ok", logs: [] }), + invokeToolWithPause: () => + Effect.succeed({ + status: "completed", + result: { result: "ok", logs: [] }, + }), }) as ExecutionEngine; describe("withExecutionUsageTracking", () => { diff --git a/apps/cloud/src/engine/execution-usage.ts b/apps/cloud/src/engine/execution-usage.ts index cd4489984..2e9af2273 100644 --- a/apps/cloud/src/engine/execution-usage.ts +++ b/apps/cloud/src/engine/execution-usage.ts @@ -20,4 +20,15 @@ export const 
withExecutionUsageTracking = ( resume: (executionId, response) => engine.resume(executionId, response), getPausedExecution: (executionId) => engine.getPausedExecution(executionId), getDescription: engine.getDescription, + // listTools is discovery, not an execution, so it doesn't count as usage. + listTools: engine.listTools, + // A direct tool invocation is an execution, so it counts the same as execute. + invokeTool: (name, args, options) => + engine + .invokeTool(name, args, options) + .pipe(Effect.tap(() => Effect.sync(() => trackUsage(organizationId)))), + invokeToolWithPause: (name, args) => + engine + .invokeToolWithPause(name, args) + .pipe(Effect.tap(() => Effect.sync(() => trackUsage(organizationId)))), }); diff --git a/apps/cloud/src/mcp/session-durable-object.ts b/apps/cloud/src/mcp/session-durable-object.ts index 5513ddb90..b7e6f3f90 100644 --- a/apps/cloud/src/mcp/session-durable-object.ts +++ b/apps/cloud/src/mcp/session-durable-object.ts @@ -173,6 +173,7 @@ export class McpSessionDO extends McpSessionDOBase { organizationSlug: org.slug, userId: token.userId, elicitationMode: token.elicitationMode, + codeMode: token.codeMode, } satisfies SessionMeta; }).pipe( Effect.withSpan("McpSessionDO.resolveSessionMeta"), @@ -208,6 +209,7 @@ export class McpSessionDO extends McpSessionDOBase { parentSpan: () => self.currentParentSpan(), debug: env.EXECUTOR_MCP_DEBUG === "true", browserApprovalStore: self.browserApprovalStore, + codeMode: sessionMeta.codeMode, elicitationMode: sessionElicitationMode === "browser" ? 
{ diff --git a/apps/host-cloudflare/src/mcp/session-durable-object.ts b/apps/host-cloudflare/src/mcp/session-durable-object.ts index 26caac4df..8c36474cf 100644 --- a/apps/host-cloudflare/src/mcp/session-durable-object.ts +++ b/apps/host-cloudflare/src/mcp/session-durable-object.ts @@ -62,6 +62,7 @@ export class McpSessionDO extends McpSessionDOBase { organizationSlug: this.cfConfig.organizationSlug, userId: token.userId, elicitationMode: token.elicitationMode, + codeMode: token.codeMode, } satisfies SessionMeta); } @@ -88,6 +89,7 @@ export class McpSessionDO extends McpSessionDOBase { const mcpServer = yield* createExecutorMcpServer({ engine, browserApprovalStore: self.browserApprovalStore, + codeMode: sessionMeta.codeMode, elicitationMode: elicitationMode === "browser" ? { diff --git a/apps/local/src/mcp.ts b/apps/local/src/mcp.ts index b26114a5f..4f1f02999 100644 --- a/apps/local/src/mcp.ts +++ b/apps/local/src/mcp.ts @@ -12,6 +12,7 @@ import { approvalUrlForRequest, decodeResumeResponse, formatResumeAcknowledgement, + readCodeMode, readElicitationMode, } from "@executor-js/host-mcp/browser-approval"; import { makeInProcessBrowserApprovalStore } from "@executor-js/host-mcp/browser-approval-store"; @@ -144,6 +145,7 @@ export const createMcpRequestHandler = (config: ExecutorMcpServerConfig): McpReq createExecutorMcpServer({ ...config, browserApprovalStore: approvals.store, + codeMode: readCodeMode(request), elicitationMode: elicitationMode === "browser" ? { diff --git a/e2e/scenarios/mcp-codemode-off.test.ts b/e2e/scenarios/mcp-codemode-off.test.ts new file mode 100644 index 000000000..2707516c7 --- /dev/null +++ b/e2e/scenarios/mcp-codemode-off.test.ts @@ -0,0 +1,214 @@ +// Transparent connection mode (`?codemode=false`). 
By default an Executor MCP +// session runs in "code mode": one `execute` tool the model writes TypeScript +// against, discovering connections through `tools.search()` / +// `tools.describe.tool()` and calling them as `tools.<...>()` inside the +// sandbox. Some clients instead want every tool enumerated directly (lazy / +// on-demand tool loading), so the session accepts `?codemode=false` and dumps +// the whole catalog as individually-callable MCP tools. This mirrors the +// `?codemode=false` switch in Cloudflare's MCP server. +// +// The seam under test: the SAME connected identity, opened with the query +// param, advertises its tools by name instead of behind `execute`, and a +// by-name call routes straight to the tool invoker and returns the tool's real +// result. A default (code-mode) session of the same identity is the contrast: +// it still advertises only `execute`. +// +// Cross-target: runs on every host that threads the codeMode flag through to the +// MCP server (cloud's Durable Object, self-host's in-process server, Cloudflare's +// DO). The connection tools are seeded from an OpenAPI fixture whose baseUrl is +// never contacted, and the verifiable direct call uses a built-in core tool, so +// the scenario is fully hermetic. +import { randomBytes, randomUUID } from "node:crypto"; + +import { expect } from "@effect/vitest"; +import { Effect } from "effect"; +import { composePluginApi } from "@executor-js/api/server"; +import { openApiHttpPlugin } from "@executor-js/plugin-openapi/api"; +import { + AuthTemplateSlug, + ConnectionName, + IntegrationSlug, + ProviderItemId, +} from "@executor-js/sdk/shared"; + +import { scenario } from "../src/scenario"; +import { Api, Mcp, Target } from "../src/services"; + +const api = composePluginApi([openApiHttpPlugin()] as const); + +// A built-in core tool present on every target. 
In transparent mode it is +// callable directly by this wire name (a static core tool's address has no +// `tools.` prefix, so it survives `addressToPath` unchanged), and it returns +// real data (the policy listing) we can verify. +const CORE_TOOL = "executor.coreTools.policies.list"; + +// Minimal three-operation spec: three operations become three connection tools. +// The baseUrl is never contacted; we only need the tools to exist in the +// catalog so transparent mode has something to dump. +const ordersOpenApiSpec = (baseUrl: string): string => + JSON.stringify({ + openapi: "3.0.3", + info: { title: "Orders API", version: "1.0.0" }, + servers: [{ url: baseUrl }], + paths: { + "/orders/{orderId}": { + get: { + operationId: "getOrder", + summary: "Fetch a single order", + parameters: [{ name: "orderId", in: "path", required: true, schema: { type: "string" } }], + responses: { "200": { description: "An order." } }, + }, + }, + "/orders": { + get: { + operationId: "listOrders", + summary: "List orders", + responses: { "200": { description: "The orders." } }, + }, + post: { + operationId: "createOrder", + summary: "Create an order", + responses: { "201": { description: "The created order." } }, + }, + }, + }, + }); + +// The engine advertises each tool under `addressToPath(address)`: a leading +// proxy-root `tools.` is stripped, everything else is left as-is. Deriving the +// expected name from the same catalog the engine reads keeps the assertion from +// drifting if the address format changes. +const wireName = (address: string): string => + address.startsWith("tools.") ? 
address.slice("tools.".length) : address; + +const apiKeyTemplate = [ + { + slug: "apiKey", + type: "apiKey", + headers: { "x-api-key": [{ type: "variable", name: "token" }] }, + }, +] as const; + +scenario( + "MCP · ?codemode=false dumps every tool directly instead of `execute`", + { timeout: 120_000 }, + Effect.gen(function* () { + const target = yield* Target; + const { client } = yield* Api; + const mcp = yield* Mcp; + const identity = yield* target.newIdentity(); + const apiClient = yield* client(api, identity); + + // Unique slug per run keeps parallel/repeated runs out of each other's + // catalog (selfhost shares the bootstrap-admin identity). + const nonce = randomBytes(4).toString("hex"); + const slug = `codemode-orders-${nonce}`; + const specBaseUrl = "http://127.0.0.1:59999"; // never contacted + + const cleanup = Effect.gen(function* () { + yield* apiClient.connections + .remove({ + params: { + owner: "org", + integration: IntegrationSlug.make(slug), + name: ConnectionName.make("main"), + }, + }) + .pipe(Effect.ignore); + yield* apiClient.integrations + .remove({ params: { slug: IntegrationSlug.make(slug) } }) + .pipe(Effect.ignore); + }); + + yield* Effect.ensuring( + Effect.gen(function* () { + // Seed an integration + connection so there are connection tools to dump. 
+ const added = yield* apiClient.openapi.addSpec({ + payload: { + spec: { kind: "blob", value: ordersOpenApiSpec(specBaseUrl) }, + slug, + baseUrl: specBaseUrl, + authenticationTemplate: apiKeyTemplate, + }, + }); + expect(added.toolCount, "the orders fixture's operations became tools").toBe(3); + + const providers = yield* apiClient.providers.list(); + yield* apiClient.connections.create({ + payload: { + owner: "org", + name: ConnectionName.make("main"), + integration: IntegrationSlug.make(slug), + template: AuthTemplateSlug.make("apiKey"), + from: { provider: providers[0]!, id: ProviderItemId.make(randomUUID()) }, + }, + }); + + // Derive the exact wire names transparent mode must advertise from the + // catalog itself, applying the same `tools.`-strip the engine does. + const catalog = yield* apiClient.tools.list({ + query: { integration: IntegrationSlug.make(slug) }, + }); + const expectedConnectionTools = catalog.map((tool) => wireName(String(tool.address))); + expect( + expectedConnectionTools.length, + "the three connection tools are in the catalog", + ).toBe(3); + + // A policy with an unrelated pattern: it does NOT gate `policies.list`, + // so the direct call below runs ungated. Its id only has to appear in + // the listing to prove the tool actually executed and returned data. + const policy = yield* apiClient.policies.create({ + payload: { owner: "org", pattern: `codemode.gate.${nonce}`, action: "block" }, + }); + + yield* Effect.ensuring( + Effect.gen(function* () { + // 1) Transparent mode: the tool list IS the tools, not `execute`. 
+ const transparent = mcp.session(identity, { codeMode: false }); + const transparentTools = yield* transparent.listTools(); + + expect(transparentTools, "code mode's `execute` is gone").not.toContain("execute"); + expect( + transparentTools, + "the code-mode meta-tool `search` is not advertised", + ).not.toContain("search"); + expect( + transparentTools, + "the code-mode meta-tool `describe.tool` is not advertised", + ).not.toContain("describe.tool"); + + for (const name of expectedConnectionTools) { + expect(transparentTools, `connection tool ${name} is advertised directly`).toContain( + name, + ); + } + expect(transparentTools, "built-in core tools are dumped too").toContain(CORE_TOOL); + + // 2) A direct call by name runs the tool and returns its real result. + const result = yield* transparent.call(CORE_TOOL, {}); + expect(result.ok, "the direct tool call completed without error").toBe(true); + expect( + result.text, + "the listing the tool returned includes the policy we created", + ).toContain(policy.id); + + // 3) Contrast: the same identity in default (code) mode still gets + // the single `execute` tool and does NOT dump the connection tools. + // The query param is the only thing that flips behavior. 
+ const codeModeSession = mcp.session(identity); + const codeModeTools = yield* codeModeSession.listTools(); + expect(codeModeTools, "code mode still advertises `execute`").toContain("execute"); + expect(codeModeTools, "code mode does not dump the connection tools").not.toContain( + expectedConnectionTools[0]!, + ); + }), + apiClient.policies + .remove({ params: { policyId: policy.id }, payload: { owner: "org" } }) + .pipe(Effect.ignore), + ); + }), + cleanup, + ); + }), +); diff --git a/e2e/src/surfaces/mcp.ts b/e2e/src/surfaces/mcp.ts index 0e47555bb..a7607dfed 100644 --- a/e2e/src/surfaces/mcp.ts +++ b/e2e/src/surfaces/mcp.ts @@ -160,7 +160,13 @@ export interface McpSurface { readonly url: string; readonly session: ( identity: Identity, - options?: { readonly elicitationMode?: McpElicitationMode }, + options?: { + readonly elicitationMode?: McpElicitationMode; + /** Pass `false` to add `?codemode=false`, switching the session into + * transparent mode (every tool registered directly instead of behind the + * single `execute` tool). Omitted/`true` keeps the default code mode. */ + readonly codeMode?: boolean; + }, ) => McpSession; /** * Mint a real MCP bearer headlessly: protected-resource discovery → @@ -268,12 +274,18 @@ export const makeMcpSurface = (target: Target, runDir?: string): McpSurface => ( // identity's OAuth isolated. The traceparent ledger keys off the URL, not // this name, so it is unaffected. const serverName = `${target.name}-${randomUUID().slice(0, 8)}`; - // `browser` mode is selected per the ecosystem convention — an - // `?elicitation_mode=` query on the MCP endpoint — so a paused execution - // yields an approvalUrl instead of letting the model resume inline. - const sessionUrl = options?.elicitationMode - ? 
`${target.mcpUrl}?elicitation_mode=${options.elicitationMode}` - : target.mcpUrl; + // Session config rides query params on the MCP endpoint, per ecosystem + // convention: `?elicitation_mode=` (a paused execution yields an approvalUrl + // instead of letting the model resume inline) and `?codemode=false` (every + // tool registered directly instead of behind the single `execute` tool). + const sessionUrl = (() => { + const url = new URL(target.mcpUrl); + if (options?.elicitationMode) { + url.searchParams.set("elicitation_mode", options.elicitationMode); + } + if (options?.codeMode === false) url.searchParams.set("codemode", "false"); + return url.toString(); + })(); let runtimePromise: Promise | undefined; let connected = false; diff --git a/e2e/vitest.config.ts b/e2e/vitest.config.ts index 6e3d5a56b..88f7e184b 100644 --- a/e2e/vitest.config.ts +++ b/e2e/vitest.config.ts @@ -49,6 +49,7 @@ export default defineConfig({ include: [ "scenarios/browser-approval.test.ts", "scenarios/microsoft-graph-full.test.ts", + "scenarios/mcp-codemode-off.test.ts", "cloudflare/**/*.test.ts", ], fileParallelism: false, diff --git a/packages/core/execution/src/engine.ts b/packages/core/execution/src/engine.ts index 665548852..36f4a6040 100644 --- a/packages/core/execution/src/engine.ts +++ b/packages/core/execution/src/engine.ts @@ -13,6 +13,7 @@ import { CodeExecutionError } from "@executor-js/codemode-core"; import type { CodeExecutor, ExecuteResult, SandboxToolInvoker } from "@executor-js/codemode-core"; import { + addressToPath, defaultToolDiscoveryProvider, makeExecutorToolInvoker, listExecutorSources, @@ -41,6 +42,17 @@ export type PausedExecution = { readonly elicitationContext: ElicitationContext; }; +/** One directly-callable tool, as enumerated for non-code-mode MCP. The + * `name` is the sandbox-callable path (address minus the proxy-root `tools.` + * prefix, or a static fqid), which doubles as the wire tool name clients call + * back with.
`inputSchema` is self-contained JSON Schema (shared `$defs` already + * inlined by `tools.list({ includeSchemas: true })`). */ +export type ToolListing = { + readonly name: string; + readonly description?: string; + readonly inputSchema: unknown; +}; + /** Internal representation with Effect runtime state for pause/resume. */ type InternalPausedExecution = PausedExecution & { readonly response: Deferred.Deferred; @@ -210,6 +222,39 @@ const readOptionalOffset = (value: unknown, toolName: string): number | Executio return Math.floor(value); }; +/** Pull a human-readable message off an engine-level invoke failure (always an + * `ExecutionToolError` once the base invoker has mapped expected failures into + * the success channel), falling back to a string render. */ +// oxlint-disable executor/no-unknown-error-message -- boundary: SandboxToolInvoker.invoke declares an `unknown` error channel (kernel contract); the tag guard narrows it to ExecutionToolError before rendering, with a String() fallback for the impossible-defect case +const toolErrorMessage = (error: unknown): string => + Predicate.isTagged(error, "ExecutionToolError") && + "message" in error && + typeof error.message === "string" + ? error.message + : String(error); +// oxlint-enable executor/no-unknown-error-message + +/** + * Invoke a single tool through the base invoker and normalize the outcome into + * an `ExecuteResult`. The base invoker already routes expected tool failures + * (HTTP errors, auth walls, not-found, bad args) into the success channel as a + * `ToolResult` envelope; only engine-level failures (user-declined approval, + * opaque defects) reach the error channel, and those become `result.error`. + * Shared by the native and pause-mode non-code-mode paths so both render the + * same way. 
+ */ +const invokeToolAsExecuteResult = ( + invoker: SandboxToolInvoker, + name: string, + args: unknown, +): Effect.Effect => + invoker.invoke({ path: name, args }).pipe( + Effect.map((result): ExecuteResult => ({ result })), + Effect.catch((error: unknown) => + Effect.succeed({ result: undefined, error: toolErrorMessage(error) }), + ), + ); + const makeFullInvoker = ( executor: Executor, invokeOptions: InvokeOptions, @@ -392,6 +437,29 @@ export type ExecutionEngine * Get the dynamic tool description (workflow + namespaces). */ readonly getDescription: Effect.Effect; + + /** + * Enumerate every directly-callable tool with a self-contained input schema. + * Backs the non-code-mode MCP surface that exposes each tool individually + * instead of behind the single `execute` tool. + */ + readonly listTools: Effect.Effect; + + /** + * Invoke a single tool by its wire name with elicitation handled inline by + * the provided handler. The non-code-mode counterpart to {@link execute}. + */ + readonly invokeTool: ( + name: string, + args: unknown, + options: { readonly onElicitation: ElicitationHandler }, + ) => Effect.Effect; + + /** + * Invoke a single tool by its wire name, intercepting an approval gate as a + * pause point. The non-code-mode counterpart to {@link executeWithPause}. + */ + readonly invokeToolWithPause: (name: string, args: unknown) => Effect.Effect; }; export const createExecutionEngine = ( @@ -450,84 +518,109 @@ export const createExecutionEngine = Effect.Effect, + attributes: Record, + ): Effect.Effect => + Effect.gen(function* () { + // Queue preserves pauses that arrive before the previous approval has + // returned to the caller, which can happen with concurrent tool calls. + const pauseQueue = yield* Queue.unbounded>(); - // Queue preserves pauses that arrive before the previous approval has - // returned to the caller, which can happen with concurrent tool calls. 
- const pauseQueue = yield* Queue.unbounded>(); - - // Will be set once the fiber is forked. - let fiber: Fiber.Fiber; - - const elicitationHandler: ElicitationHandler = (ctx) => - Effect.gen(function* () { - const responseDeferred = yield* Deferred.make(); - // Globally unique — engine instances are rebuilt on host restarts - // (Durable Object cold restores, redeploys), so a counter would - // re-mint the same ids and let a stale client resume bind to a - // different execution's pause. - const id = `exec_${crypto.randomUUID()}`; - - const paused: InternalPausedExecution = { - id, - elicitationContext: ctx, - response: responseDeferred, - fiber: fiber!, - pauseQueue, - }; - pausedExecutions.set(id, paused); - - yield* Queue.offer(pauseQueue, paused); - - // Suspend until resume() completes responseDeferred. - return yield* Deferred.await(responseDeferred); - }); + // Will be set once the fiber is forked. + let fiber: Fiber.Fiber; - const invoker = makeFullInvoker( - executor, - { onElicitation: elicitationHandler }, - toolDiscoveryProvider, - ); - fiber = yield* Effect.forkDetach( - codeExecutor.execute(code, invoker).pipe(Effect.withSpan("executor.code.exec")), + const elicitationHandler: ElicitationHandler = (ctx) => + Effect.gen(function* () { + const responseDeferred = yield* Deferred.make(); + // Globally unique — engine instances are rebuilt on host restarts + // (Durable Object cold restores, redeploys), so a counter would + // re-mint the same ids and let a stale client resume bind to a + // different execution's pause. + const id = `exec_${crypto.randomUUID()}`; + + const paused: InternalPausedExecution = { + id, + elicitationContext: ctx, + response: responseDeferred, + fiber: fiber!, + pauseQueue, + }; + pausedExecutions.set(id, paused); + + yield* Queue.offer(pauseQueue, paused); + + // Suspend until resume() completes responseDeferred. 
+ return yield* Deferred.await(responseDeferred); + }); + + fiber = yield* Effect.forkDetach(run(elicitationHandler)); + + // When the fiber settles on its own (sandbox timeout, failure) while + // pauses are still outstanding, drop them: getPausedExecution must not + // report a pause whose fiber can no longer consume a response, and the + // map must not grow forever. A resume retry still finds the terminal + // outcome via the settled-outcome cache. + const sandboxFiber = fiber; + yield* Effect.forkDetach( + Fiber.await(sandboxFiber).pipe( + Effect.flatMap((exit) => + Effect.sync(() => { + const outcome = Exit.map( + exit, + (result): ExecutionResult => ({ status: "completed", result }), + ); + for (const [id, paused] of pausedExecutions) { + if (paused.fiber !== sandboxFiber) continue; + pausedExecutions.delete(id); + recordSettledOutcome(id, outcome); + } + }), + ), + ), + ); + + return (yield* awaitCompletionOrPause(fiber, pauseQueue)) as ExecutionResult; + }).pipe(Effect.withSpan("mcp.execute", { attributes })); + + /** Code-mode pause/resume: run the dynamic worker over the full invoker. */ + const startPausableExecution = (code: string): Effect.Effect => + runWithPause( + (onElicitation) => + codeExecutor + .execute(code, makeFullInvoker(executor, { onElicitation }, toolDiscoveryProvider)) + .pipe(Effect.withSpan("executor.code.exec")), + { "mcp.execute.mode": "pausable", "mcp.execute.code_length": code.length }, ); - // When the fiber settles on its own (sandbox timeout, failure) while - // pauses are still outstanding, drop them: getPausedExecution must not - // report a pause whose fiber can no longer consume a response, and the - // map must not grow forever. A resume retry still finds the terminal - // outcome via the settled-outcome cache. 
- const sandboxFiber = fiber; - yield* Effect.forkDetach( - Fiber.await(sandboxFiber).pipe( - Effect.flatMap((exit) => - Effect.sync(() => { - const outcome = Exit.map( - exit, - (result): ExecutionResult => ({ status: "completed", result }), - ); - for (const [id, paused] of pausedExecutions) { - if (paused.fiber !== sandboxFiber) continue; - pausedExecutions.delete(id); - recordSettledOutcome(id, outcome); - } - }), + /** + * Non-code-mode pause/resume: invoke a single tool by its wire name. The + * tool's `ToolResult` envelope (or an engine-level error) is carried back as + * an `ExecuteResult` so the host renders it the same way it renders a + * code-mode result. Approval-gated tools pause through the same machinery. + */ + const invokeToolWithPause = (name: string, args: unknown): Effect.Effect => + runWithPause( + (onElicitation) => + invokeToolAsExecuteResult( + makeExecutorToolInvoker(executor, { invokeOptions: { onElicitation } }), + name, + args, ), - ), + { "mcp.execute.mode": "pausable_tool", "mcp.tool.name": name }, ); - return (yield* awaitCompletionOrPause(fiber, pauseQueue)) as ExecutionResult; - }); - /** * Resume a paused execution. Completes the response Deferred to unblock the * fiber, then races completion against the next queued or future pause. @@ -602,6 +695,52 @@ export const createExecutionEngine = = executor.tools + .list({ includeSchemas: true }) + .pipe( + Effect.map((tools) => + tools.map( + (tool): ToolListing => ({ + name: addressToPath(String(tool.address)), + description: tool.description, + inputSchema: tool.inputSchema ?? 
{ type: "object" }, + }), + ), + ), + // oxlint-disable-next-line executor/no-effect-escape-hatch -- boundary: ExecutionEngine.listTools exposes no error channel; a catalog read the listing surface can't recover from dies rather than forcing every caller to thread a typed error + Effect.orDie, + Effect.withSpan("mcp.list_tools"), + ); + return { execute: runInlineExecution, executeWithPause: startPausableExecution, @@ -609,5 +748,8 @@ export const createExecutionEngine = Effect.sync(() => pausedExecutions.get(executionId) ?? null), getDescription: buildExecuteDescription(executor), + listTools, + invokeTool: invokeToolInline, + invokeToolWithPause, }; }; diff --git a/packages/core/execution/src/index.ts b/packages/core/execution/src/index.ts index 18955fc67..8388fcd93 100644 --- a/packages/core/execution/src/index.ts +++ b/packages/core/execution/src/index.ts @@ -7,6 +7,7 @@ export { type ExecutionResult, type PausedExecution, type ResumeResponse, + type ToolListing, } from "./engine"; export { buildExecuteDescription } from "./description"; diff --git a/packages/core/execution/src/tool-invoker.ts b/packages/core/execution/src/tool-invoker.ts index 8ce6d31bc..5c324d77a 100644 --- a/packages/core/execution/src/tool-invoker.ts +++ b/packages/core/execution/src/tool-invoker.ts @@ -66,7 +66,7 @@ const pathToAddress = (path: string): ToolAddress => { /** Strip the proxy-root `tools.` prefix from a full address so it becomes the * sandbox-callable path the model writes after `tools.`. */ -const addressToPath = (address: string): string => +export const addressToPath = (address: string): string => address.startsWith(ADDRESS_PREFIX) ? 
address.slice(ADDRESS_PREFIX.length) : address; type DescribedTool = { diff --git a/packages/core/sdk/src/executor.ts b/packages/core/sdk/src/executor.ts index 38e12a3c6..7a587aea0 100644 --- a/packages/core/sdk/src/executor.ts +++ b/packages/core/sdk/src/executor.ts @@ -132,7 +132,7 @@ import { import { ToolSchemaView, type IntegrationDetectionResult } from "./types"; import { type Tool, type ToolAnnotations, type ToolDef, type ToolListFilter } from "./tool"; import { buildToolTypeScriptPreview } from "./schema-types"; -import { collectReferencedDefinitions } from "./schema-refs"; +import { collectReferencedDefinitions, reattachDefs } from "./schema-refs"; import { refreshAccessToken, exchangeClientCredentials, @@ -2473,22 +2473,56 @@ export const createExecutor = => Effect.gen(function* () { yield* syncStaleConnectionTools; - // Projected: the list surface is metadata (address, description, - // annotations) — loading every tool's input/output schema JSON made - // an unbounded list scale with schema bytes, not tool count. - const rows = yield* core.findMany("tool", { - where: (b: AnyCb) => - b.and( - filter?.integration === undefined - ? true - : b("integration", "=", String(filter.integration)), - filter?.owner === undefined ? true : b("owner", "=", filter.owner), - filter?.connection === undefined - ? true - : b("connection", "=", String(filter.connection)), - ), - select: TOOL_INVOCATION_COLUMNS, - }); + const includeSchemas = filter?.includeSchemas ?? false; + // Projected by default: the list surface is metadata (address, + // description, annotations) — loading every tool's input/output + // schema JSON made an unbounded list scale with schema bytes, not + // tool count. Callers that enumerate tools as directly-callable + // definitions (non-code-mode MCP) opt in with `includeSchemas`, which + // loads the full rows and inlines each tool's referenced shared + // `$defs` so the returned schemas are self-contained. 
+ const where = (b: AnyCb) => + b.and( + filter?.integration === undefined + ? true + : b("integration", "=", String(filter.integration)), + filter?.owner === undefined ? true : b("owner", "=", filter.owner), + filter?.connection === undefined + ? true + : b("connection", "=", String(filter.connection)), + ); + const rows = includeSchemas + ? yield* core.findMany("tool", { where }) + : yield* core.findMany("tool", { where, select: TOOL_INVOCATION_COLUMNS }); + + // Shared `$defs` grouped by connection — definition names are unique + // per connection, not globally, so each tool's `$ref`s must resolve + // against its own connection's defs. One bulk query keeps this from + // becoming an N+1 over `tools.schema`. + const defsByConnection = new Map>(); + if (includeSchemas) { + const definitionRows = yield* core.findMany("definition", { where }); + for (const def of definitionRows) { + const key = `${def.owner}|${def.integration}|${def.connection}`; + let defs = defsByConnection.get(key); + if (!defs) { + defs = new Map(); + defsByConnection.set(key, defs); + } + defs.set(def.name, decodeJsonColumn(def.schema)); + } + } + const selfContain = (tool: Tool): Tool => { + if (!includeSchemas) return tool; + const defs = defsByConnection.get(`${tool.owner}|${tool.integration}|${tool.connection}`); + if (!defs || defs.size === 0) return tool; + return { + ...tool, + inputSchema: reattachDefs(tool.inputSchema, defs), + outputSchema: reattachDefs(tool.outputSchema, defs), + }; + }; + const includeBlocked = filter?.includeBlocked ?? false; const policyRows = yield* core.findMany("tool_policy", {}); const tools: Tool[] = []; @@ -2504,8 +2538,10 @@ export const createExecutor = { }; // The elicitation-mode query contract (`?elicitation_mode=` plus the legacy -// `?allow_model_resume` alias) is shared with every host that serves the -// browser-approval flow. Re-exported here so the worker dispatcher's existing -// import site (`./do-headers`) is unchanged. 
+// `?allow_model_resume` alias) and the code-mode query contract (`?codemode=`) +// are shared with every host that serves the browser-approval flow. Re-exported +// here so the worker dispatcher's existing import site (`./do-headers`) is +// unchanged. export { + readCodeMode, readElicitationMode, type McpElicitationMode, } from "@executor-js/host-mcp/browser-approval"; diff --git a/packages/hosts/cloudflare/src/mcp/seams.ts b/packages/hosts/cloudflare/src/mcp/seams.ts index 330f78200..fb1b25a03 100644 --- a/packages/hosts/cloudflare/src/mcp/seams.ts +++ b/packages/hosts/cloudflare/src/mcp/seams.ts @@ -11,6 +11,9 @@ export interface McpSessionInit { readonly organizationId: string; readonly userId: string; readonly elicitationMode: McpElicitationMode; + /** Whether the session runs in code mode (single `execute` tool). Defaults to + * `true`; `false` selects transparent mode (every tool registered directly). */ + readonly codeMode?: boolean; /** Public origin of the create request (`https://host`), so the DO derives a * web base URL zero-config when the host configures no static one. */ readonly webOrigin?: string; diff --git a/packages/hosts/cloudflare/src/mcp/session-durable-object.ts b/packages/hosts/cloudflare/src/mcp/session-durable-object.ts index e23a99c7b..64904368a 100644 --- a/packages/hosts/cloudflare/src/mcp/session-durable-object.ts +++ b/packages/hosts/cloudflare/src/mcp/session-durable-object.ts @@ -127,6 +127,9 @@ export interface SessionMeta { readonly organizationSlug?: string; readonly userId: string; readonly elicitationMode?: "browser" | "model" | "native"; + /** Whether the session runs in code mode (single `execute` tool). Defaults to + * `true`; `false` selects transparent mode (every tool registered directly). */ + readonly codeMode?: boolean; /** Public origin captured at session create — used to derive the runtime's * web base URL when the host configures no static one. 
*/ readonly webOrigin?: string; diff --git a/packages/hosts/cloudflare/src/mcp/session-store.ts b/packages/hosts/cloudflare/src/mcp/session-store.ts index ff9fdf444..18970d106 100644 --- a/packages/hosts/cloudflare/src/mcp/session-store.ts +++ b/packages/hosts/cloudflare/src/mcp/session-store.ts @@ -29,6 +29,7 @@ import { import { currentPropagationHeaders, + readCodeMode, readElicitationMode, withMcpResponseHeaders, withPropagationHeaders, @@ -178,6 +179,7 @@ const createSession = ( organizationId: token.organizationId, userId: token.accountId, elicitationMode: readElicitationMode(request), + codeMode: readCodeMode(request), // The public origin the client reached us at — lets the DO derive a web // base URL with no static config (we read the real URL, not a spoofable // forwarded host). diff --git a/packages/hosts/mcp/src/browser-approval.ts b/packages/hosts/mcp/src/browser-approval.ts index 0bea2ea3b..2bba1c31e 100644 --- a/packages/hosts/mcp/src/browser-approval.ts +++ b/packages/hosts/mcp/src/browser-approval.ts @@ -51,6 +51,18 @@ export const readElicitationMode = (request: Request): McpElicitationMode => { return "model"; }; +/** + * Read whether the session runs in code mode off an MCP request's `?codemode=` + * query. Code mode (the default) exposes a single `execute` tool the agent + * drives with TypeScript; `?codemode=false` selects transparent mode, where + * every available tool is registered as a directly-callable MCP tool. Only the + * literal value `false` turns code mode off; anything else keeps the default. + */ +export const readCodeMode = (request: Request): boolean => { + const url = new URL(request.url); + return url.searchParams.get("codemode") !== "false"; +}; + /** * Build the console approval URL for a paused execution: * `/resume/?mcp_session_id=`. 
The diff --git a/packages/hosts/mcp/src/in-memory-session-store.ts b/packages/hosts/mcp/src/in-memory-session-store.ts index 5c67ecbe5..5749e1ea3 100644 --- a/packages/hosts/mcp/src/in-memory-session-store.ts +++ b/packages/hosts/mcp/src/in-memory-session-store.ts @@ -8,6 +8,7 @@ import { buildResumeApprovalUrl, decodeResumeResponse, formatResumeAcknowledgement, + readCodeMode, readElicitationMode, } from "./browser-approval"; import { @@ -66,6 +67,12 @@ export interface McpBuildServerOptions { | { readonly mode: "model" } | { readonly mode: "native" }; readonly browserApprovalStore?: BrowserApprovalStore; + /** + * Whether the session runs in code mode (the single `execute` tool the agent + * drives with TypeScript). Defaults to `true`; `?codemode=false` selects + * transparent mode, where every available tool is registered directly. + */ + readonly codeMode?: boolean; } /** Build the per-session `McpServer` + engine for a principal (the host's engine + tools). */ @@ -200,7 +207,10 @@ export const makeInMemoryMcpSessionStore = ( request: Request, sessionId: () => string | null, ): McpBuildServerOptions => { - if (readElicitationMode(request) !== "browser") return { elicitationMode: { mode: "model" } }; + const codeMode = readCodeMode(request); + if (readElicitationMode(request) !== "browser") { + return { elicitationMode: { mode: "model" }, codeMode }; + } return { elicitationMode: { mode: "browser", @@ -214,6 +224,7 @@ export const makeInMemoryMcpSessionStore = ( }), }, browserApprovalStore: approvals.store, + codeMode, }; }; diff --git a/packages/hosts/mcp/src/tool-server.test.ts b/packages/hosts/mcp/src/tool-server.test.ts index d42b5f741..717eac2ae 100644 --- a/packages/hosts/mcp/src/tool-server.test.ts +++ b/packages/hosts/mcp/src/tool-server.test.ts @@ -30,6 +30,9 @@ const makeStubEngine = (overrides: { execute?: ExecutionEngine["execute"]; executeWithPause?: ExecutionEngine["executeWithPause"]; resume?: ExecutionEngine["resume"]; + listTools?: 
ExecutionEngine["listTools"]; + invokeTool?: ExecutionEngine["invokeTool"]; + invokeToolWithPause?: ExecutionEngine["invokeToolWithPause"]; description?: string; }): ExecutionEngine => ({ execute: overrides.execute ?? (() => Effect.succeed({ result: "default" })), @@ -39,6 +42,11 @@ const makeStubEngine = (overrides: { resume: overrides.resume ?? (() => Effect.succeed(null)), getPausedExecution: () => Effect.succeed(null), getDescription: Effect.succeed(overrides.description ?? "test executor"), + listTools: overrides.listTools ?? Effect.succeed([]), + invokeTool: overrides.invokeTool ?? (() => Effect.succeed({ result: "default" })), + invokeToolWithPause: + overrides.invokeToolWithPause ?? + (() => Effect.succeed({ status: "completed", result: { result: "default" } })), }); /** Connect a real MCP Client to our executor MCP server over in-memory transports. */ diff --git a/packages/hosts/mcp/src/tool-server.ts b/packages/hosts/mcp/src/tool-server.ts index ff7989b55..e7b07e3fb 100644 --- a/packages/hosts/mcp/src/tool-server.ts +++ b/packages/hosts/mcp/src/tool-server.ts @@ -1,7 +1,12 @@ import { Effect, Match, Option, Schema } from "effect"; import * as Cause from "effect/Cause"; import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js"; -import { ContentBlockSchema, type ContentBlock } from "@modelcontextprotocol/sdk/types.js"; +import { + CallToolRequestSchema, + ContentBlockSchema, + ListToolsRequestSchema, + type ContentBlock, +} from "@modelcontextprotocol/sdk/types.js"; import type { jsonSchemaValidator, JsonSchemaType, @@ -10,12 +15,13 @@ import type { import { Validator } from "@cfworker/json-schema"; import * as z from "zod/v4"; -import { isToolFile } from "@executor-js/sdk"; +import { isToolFile, isToolResult } from "@executor-js/sdk"; import type { ElicitationResponse, ElicitationHandler, ElicitationContext, ElicitationRequest, + ToolError, ToolFileValue, } from "@executor-js/sdk"; import type * as Tracer from "effect/Tracer"; @@ -91,6 +97,15 @@ 
type SharedMcpServerConfig = { readonly mode: "native"; }; readonly browserApprovalStore?: BrowserApprovalStore; + /** + * When `false`, run in transparent (non-code) mode: every available tool is + * enumerated as a directly-callable MCP tool instead of being reached through + * the single `execute` code tool. Defaults to `true` (code mode). Selected by + * a `?codemode=false` query param on the MCP endpoint, mirroring the same + * switch Cloudflare's MCP exposes — for clients that lazily load tools and + * need them listed individually. + */ + readonly codeMode?: boolean; }; export type ExecutorMcpServerConfig = @@ -419,6 +434,78 @@ const toMcpPausedResult = (formatted: ReturnType): structuredContent: formatted.structured, }); +// --------------------------------------------------------------------------- +// Non-code-mode result formatting +// +// In non-code mode each tool is called directly, so the execution result's +// `result` field is the tool's own `ToolResult` envelope rather than a +// script's return value. Unwrap it: render `data` on success (a `ToolFile` +// becomes native MCP content), surface the `ToolError` on an expected +// failure, and drop transport `http` metadata (a non-code client wants the +// payload, not pagination headers). +// --------------------------------------------------------------------------- + +const renderToolValueText = (value: unknown): string => + typeof value === "string" ? value : JSON.stringify(value ?? null, null, 2); + +// Transparent mode advertises every tool's input schema verbatim over the wire, +// and the MCP client validates each `inputSchema` root against `type: "object"`. +// A tool whose top-level input is a union (e.g. `executor.mcp.addServer`) +// compiles to `{ anyOf: [...] }` with no root `type`, which makes the client +// reject the ENTIRE `tools/list` response. 
Executor always invokes tools with a +// named-args object, so it is safe to stamp `type: "object"` onto any root that +// lacks one while preserving the rest of the schema (the union variants stay in +// `anyOf`; MCP's `.passthrough()` keeps the extra keys, and the tool invoker +// still enforces the real schema). +const toMcpInputSchema = (schema: unknown): Record => { + if (schema !== null && typeof schema === "object" && !Array.isArray(schema)) { + const record = schema as Record; + if (record.type === "object") return record; + if (record.type === undefined) return { ...record, type: "object" }; + } + return { type: "object" }; +}; + +const toolErrorText = (error: ToolError): string => { + const status = error.status != null ? ` (status ${error.status})` : ""; + // oxlint-disable-next-line executor/no-unknown-error-message -- boundary: ToolError is a typed struct whose `message` is a schema field, not an unknown error + return `Error [${error.code}]${status}: ${error.message}`; +}; + +const renderToolData = (data: unknown): McpToolResult => { + if (isToolFile(data)) return { content: toolFileContent(data) }; + return { + content: [{ type: "text", text: renderToolValueText(data) }], + ...(isRecord(data) ? { structuredContent: data } : {}), + }; +}; + +const toNonCodeMcpResult = (result: FormattedExecuteInput): McpToolResult => { + // Engine-level failure (declined approval, opaque defect surfaced as a + // string) — not a tool-domain failure, but still an error for the client. 
+ if (result.error) { + return { + content: [{ type: "text", text: `Error: ${result.error}` }], + structuredContent: { status: "error", error: result.error }, + isError: true, + }; + } + const value = result.result; + if (isToolResult(value)) { + if (!value.ok) { + return { + content: [{ type: "text", text: toolErrorText(value.error) }], + structuredContent: { status: "error", error: value.error }, + isError: true, + }; + } + return renderToolData(value.data); + } + // Defensive: a direct invoke always yields a `ToolResult`, but render any + // bare value rather than dropping it. + return renderToolData(value); +}; + // `execute` failures reaching the MCP host are infra defects — domain // failures from tools are now expressed as `ToolResult` values (success // channel) and flow through `formatExecuteResult`. Emit an opaque @@ -512,6 +599,15 @@ const parseJsonContent = (raw: string): Record | undefined => { return Option.isSome(parsed) ? parsed.value : undefined; }; +// The non-code-mode dispatch reads `resume` arguments off the raw CallTool +// payload (no Zod layer in the low-level handler), so coerce defensively: an +// unknown executionId becomes "" (resolved to a not-found result) and an +// unknown action falls back to "cancel". +const readResumeAction = (value: unknown): "accept" | "decline" | "cancel" => + value === "accept" || value === "decline" || value === "cancel" ? value : "cancel"; + +const readArgString = (value: unknown): string => (typeof value === "string" ? value : ""); + // --------------------------------------------------------------------------- // Server factory // --------------------------------------------------------------------------- @@ -544,6 +640,7 @@ export const createExecutorMcpServer = ( ({ mode: "model", } as const); + const codeMode = config.codeMode ?? 
true; const resolveParentSpan = (): Tracer.AnySpan | undefined => { const ps = config.parentSpan; @@ -709,77 +806,211 @@ export const createExecutorMcpServer = ( }), ); + // Non-code mode: invoke one named tool directly. Reuses the same + // elicitation/pause machinery as `executeCode`, so an approval-gated tool + // pauses and resumes identically whether the model reached it through + // `execute` or called it by name. + const invokeSingleTool = (name: string, args: unknown): Effect.Effect => + Effect.gen(function* () { + debugLog("invoke_tool.call", { + name, + elicitationMode: elicitationMode.mode, + clientCapabilities: server.server.getClientCapabilities() ?? null, + }); + if (elicitationMode.mode === "native") { + const result = yield* engine.invokeTool(name, args, { + onElicitation: makeMcpElicitationHandler(server, debugLog), + }); + return toNonCodeMcpResult(result); + } + const outcome = yield* engine.invokeToolWithPause(name, args); + debugLog("invoke_tool.paused_flow_result", { + name, + status: outcome.status, + executionId: outcome.status === "paused" ? outcome.execution.id : undefined, + interactionKind: + outcome.status === "paused" + ? pausedInteractionKind(outcome.execution.elicitationContext.request) + : undefined, + }); + return outcome.status === "completed" + ? toNonCodeMcpResult(outcome.result) + : elicitationMode.mode === "browser" + ? yield* requireUserResumeApproval(outcome.execution.id) + : toMcpPausedResult(formatPausedExecution(outcome.execution)); + }).pipe( + Effect.withSpan("mcp.host.tool.invoke", { + attributes: { "mcp.tool.name": name }, + }), + ); + // --- tools --- + // Code mode registers the single `execute` tool (plus mode-specific + // `resume`) via the high-level wrapper. Transparent mode skips that and + // serves every tool through the low-level request handlers instead — the + // two registration styles are mutually exclusive on one server. 
+ + if (codeMode) { + yield* Effect.sync(() => + server.registerTool( + "execute", + { + description, + inputSchema: { code: z.string().trim().min(1) }, + }, + ({ code }) => runToolEffect(executeCode(code)), + ), + ).pipe( + Effect.withSpan("mcp.host.register_tool", { + attributes: { "mcp.tool.name": "execute" }, + }), + ); - yield* Effect.sync(() => - server.registerTool( - "execute", - { - description, - inputSchema: { code: z.string().trim().min(1) }, - }, - ({ code }) => runToolEffect(executeCode(code)), - ), - ).pipe( - Effect.withSpan("mcp.host.register_tool", { - attributes: { "mcp.tool.name": "execute" }, - }), - ); + yield* Effect.sync(() => { + if (elicitationMode.mode === "native") { + return undefined; + } - yield* Effect.sync(() => { - if (elicitationMode.mode === "native") { - return undefined; - } + if (elicitationMode.mode === "model") { + return server.registerTool( + "resume", + { + description: [ + "Resume a paused execution using the executionId returned by execute.", + "This connection explicitly allows model-side resume via elicitation_mode=model.", + ].join("\n"), + inputSchema: { + executionId: z.string().describe("The execution ID from the paused result"), + action: z + .enum(["accept", "decline", "cancel"]) + .describe("How to respond to the interaction"), + content: z + .string() + .describe("Optional JSON-encoded response content for form elicitations") + .default("{}"), + }, + }, + ({ executionId, action, content: rawContent }) => + runToolEffect(resumeExecution(executionId, action, parseJsonContent(rawContent))), + ); + } - if (elicitationMode.mode === "model") { return server.registerTool( "resume", { description: [ - "Resume a paused execution using the executionId returned by execute.", - "This connection explicitly allows model-side resume via elicitation_mode=model.", + "Request user approval to resume a paused execution.", + "Call this with the executionId returned by execute. 
If the user has not approved in the browser yet, tell them to open the returned approval URL. If they have approved, this returns the resumed execution result.", + "This connection does not allow the model to choose accept, decline, cancel, or content.", ].join("\n"), inputSchema: { executionId: z.string().describe("The execution ID from the paused result"), - action: z - .enum(["accept", "decline", "cancel"]) - .describe("How to respond to the interaction"), - content: z - .string() - .describe("Optional JSON-encoded response content for form elicitations") - .default("{}"), }, }, - ({ executionId, action, content: rawContent }) => - runToolEffect(resumeExecution(executionId, action, parseJsonContent(rawContent))), + ({ executionId }) => runToolEffect(resumeAfterBrowserApproval(executionId)), ); - } - - return server.registerTool( - "resume", - { - description: [ - "Request user approval to resume a paused execution.", - "Call this with the executionId returned by execute. If the user has not approved in the browser yet, tell them to open the returned approval URL. If they have approved, this returns the resumed execution result.", - "This connection does not allow the model to choose accept, decline, cancel, or content.", - ].join("\n"), - inputSchema: { - executionId: z.string().describe("The execution ID from the paused result"), - }, - }, - ({ executionId }) => runToolEffect(resumeAfterBrowserApproval(executionId)), + }).pipe( + Effect.withSpan("mcp.host.register_tool", { + attributes: { "mcp.tool.name": "resume" }, + }), ); - }).pipe( - Effect.withSpan("mcp.host.register_tool", { - attributes: { "mcp.tool.name": "resume" }, - }), - ); + } else { + const toolListings = yield* engine.listTools; + const resumeWireTool = + elicitationMode.mode === "native" + ? undefined + : elicitationMode.mode === "model" + ? 
{ + name: "resume", + description: [ + "Resume a paused tool call using the executionId returned by a paused result.", + "This connection explicitly allows model-side resume via elicitation_mode=model.", + ].join("\n"), + inputSchema: { + type: "object" as const, + properties: { + executionId: { + type: "string", + description: "The execution ID from the paused result", + }, + action: { + type: "string", + enum: ["accept", "decline", "cancel"], + description: "How to respond to the interaction", + }, + content: { + type: "string", + description: "Optional JSON-encoded response content for form elicitations", + default: "{}", + }, + }, + required: ["executionId", "action"], + }, + } + : { + name: "resume", + description: [ + "Request user approval to resume a paused tool call.", + "Call this with the executionId returned by a paused result. If the user has not approved in the browser yet, tell them to open the returned approval URL. If they have approved, this returns the resumed result.", + "This connection does not allow the model to choose accept, decline, cancel, or content.", + ].join("\n"), + inputSchema: { + type: "object" as const, + properties: { + executionId: { + type: "string", + description: "The execution ID from the paused result", + }, + }, + required: ["executionId"], + }, + }; + + const wireTools = [ + ...toolListings.map((tool) => ({ + name: tool.name, + description: tool.description, + inputSchema: toMcpInputSchema(tool.inputSchema), + })), + ...(resumeWireTool ? [resumeWireTool] : []), + ]; + + yield* Effect.sync(() => { + // `registerTool` normally declares this; transparent mode bypasses it. + server.server.registerCapabilities({ tools: { listChanged: false } }); + + server.server.setRequestHandler(ListToolsRequestSchema, () => ({ tools: wireTools })); + + server.server.setRequestHandler(CallToolRequestSchema, (request) => { + const { name } = request.params; + const args = request.params.arguments ?? 
{}; + if (name === "resume" && elicitationMode.mode !== "native") { + if (elicitationMode.mode === "browser") { + return runToolEffect(resumeAfterBrowserApproval(readArgString(args.executionId))); + } + return runToolEffect( + resumeExecution( + readArgString(args.executionId), + readResumeAction(args.action), + parseJsonContent(typeof args.content === "string" ? args.content : "{}"), + ), + ); + } + return runToolEffect(invokeSingleTool(name, args)); + }); + }).pipe( + Effect.withSpan("mcp.host.register_transparent_tools", { + attributes: { "mcp.tool.count": wireTools.length }, + }), + ); + } yield* Effect.sync(() => { console.error( "[executor] MCP session mode", JSON.stringify({ ...capabilitySnapshot(server), + codeMode, elicitationMode: elicitationMode.mode, resumeEnabled: elicitationMode.mode !== "native", }), @@ -787,6 +1018,7 @@ export const createExecutorMcpServer = ( debugLog("tool.visibility", { clientCapabilities: server.server.getClientCapabilities() ?? null, elicitationSupport: getElicitationSupport(server), + codeMode, elicitationMode: elicitationMode.mode, resumeEnabled: elicitationMode.mode !== "native", }); From fc49a4074a053a74c7ffc6cd944e007109b00635 Mon Sep 17 00:00:00 2001 From: Rhys Sullivan Date: Thu, 25 Jun 2026 00:51:26 -0700 Subject: [PATCH 2/4] Keep transparent-mode tool result shape stable across resume A direct tool call in transparent mode unwraps the tool's `ToolResult` envelope (renders `data` natively, sets `isError` on failures). The `resume` path is shared with code mode and formatted every resumed completion with the code-mode `execute` envelope, so a transparent-mode tool that paused for approval and then resumed came back wrapped in `{ status, result, logs }` instead of the tool's own result, unlike the same tool when it did not pause. 
Pick the resume completion formatter by session mode: a paused execution can only have originated from the tool this session registered (`execute` in code mode, a direct single-tool invoke in transparent mode), so format the resumed completion the same way that origin tool formats a non-paused completion. Covered by a second case in the codemode-off scenario that drives the approval-gated `policies.create` through pause, approve, and resume in a transparent session and asserts the resumed structured content is the policy itself, not the execute envelope. Green on self-host and workerd; the assertion fails against the pre-fix formatter. --- e2e/scenarios/mcp-codemode-off.test.ts | 81 ++++++++++++++++++++++++++ packages/hosts/mcp/src/tool-server.ts | 14 ++++- 2 files changed, 93 insertions(+), 2 deletions(-) diff --git a/e2e/scenarios/mcp-codemode-off.test.ts b/e2e/scenarios/mcp-codemode-off.test.ts index 2707516c7..f1abb5fab 100644 --- a/e2e/scenarios/mcp-codemode-off.test.ts +++ b/e2e/scenarios/mcp-codemode-off.test.ts @@ -89,6 +89,11 @@ const apiKeyTemplate = [ }, ] as const; +// The approval-gated core tool used by the pause+resume scenario below. It +// gates on its own `requiresApproval` annotation (no policy needed), so a direct +// transparent-mode call pauses, and resuming it exercises the resume formatter. +const POLICY_CREATE_TOOL = "executor.coreTools.policies.create"; + scenario( "MCP · ?codemode=false dumps every tool directly instead of `execute`", { timeout: 120_000 }, @@ -212,3 +217,79 @@ scenario( ); }), ); + +// Result-shape parity across the pause boundary. A transparent-mode tool that +// pauses for approval and then resumes must return the SAME shape it would have +// returned without pausing: the tool's own result, unwrapped from the +// `ToolResult` envelope. 
The `resume` machinery is shared with code mode, where a +// completion is an `execute` envelope (`{ status, result, logs }`); a regression +// here formatted the resumed direct-tool result that same way, so a transparent +// client got the code-mode envelope instead of the policy fields. This drives the +// approval-gated `policies.create` through pause -> approve -> resume and asserts +// the resumed structured content is the policy itself. +scenario( + "MCP · ?codemode=false keeps the unwrapped tool result across an approval pause+resume", + { timeout: 120_000 }, + Effect.gen(function* () { + const target = yield* Target; + const { client } = yield* Api; + const mcp = yield* Mcp; + const identity = yield* target.newIdentity(); + const apiClient = yield* client(api, identity); + + // Unique, non-matching pattern: the rule the gated tool creates is inert and + // cannot gate any other scenario's tools. Removed in the finalizer. + const nonce = randomBytes(4).toString("hex"); + const pattern = `codemode-resume-${nonce}.gate`; + + const cleanup = apiClient.policies.list().pipe( + Effect.flatMap((list) => + Effect.forEach( + list.filter((p) => p.pattern === pattern), + (p) => + apiClient.policies + .remove({ params: { policyId: p.id }, payload: { owner: "org" } }) + .pipe(Effect.ignore), + ), + ), + Effect.ignore, + ); + + yield* Effect.ensuring( + Effect.gen(function* () { + const transparent = mcp.session(identity, { codeMode: false }); + yield* transparent.listTools(); + + // Direct by-name call to the approval-gated tool. No policy is in play, so + // the only thing that can pause it is its own `requiresApproval` + // annotation. The paused result carries the executionId to resume. 
+ const paused = yield* transparent.call(POLICY_CREATE_TOOL, { + owner: "org", + pattern, + action: "block", + }); + expect(paused.text, "the gated tool paused for approval").toContain("Execution paused"); + expect(paused.text, "the paused result carries an executionId").toContain("executionId:"); + + // Approve and resume. + const resumed = yield* transparent.approvePaused(paused.text); + expect(resumed.ok, "the resumed call completed without error").toBe(true); + + const structured = (resumed.raw as { structuredContent?: Record }) + .structuredContent; + // Fixed shape: the tool's own result, so the policy fields sit at the top + // level. Buggy shape: the code-mode `execute` envelope, where the policy + // would be nested under `result` and `pattern` absent at the top level. + expect( + structured?.pattern, + "the resumed result is the unwrapped tool result (policy fields at the top level)", + ).toBe(pattern); + expect( + structured?.result, + "the code-mode execute envelope (status/result/logs) is not used in transparent mode", + ).toBeUndefined(); + }), + cleanup, + ); + }), +); diff --git a/packages/hosts/mcp/src/tool-server.ts b/packages/hosts/mcp/src/tool-server.ts index e7b07e3fb..9e320e604 100644 --- a/packages/hosts/mcp/src/tool-server.ts +++ b/packages/hosts/mcp/src/tool-server.ts @@ -705,6 +705,16 @@ export const createExecutorMcpServer = ( }), ); + // `resume` is shared by both modes, but a paused execution can only have + // originated from the tool that this session registered: `execute` in code + // mode, a direct single-tool invoke in transparent mode. Format the resumed + // completion the same way that origin tool formats a non-paused completion, + // so a tool returns an identically-shaped result whether or not it paused. + // In transparent mode that means unwrapping the `ToolResult` envelope (so + // `data` renders natively and a failed `ToolResult` carries `isError`) + // rather than emitting the code-mode `execute` envelope. 
+ const formatResumeCompletion = codeMode ? toMcpResult : toNonCodeMcpResult; + const resumeExecution = ( executionId: string, action: "accept" | "decline" | "cancel", @@ -732,7 +742,7 @@ export const createExecutorMcpServer = ( : undefined, }); return outcome.status === "completed" - ? toMcpResult(outcome.result) + ? formatResumeCompletion(outcome.result) : toMcpPausedResult(formatPausedExecution(outcome.execution)); }).pipe( Effect.withSpan("mcp.host.tool.resume", { @@ -795,7 +805,7 @@ export const createExecutorMcpServer = ( return missingExecutionResult(executionId); } return outcome.status === "completed" - ? toMcpResult(outcome.result) + ? formatResumeCompletion(outcome.result) : yield* requireUserResumeApproval(outcome.execution.id); }).pipe( Effect.withSpan("mcp.host.tool.resume.browser_approval", { From b29178783fb7dc2c0ddf83d161b68c9e3751fe80 Mon Sep 17 00:00:00 2001 From: Rhys Sullivan Date: Thu, 25 Jun 2026 10:07:17 -0700 Subject: [PATCH 3/4] Replace non-code tool dump with search + invoke `?codemode=false` previously dumped every tool into one `tools/list`. That does not scale: the full Microsoft Graph connection is ~16.5k tools and ~640 MB of self-contained schema, which no client can load in a single response and which exceeds the runtime's memory budget. The server builds it fast, but the payload itself is the wall, and no client (Codex does not paginate tools/list; the spec's cursor pagination only helps clients that do) can usefully receive a catalog that large. Switch non-code mode to a fixed two-tool surface instead: - `search({ query, limit?, offset? })` ranks over the whole catalog and returns only a bounded page, each hit carrying its own input schema so it can be called directly. - `invoke({ name, arguments? })` runs a tool by name, reusing the same resolve/invoke/pause/resume path (the resumed result stays unwrapped). 
This is the lazy-loading shape: the client pulls the handful of tools it needs rather than the whole catalog, so it works for any client and any catalog size. It is essentially code mode's own search/invoke primitives exposed as flat MCP tools instead of behind the `execute` sandbox. Engine: add a bounded `searchTools` seam (reuses the existing discovery ranking, enriches the page with schemas) and drop the now-unused `listTools` seam that backed the dump. Covered end to end: - codemode-off: a non-code session advertises search/invoke (not execute, not a dumped catalog); search finds a seeded connection's tools; invoke runs one and returns its real result; the pause/resume shape guard still holds. Green on self-host and the workerd DO. - codemode-scale: the full 16.5k Graph catalog is searched (bounded page) and invoked, with trace assertions that the catalog is never dumped, each invoke dispatches once, and a single invoke neither searches nor rebuilds the catalog. Green on cloud. --- apps/cloud/src/api/protected.test.ts | 2 +- apps/cloud/src/engine/execution-usage.ts | 4 +- e2e/scenarios/mcp-codemode-off.test.ts | 159 +++++++-------- e2e/scenarios/mcp-codemode-scale.test.ts | 203 ++++++++++++++++++++ packages/core/execution/src/engine.ts | 96 ++++++--- packages/core/execution/src/index.ts | 4 + packages/core/execution/src/tool-invoker.ts | 2 +- packages/hosts/mcp/src/tool-server.test.ts | 6 +- packages/hosts/mcp/src/tool-server.ts | 137 +++++++++---- 9 files changed, 475 insertions(+), 138 deletions(-) create mode 100644 e2e/scenarios/mcp-codemode-scale.test.ts diff --git a/apps/cloud/src/api/protected.test.ts b/apps/cloud/src/api/protected.test.ts index 0f926fd16..79bd2bf82 100644 --- a/apps/cloud/src/api/protected.test.ts +++ b/apps/cloud/src/api/protected.test.ts @@ -18,7 +18,7 @@ const makeBaseEngine = (): ExecutionEngine => }), getPausedExecution: () => Effect.succeed(null), getDescription: Effect.succeed("desc"), - listTools: Effect.succeed([]), + 
searchTools: () => Effect.succeed({ items: [], total: 0, hasMore: false, nextOffset: null }), invokeTool: () => Effect.succeed({ result: "ok", logs: [] }), invokeToolWithPause: () => Effect.succeed({ diff --git a/apps/cloud/src/engine/execution-usage.ts b/apps/cloud/src/engine/execution-usage.ts index 2e9af2273..20be7f4e7 100644 --- a/apps/cloud/src/engine/execution-usage.ts +++ b/apps/cloud/src/engine/execution-usage.ts @@ -20,8 +20,8 @@ export const withExecutionUsageTracking = ( resume: (executionId, response) => engine.resume(executionId, response), getPausedExecution: (executionId) => engine.getPausedExecution(executionId), getDescription: engine.getDescription, - // listTools is discovery, not an execution, so it doesn't count as usage. - listTools: engine.listTools, + // searchTools is discovery, not an execution, so it doesn't count as usage. + searchTools: (input) => engine.searchTools(input), // A direct tool invocation is an execution, so it counts the same as execute. invokeTool: (name, args, options) => engine diff --git a/e2e/scenarios/mcp-codemode-off.test.ts b/e2e/scenarios/mcp-codemode-off.test.ts index f1abb5fab..c34b6cdee 100644 --- a/e2e/scenarios/mcp-codemode-off.test.ts +++ b/e2e/scenarios/mcp-codemode-off.test.ts @@ -1,23 +1,24 @@ -// Transparent connection mode (`?codemode=false`). By default an Executor MCP +// Non-code connection mode (`?codemode=false`). By default an Executor MCP // session runs in "code mode": one `execute` tool the model writes TypeScript // against, discovering connections through `tools.search()` / // `tools.describe.tool()` and calling them as `tools.<...>()` inside the -// sandbox. Some clients instead want every tool enumerated directly (lazy / -// on-demand tool loading), so the session accepts `?codemode=false` and dumps -// the whole catalog as individually-callable MCP tools. This mirrors the -// `?codemode=false` switch in Cloudflare's MCP server. +// sandbox. 
Some clients can't drive a code sandbox and instead want to discover +// and call tools through plain MCP tool calls, so the session accepts +// `?codemode=false` and exposes two meta-tools, `search` and `invoke`, instead +// of `execute`. // -// The seam under test: the SAME connected identity, opened with the query -// param, advertises its tools by name instead of behind `execute`, and a -// by-name call routes straight to the tool invoker and returns the tool's real -// result. A default (code-mode) session of the same identity is the contrast: -// it still advertises only `execute`. +// Why not just dump every tool directly (the obvious reading of the Cloudflare +// `?codemode=false` switch)? Because a real catalog is enormous: the full +// Microsoft Graph connection alone is ~16.5k tools / hundreds of MB of inlined +// schema, which no client can load in one `tools/list`. `search`+`invoke` is the +// lazy-loading shape: the client searches for the handful of tools it needs and +// invokes them by name, so it scales to any catalog. (See mcp-codemode-scale.) // -// Cross-target: runs on every host that threads the codeMode flag through to the -// MCP server (cloud's Durable Object, self-host's in-process server, Cloudflare's -// DO). The connection tools are seeded from an OpenAPI fixture whose baseUrl is -// never contacted, and the verifiable direct call uses a built-in core tool, so -// the scenario is fully hermetic. +// The seam under test: the SAME connected identity, opened with the query param, +// advertises `search`/`invoke` instead of `execute`; `search` finds a seeded +// connection's tools, and `invoke` runs one and returns its real result. A +// default (code-mode) session of the same identity is the contrast: it still +// advertises only `execute`. 
import { randomBytes, randomUUID } from "node:crypto"; import { expect } from "@effect/vitest"; @@ -36,15 +37,20 @@ import { Api, Mcp, Target } from "../src/services"; const api = composePluginApi([openApiHttpPlugin()] as const); -// A built-in core tool present on every target. In transparent mode it is -// callable directly by this wire name (a static core tool's address has no -// `tools.` prefix, so it survives `addressToPath` unchanged), and it returns -// real data (the policy listing) we can verify. +// A built-in core tool present on every target. In non-code mode it is invoked +// by this wire name through the `invoke` meta-tool (a static core tool's address +// has no `tools.` prefix, so it survives `addressToPath` unchanged), and it +// returns real data (the policy listing) we can verify. const CORE_TOOL = "executor.coreTools.policies.list"; +// The approval-gated core tool used by the pause+resume scenario below. It gates +// on its own `requiresApproval` annotation (no policy needed), so invoking it +// pauses, and resuming exercises the non-code resume formatter. +const POLICY_CREATE_TOOL = "executor.coreTools.policies.create"; + // Minimal three-operation spec: three operations become three connection tools. // The baseUrl is never contacted; we only need the tools to exist in the -// catalog so transparent mode has something to dump. +// catalog so `search` has something to find. const ordersOpenApiSpec = (baseUrl: string): string => JSON.stringify({ openapi: "3.0.3", @@ -74,9 +80,9 @@ const ordersOpenApiSpec = (baseUrl: string): string => }, }); -// The engine advertises each tool under `addressToPath(address)`: a leading -// proxy-root `tools.` is stripped, everything else is left as-is. Deriving the -// expected name from the same catalog the engine reads keeps the assertion from +// `search`/`invoke` use the same `addressToPath(address)` the engine does: a +// leading proxy-root `tools.` is stripped, everything else is left as-is. 
+// Deriving the expected name from the same catalog keeps the assertion from // drifting if the address format changes. const wireName = (address: string): string => address.startsWith("tools.") ? address.slice("tools.".length) : address; @@ -89,13 +95,16 @@ const apiKeyTemplate = [ }, ] as const; -// The approval-gated core tool used by the pause+resume scenario below. It -// gates on its own `requiresApproval` annotation (no policy needed), so a direct -// transparent-mode call pauses, and resuming it exercises the resume formatter. -const POLICY_CREATE_TOOL = "executor.coreTools.policies.create"; +type SearchPage = { + readonly items?: ReadonlyArray<{ readonly name?: string; readonly inputSchema?: unknown }>; + readonly total?: number; +}; + +const searchPageOf = (raw: unknown): SearchPage => + ((raw as { structuredContent?: SearchPage }).structuredContent ?? {}) as SearchPage; scenario( - "MCP · ?codemode=false dumps every tool directly instead of `execute`", + "MCP · ?codemode=false exposes search + invoke instead of `execute`", { timeout: 120_000 }, Effect.gen(function* () { const target = yield* Target; @@ -127,7 +136,7 @@ scenario( yield* Effect.ensuring( Effect.gen(function* () { - // Seed an integration + connection so there are connection tools to dump. + // Seed an integration + connection so `search` has tools to find. const added = yield* apiClient.openapi.addSpec({ payload: { spec: { kind: "blob", value: ordersOpenApiSpec(specBaseUrl) }, @@ -149,8 +158,7 @@ scenario( }, }); - // Derive the exact wire names transparent mode must advertise from the - // catalog itself, applying the same `tools.`-strip the engine does. + // The exact wire names `search` should surface, derived from the catalog. const catalog = yield* apiClient.tools.list({ query: { integration: IntegrationSlug.make(slug) }, }); @@ -161,52 +169,54 @@ scenario( ).toBe(3); // A policy with an unrelated pattern: it does NOT gate `policies.list`, - // so the direct call below runs ungated. 
Its id only has to appear in - // the listing to prove the tool actually executed and returned data. + // so the invoke below runs ungated. Its id only has to appear in the + // listing to prove the tool actually executed and returned data. const policy = yield* apiClient.policies.create({ payload: { owner: "org", pattern: `codemode.gate.${nonce}`, action: "block" }, }); yield* Effect.ensuring( Effect.gen(function* () { - // 1) Transparent mode: the tool list IS the tools, not `execute`. - const transparent = mcp.session(identity, { codeMode: false }); - const transparentTools = yield* transparent.listTools(); + const noncode = mcp.session(identity, { codeMode: false }); - expect(transparentTools, "code mode's `execute` is gone").not.toContain("execute"); - expect( - transparentTools, - "the code-mode meta-tool `search` is not advertised", - ).not.toContain("search"); + // 1) The advertised tools are the meta-tools, NOT `execute` and NOT a + // dumped catalog. + const tools = yield* noncode.listTools(); + expect(tools, "search is advertised").toContain("search"); + expect(tools, "invoke is advertised").toContain("invoke"); + expect(tools, "code mode's `execute` is gone").not.toContain("execute"); expect( - transparentTools, - "the code-mode meta-tool `describe.tool` is not advertised", - ).not.toContain("describe.tool"); + tools, + "the catalog is NOT dumped directly (that is the whole point)", + ).not.toContain(expectedConnectionTools[0]!); + // 2) `search` finds the seeded connection's tools, each with a schema. + const search = yield* noncode.call("search", { query: slug }); + expect(search.ok, "search completed without error").toBe(true); + const page = searchPageOf(search.raw); + const found = (page.items ?? 
[]).map((item) => item.name); for (const name of expectedConnectionTools) { - expect(transparentTools, `connection tool ${name} is advertised directly`).toContain( - name, - ); + expect(found, `search surfaced connection tool ${name}`).toContain(name); } - expect(transparentTools, "built-in core tools are dumped too").toContain(CORE_TOOL); + expect( + (page.items ?? []).every((item) => item.inputSchema != null), + "each search hit carries its input schema, so it can be invoked directly", + ).toBe(true); - // 2) A direct call by name runs the tool and returns its real result. - const result = yield* transparent.call(CORE_TOOL, {}); - expect(result.ok, "the direct tool call completed without error").toBe(true); + // 3) `invoke` runs a tool by name and returns its real result. + const invoked = yield* noncode.call("invoke", { name: CORE_TOOL, arguments: {} }); + expect(invoked.ok, "the invoke completed without error").toBe(true); expect( - result.text, + invoked.text, "the listing the tool returned includes the policy we created", ).toContain(policy.id); - // 3) Contrast: the same identity in default (code) mode still gets - // the single `execute` tool and does NOT dump the connection tools. - // The query param is the only thing that flips behavior. + // 4) Contrast: the same identity in default (code) mode still gets the + // single `execute` tool and not the meta-tools. const codeModeSession = mcp.session(identity); const codeModeTools = yield* codeModeSession.listTools(); expect(codeModeTools, "code mode still advertises `execute`").toContain("execute"); - expect(codeModeTools, "code mode does not dump the connection tools").not.toContain( - expectedConnectionTools[0]!, - ); + expect(codeModeTools, "code mode does not advertise `search`").not.toContain("search"); }), apiClient.policies .remove({ params: { policyId: policy.id }, payload: { owner: "org" } }) @@ -218,15 +228,15 @@ scenario( }), ); -// Result-shape parity across the pause boundary. 
A transparent-mode tool that -// pauses for approval and then resumes must return the SAME shape it would have +// Result-shape parity across the pause boundary. An `invoke`d tool that pauses +// for approval and then resumes must return the SAME shape it would have // returned without pausing: the tool's own result, unwrapped from the // `ToolResult` envelope. The `resume` machinery is shared with code mode, where a // completion is an `execute` envelope (`{ status, result, logs }`); a regression -// here formatted the resumed direct-tool result that same way, so a transparent -// client got the code-mode envelope instead of the policy fields. This drives the -// approval-gated `policies.create` through pause -> approve -> resume and asserts -// the resumed structured content is the policy itself. +// here formatted the resumed direct-tool result that same way, so a non-code +// client got the code-mode envelope instead of the tool's fields. This drives +// the approval-gated `policies.create` through invoke -> pause -> approve -> +// resume and asserts the resumed structured content is the policy itself. scenario( "MCP · ?codemode=false keeps the unwrapped tool result across an approval pause+resume", { timeout: 120_000 }, @@ -257,22 +267,21 @@ scenario( yield* Effect.ensuring( Effect.gen(function* () { - const transparent = mcp.session(identity, { codeMode: false }); - yield* transparent.listTools(); + const noncode = mcp.session(identity, { codeMode: false }); + yield* noncode.listTools(); - // Direct by-name call to the approval-gated tool. No policy is in play, so - // the only thing that can pause it is its own `requiresApproval` - // annotation. The paused result carries the executionId to resume. - const paused = yield* transparent.call(POLICY_CREATE_TOOL, { - owner: "org", - pattern, - action: "block", + // Invoke the approval-gated tool by name. No policy is in play, so the + // only thing that can pause it is its own `requiresApproval` annotation. 
+ // The paused result carries the executionId to resume. + const paused = yield* noncode.call("invoke", { + name: POLICY_CREATE_TOOL, + arguments: { owner: "org", pattern, action: "block" }, }); expect(paused.text, "the gated tool paused for approval").toContain("Execution paused"); expect(paused.text, "the paused result carries an executionId").toContain("executionId:"); // Approve and resume. - const resumed = yield* transparent.approvePaused(paused.text); + const resumed = yield* noncode.approvePaused(paused.text); expect(resumed.ok, "the resumed call completed without error").toBe(true); const structured = (resumed.raw as { structuredContent?: Record }) @@ -286,7 +295,7 @@ scenario( ).toBe(pattern); expect( structured?.result, - "the code-mode execute envelope (status/result/logs) is not used in transparent mode", + "the code-mode execute envelope (status/result/logs) is not used in non-code mode", ).toBeUndefined(); }), cleanup, diff --git a/e2e/scenarios/mcp-codemode-scale.test.ts b/e2e/scenarios/mcp-codemode-scale.test.ts new file mode 100644 index 000000000..ee5c26aea --- /dev/null +++ b/e2e/scenarios/mcp-codemode-scale.test.ts @@ -0,0 +1,203 @@ +// Non-code mode (`?codemode=false`) at real catalog scale. This is the scenario +// that justified the design: dumping every tool directly does not scale (the +// full Microsoft Graph catalog is ~16.5k tools / hundreds of MB of inlined +// schema, far too big for any client to load in one `tools/list`). `search` + +// `invoke` is the lazy-loading answer: a fixed two-tool surface that never +// returns more than a bounded page, no matter how large the catalog is. 
+// +// What it asserts, against the full Graph catalog and the suite's trace store: +// - the non-code session advertises only the meta-tools, NOT the 16.5k-tool +// catalog (the dump is gone); +// - `search` over the whole catalog returns a small bounded page, each hit +// carrying its own schema; +// - each invocation dispatches the tool exactly once (`executor.tool.execute`), +// with no fan-out; +// - a single invocation's trace neither searches nor rebuilds the catalog +// (no `executor.tools.search`, no `executor.tools.sync_stale`) — resolving +// one tool out of 16.5k is O(1), it does not touch the rest of the catalog; +// - the catalog is served from persisted bindings, not re-parsed on every read +// (`executor.tools.sync_stale`, scoped to this run, fires at most once). +// +// Telemetry is only wired on targets that boot motel (cloud today), so this +// scenario yields `Telemetry` up front and skips cleanly elsewhere. It drives +// only public surfaces (typed API + MCP), so a green run is real evidence. 
+import { randomBytes } from "node:crypto"; + +import { expect } from "@effect/vitest"; +import { Effect, Schedule } from "effect"; +import { composePluginApi } from "@executor-js/api/server"; +import { + MICROSOFT_AUTH_TEMPLATE_SLUG, + MICROSOFT_GRAPH_ALL_PRESET_IDS, +} from "@executor-js/plugin-microsoft"; +import { microsoftHttpPlugin } from "@executor-js/plugin-microsoft/api"; +import { AuthTemplateSlug, ConnectionName, IntegrationSlug } from "@executor-js/sdk/shared"; + +import { scenario } from "../src/scenario"; +import type { ExportedSpan, SpanQuery, TelemetrySurface } from "../src/surfaces/telemetry"; +import { Api, Mcp, Target, Telemetry } from "../src/services"; + +const api = composePluginApi([microsoftHttpPlugin()] as const); + +const SEARCH_LIMIT = 5; +const HOW_MANY_INVOCATIONS = 3; + +type SearchPage = { + readonly items?: ReadonlyArray<{ readonly name?: string; readonly inputSchema?: unknown }>; + readonly total?: number; +}; + +const searchPageOf = (raw: unknown): SearchPage => + ((raw as { structuredContent?: SearchPage }).structuredContent ?? {}) as SearchPage; + +// Spans flush ~1s after the request (BatchSpanProcessor, drained on waitUntil). +// Poll the store until at least `n` matching spans have arrived, then hand the +// set back so the caller can assert the exact count. ~20s ceiling: slower is a +// real export bug, and the test should fail rather than hang. 
+const searchUntilCount = ( + telemetry: TelemetrySurface, + query: SpanQuery, + n: number, +): Effect.Effect => + telemetry.searchSpans(query).pipe( + Effect.filterOrFail( + (spans) => spans.length >= n, + (spans) => `expected >= ${n} spans for ${JSON.stringify(query)}, saw ${spans.length}`, + ), + Effect.retry(Schedule.both(Schedule.spaced("500 millis"), Schedule.recurs(40))), + ); + +scenario( + "MCP · ?codemode=false searches a 16k-tool catalog and invokes without dumping it", + { timeout: 300_000 }, + Effect.gen(function* () { + const target = yield* Target; + const { client } = yield* Api; + const mcp = yield* Mcp; + // Skips on any target without a trace store (selfhost, cloudflare today). + const telemetry = yield* Telemetry; + const identity = yield* target.newIdentity(); + const apiClient = yield* client(api, identity); + + const slug = `codemode-scale-${randomBytes(4).toString("hex")}`; + const connection = ConnectionName.make("main"); + + const cleanup = Effect.gen(function* () { + yield* apiClient.connections + .remove({ + params: { owner: "org", integration: IntegrationSlug.make(slug), name: connection }, + }) + .pipe(Effect.ignore); + yield* apiClient.microsoft + .removeGraph({ params: { slug: IntegrationSlug.make(slug) } }) + .pipe(Effect.ignore); + }); + + yield* Effect.ensuring( + Effect.gen(function* () { + // Seed the full Graph catalog: every workload, ~16.5k operations. + const added = yield* apiClient.microsoft.addGraph({ + payload: { + presetIds: [...MICROSOFT_GRAPH_ALL_PRESET_IDS], + customScopes: [], + slug, + name: "Microsoft Graph (codemode scale)", + }, + }); + expect( + added.toolCount, + "the full Graph catalog extracts thousands of tools", + ).toBeGreaterThan(5_000); + + // A static token is enough to exercise resolve+invoke; the upstream 401 + // surfaces as a tool failure, which still emits the spans we assert on. 
+ yield* apiClient.connections.create({ + payload: { + owner: "org", + name: connection, + integration: IntegrationSlug.make(slug), + template: AuthTemplateSlug.make(MICROSOFT_AUTH_TEMPLATE_SLUG), + value: "token-xyz", + }, + }); + + const noncode = mcp.session(identity, { codeMode: false }); + + // 1) The non-code session advertises the meta-tools, NOT the 16.5k-tool + // catalog. This is the whole point: the catalog is never dumped. + const tools = yield* noncode.listTools(); + expect(tools, "search is advertised").toContain("search"); + expect(tools, "invoke is advertised").toContain("invoke"); + expect( + tools.length, + "the giant catalog is not dumped — only the fixed meta-tools are advertised", + ).toBeLessThan(5); + + // 2) `search` ranks over the whole catalog but returns a small bounded + // page, each hit with its own schema. + const search = yield* noncode.call("search", { query: "user", limit: SEARCH_LIMIT }); + expect(search.ok, "search completed without error").toBe(true); + const page = searchPageOf(search.raw); + const hits = page.items ?? []; + expect(hits.length, "search returns a bounded page, not the catalog").toBeLessThanOrEqual( + SEARCH_LIMIT, + ); + expect(hits.length, "search found matching tools").toBeGreaterThan(0); + expect( + hits.every((hit) => hit.inputSchema != null), + "each hit carries its input schema", + ).toBe(true); + + const targetTool = hits[0]!.name!; + // The `executor.tool.execute` span stamps the full address, which is + // `tools.` (the proxy-root prefix the wire name strips). + const executeToolName = `tools.${targetTool}`; + + // 3) Invoke the found tool several times. Each hits Graph with the fake + // token (401 -> tool failure) but exercises the full resolve+invoke + // path and emits one execute span per call. + for (let i = 0; i < HOW_MANY_INVOCATIONS; i++) { + yield* noncode.call("invoke", { name: targetTool, arguments: {} }); + } + + // (a) Every invocation dispatched the tool exactly once: no fan-out. 
+ const executes = yield* searchUntilCount( + telemetry, + { + operation: "executor.tool.execute", + attributes: { "mcp.tool.name": executeToolName }, + }, + HOW_MANY_INVOCATIONS, + ); + expect(executes.length, "each invocation dispatches the tool exactly once").toBe( + HOW_MANY_INVOCATIONS, + ); + + // (b) A single invocation is O(1) in the catalog: its whole trace neither + // searches nor rebuilds the catalog. Resolving one tool out of 16.5k + // must not touch the rest. + const invokeTrace = yield* telemetry.searchSpans({ traceId: executes[0]!.traceId }); + const operations = invokeTrace.map((entry) => entry.span.operationName); + expect(operations, "an invocation does not search the whole catalog").not.toContain( + "executor.tools.search", + ); + expect(operations, "an invocation does not rebuild the catalog").not.toContain( + "executor.tools.sync_stale", + ); + + // (c) The catalog is served from persisted bindings, not re-parsed on + // every read: the per-connection rebuild for THIS integration fires + // at most once across the search above. 
+ const rebuilds = yield* telemetry.searchSpans({ + operation: "executor.tools.sync_stale", + attributes: { "executor.integration": slug }, + }); + expect( + rebuilds.length, + "the catalog is not rebuilt/re-parsed on every read", + ).toBeLessThanOrEqual(1); + }), + cleanup, + ); + }), +); diff --git a/packages/core/execution/src/engine.ts b/packages/core/execution/src/engine.ts index 36f4a6040..38c025cca 100644 --- a/packages/core/execution/src/engine.ts +++ b/packages/core/execution/src/engine.ts @@ -13,7 +13,7 @@ import { CodeExecutionError } from "@executor-js/codemode-core"; import type { CodeExecutor, ExecuteResult, SandboxToolInvoker } from "@executor-js/codemode-core"; import { - addressToPath, + pathToAddress, defaultToolDiscoveryProvider, makeExecutorToolInvoker, listExecutorSources, @@ -53,6 +53,28 @@ export type ToolListing = { readonly inputSchema: unknown; }; +/** One ranked search hit for the non-code-mode `search` tool: a directly + * invocable `name` plus enough schema to call it. Same shape as a + * {@link ToolListing}, returned for only the matched page rather than the + * whole catalog. */ +export type ToolSearchResult = ToolListing; + +/** A page of {@link ToolSearchResult}s. `total` is the match count before + * pagination so the caller can tell it was truncated; `nextOffset` is the + * offset to pass back for the next page, or null at the end. */ +export type ToolSearchPage = { + readonly items: readonly ToolSearchResult[]; + readonly total: number; + readonly hasMore: boolean; + readonly nextOffset: number | null; +}; + +/** Default and ceiling for `search` page size. Search returns each hit's full + * self-contained schema, so the page is bounded to keep the response small + * even when the catalog has tens of thousands of tools. */ +export const DEFAULT_SEARCH_LIMIT = 10; +export const MAX_SEARCH_LIMIT = 25; + /** Internal representation with Effect runtime state for pause/resume. 
*/ type InternalPausedExecution = PausedExecution & { readonly response: Deferred.Deferred; @@ -439,11 +461,16 @@ export type ExecutionEngine readonly getDescription: Effect.Effect; /** - * Enumerate every directly-callable tool with a self-contained input schema. - * Backs the non-code-mode MCP surface that exposes each tool individually - * instead of behind the single `execute` tool. + * Ranked, paginated tool search backing the non-code-mode `search` tool. + * Returns only the matched page (each hit with its self-contained input + * schema), so it scales to catalogs far too large to enumerate in one + * `listTools`. The lazy-loading counterpart to {@link listTools}. */ - readonly listTools: Effect.Effect; + readonly searchTools: (input: { + readonly query: string; + readonly limit?: number; + readonly offset?: number; + }) => Effect.Effect; /** * Invoke a single tool by its wire name with elicitation handled inline by @@ -719,26 +746,47 @@ export const createExecutionEngine = = executor.tools - .list({ includeSchemas: true }) - .pipe( - Effect.map((tools) => - tools.map( - (tool): ToolListing => ({ - name: addressToPath(String(tool.address)), - description: tool.description, - inputSchema: tool.inputSchema ?? { type: "object" }, - }), - ), - ), - // oxlint-disable-next-line executor/no-effect-escape-hatch -- boundary: ExecutionEngine.listTools exposes no error channel; a catalog read the listing surface can't recover from dies rather than forcing every caller to thread a typed error + const searchTools = (input: { + readonly query: string; + readonly limit?: number; + readonly offset?: number; + }): Effect.Effect => + Effect.gen(function* () { + const limit = Math.min(Math.max(input.limit ?? DEFAULT_SEARCH_LIMIT, 1), MAX_SEARCH_LIMIT); + const offset = Math.max(input.offset ?? 
0, 0); + const page = yield* toolDiscoveryProvider.searchTools({ + executor, + query: input.query, + limit, + offset, + }); + const items = yield* Effect.forEach( + page.items, + (hit) => + executor.tools.schema(pathToAddress(hit.path)).pipe( + Effect.map( + (schema): ToolSearchResult => ({ + name: hit.path, + description: hit.description, + inputSchema: schema?.inputSchema ?? { type: "object" }, + }), + ), + ), + { concurrency: "unbounded" }, + ); + return { items, total: page.total, hasMore: page.hasMore, nextOffset: page.nextOffset }; + }).pipe( + // oxlint-disable-next-line executor/no-effect-escape-hatch -- boundary: ExecutionEngine.searchTools exposes no error channel; a catalog read the search surface can't recover from dies rather than forcing every caller to thread a typed error Effect.orDie, - Effect.withSpan("mcp.list_tools"), + Effect.withSpan("mcp.search_tools", { attributes: { "mcp.search.query": input.query } }), ); return { @@ -748,7 +796,7 @@ export const createExecutionEngine = Effect.sync(() => pausedExecutions.get(executionId) ?? 
null), getDescription: buildExecuteDescription(executor), - listTools, + searchTools, invokeTool: invokeToolInline, invokeToolWithPause, }; diff --git a/packages/core/execution/src/index.ts b/packages/core/execution/src/index.ts index 8388fcd93..2097878f4 100644 --- a/packages/core/execution/src/index.ts +++ b/packages/core/execution/src/index.ts @@ -2,12 +2,16 @@ export { createExecutionEngine, formatExecuteResult, formatPausedExecution, + DEFAULT_SEARCH_LIMIT, + MAX_SEARCH_LIMIT, type ExecutionEngine, type ExecutionEngineConfig, type ExecutionResult, type PausedExecution, type ResumeResponse, type ToolListing, + type ToolSearchPage, + type ToolSearchResult, } from "./engine"; export { buildExecuteDescription } from "./description"; diff --git a/packages/core/execution/src/tool-invoker.ts b/packages/core/execution/src/tool-invoker.ts index 5c324d77a..1bbabb0f9 100644 --- a/packages/core/execution/src/tool-invoker.ts +++ b/packages/core/execution/src/tool-invoker.ts @@ -56,7 +56,7 @@ const ADDRESS_PREFIX = "tools."; * namespaces) are addressed by their fqid with no prefix; the executor resolves * those from its static map directly, so leave them untouched. 
*/ -const pathToAddress = (path: string): ToolAddress => { +export const pathToAddress = (path: string): ToolAddress => { if (path.startsWith(ADDRESS_PREFIX)) return ToolAddress.make(path); if (parseToolAddress(`${ADDRESS_PREFIX}${path}`)) { return ToolAddress.make(`${ADDRESS_PREFIX}${path}`); diff --git a/packages/hosts/mcp/src/tool-server.test.ts b/packages/hosts/mcp/src/tool-server.test.ts index 717eac2ae..14675a99e 100644 --- a/packages/hosts/mcp/src/tool-server.test.ts +++ b/packages/hosts/mcp/src/tool-server.test.ts @@ -30,7 +30,7 @@ const makeStubEngine = (overrides: { execute?: ExecutionEngine["execute"]; executeWithPause?: ExecutionEngine["executeWithPause"]; resume?: ExecutionEngine["resume"]; - listTools?: ExecutionEngine["listTools"]; + searchTools?: ExecutionEngine["searchTools"]; invokeTool?: ExecutionEngine["invokeTool"]; invokeToolWithPause?: ExecutionEngine["invokeToolWithPause"]; description?: string; @@ -42,7 +42,9 @@ const makeStubEngine = (overrides: { resume: overrides.resume ?? (() => Effect.succeed(null)), getPausedExecution: () => Effect.succeed(null), getDescription: Effect.succeed(overrides.description ?? "test executor"), - listTools: overrides.listTools ?? Effect.succeed([]), + searchTools: + overrides.searchTools ?? + (() => Effect.succeed({ items: [], total: 0, hasMore: false, nextOffset: null })), invokeTool: overrides.invokeTool ?? (() => Effect.succeed({ result: "default" })), invokeToolWithPause: overrides.invokeToolWithPause ?? 
diff --git a/packages/hosts/mcp/src/tool-server.ts b/packages/hosts/mcp/src/tool-server.ts index 9e320e604..938fec54c 100644 --- a/packages/hosts/mcp/src/tool-server.ts +++ b/packages/hosts/mcp/src/tool-server.ts @@ -29,9 +29,12 @@ import { createExecutionEngine, formatExecuteResult, formatPausedExecution, + DEFAULT_SEARCH_LIMIT, + MAX_SEARCH_LIMIT, type ExecutionEngine, type ExecutionEngineConfig, type ResumeResponse, + type ToolSearchPage, } from "@executor-js/execution"; // --------------------------------------------------------------------------- @@ -98,12 +101,13 @@ type SharedMcpServerConfig = { }; readonly browserApprovalStore?: BrowserApprovalStore; /** - * When `false`, run in transparent (non-code) mode: every available tool is - * enumerated as a directly-callable MCP tool instead of being reached through - * the single `execute` code tool. Defaults to `true` (code mode). Selected by - * a `?codemode=false` query param on the MCP endpoint, mirroring the same - * switch Cloudflare's MCP exposes — for clients that lazily load tools and - * need them listed individually. + * When `false`, run in non-code mode: instead of the single `execute` code + * tool, expose two meta-tools, `search` (find tools, ranked + paginated) and + * `invoke` (call a tool by name). This is the lazy-loading surface for clients + * that can't drive a code sandbox; it scales to any catalog size because + * `search` only ever returns a bounded page (dumping a large catalog directly + * does not). Defaults to `true` (code mode). Selected by a `?codemode=false` + * query param on the MCP endpoint. */ readonly codeMode?: boolean; }; @@ -448,24 +452,6 @@ const toMcpPausedResult = (formatted: ReturnType): const renderToolValueText = (value: unknown): string => typeof value === "string" ? value : JSON.stringify(value ?? 
null, null, 2); -// Transparent mode advertises every tool's input schema verbatim over the wire, -// and the MCP client validates each `inputSchema` root against `type: "object"`. -// A tool whose top-level input is a union (e.g. `executor.mcp.addServer`) -// compiles to `{ anyOf: [...] }` with no root `type`, which makes the client -// reject the ENTIRE `tools/list` response. Executor always invokes tools with a -// named-args object, so it is safe to stamp `type: "object"` onto any root that -// lacks one while preserving the rest of the schema (the union variants stay in -// `anyOf`; MCP's `.passthrough()` keeps the extra keys, and the tool invoker -// still enforces the real schema). -const toMcpInputSchema = (schema: unknown): Record => { - if (schema !== null && typeof schema === "object" && !Array.isArray(schema)) { - const record = schema as Record; - if (record.type === "object") return record; - if (record.type === undefined) return { ...record, type: "object" }; - } - return { type: "object" }; -}; - const toolErrorText = (error: ToolError): string => { const status = error.status != null ? ` (status ${error.status})` : ""; // oxlint-disable-next-line executor/no-unknown-error-message -- boundary: ToolError is a typed struct whose `message` is a schema field, not an unknown error @@ -480,6 +466,29 @@ const renderToolData = (data: unknown): McpToolResult => { }; }; +// A `search` result page: render the matches as text plus structured content so +// the model can read either. The page (items + total + nextOffset) is a record, +// so it rides `structuredContent` directly. +const renderSearchResult = (page: ToolSearchPage): McpToolResult => ({ + content: [{ type: "text", text: JSON.stringify(page, null, 2) }], + structuredContent: { ...page }, +}); + +// A call for a name that is neither `search`, `invoke`, nor `resume`. 
In +// search+invoke mode only those meta-tools are advertised; everything else is +// reached by name through `invoke`, so a direct call to a tool name is a client +// mistake worth naming explicitly. +const unknownMetaToolResult = (name: string): McpToolResult => ({ + content: [ + { + type: "text", + text: `Error: unknown tool "${name}". This connection exposes "search" (find tools) and "invoke" (call a tool by name).`, + }, + ], + structuredContent: { status: "error", error: `unknown tool: ${name}` }, + isError: true, +}); + const toNonCodeMcpResult = (result: FormattedExecuteInput): McpToolResult => { // Engine-level failure (declined approval, opaque defect surfaced as a // string) — not a tool-domain failure, but still an error for the client. @@ -925,7 +934,56 @@ export const createExecutorMcpServer = ( }), ); } else { - const toolListings = yield* engine.listTools; + // Non-code mode: instead of dumping the whole catalog (a large catalog + // produces a tools/list far too big for clients to load or the runtime to + // hold), expose two meta-tools, `search` and `invoke`. The client searches + // for the handful of tools it needs and invokes them by name. This is the + // lazy-loading counterpart to code mode's `execute`, and it scales to any + // catalog size because `search` only ever returns a bounded page. + const searchWireTool = { + name: "search", + description: [ + "Search the available tools by keyword. Returns ranked matches, each with its input schema, so you can call it with `invoke`.", + `Page with \`limit\` (default ${DEFAULT_SEARCH_LIMIT}, max ${MAX_SEARCH_LIMIT}) and \`offset\`; \`total\` and \`nextOffset\` in the result tell you whether there is more.`, + ].join("\n"), + inputSchema: { + type: "object" as const, + properties: { + query: { + type: "string", + description: + "Keywords matched against tool names and descriptions. 
Empty returns the top tools.", + }, + limit: { + type: "number", + description: `Maximum matches to return (default ${DEFAULT_SEARCH_LIMIT}, max ${MAX_SEARCH_LIMIT}).`, + }, + offset: { + type: "number", + description: "Offset into the ranked results, for pagination.", + }, + }, + }, + }; + const invokeWireTool = { + name: "invoke", + description: [ + "Invoke a tool by name with its arguments.", + "Get the tool `name` and its input schema from `search` first, then pass `arguments` matching that schema.", + ].join("\n"), + inputSchema: { + type: "object" as const, + properties: { + name: { type: "string", description: "The tool name exactly as returned by `search`." }, + arguments: { + type: "object", + description: "Arguments object matching the tool's input schema.", + additionalProperties: true, + }, + }, + required: ["name"], + }, + }; const resumeWireTool = elicitationMode.mode === "native" ? undefined @@ -977,16 +1035,13 @@ export const createExecutorMcpServer = ( }; const wireTools = [ - ...toolListings.map((tool) => ({ - name: tool.name, - description: tool.description, - inputSchema: toMcpInputSchema(tool.inputSchema), - })), + searchWireTool, + invokeWireTool, ...(resumeWireTool ? [resumeWireTool] : []), ]; yield* Effect.sync(() => { - // `registerTool` normally declares this; transparent mode bypasses it. + // `registerTool` normally declares this; the low-level handlers bypass it. server.server.registerCapabilities({ tools: { listChanged: false } }); server.server.setRequestHandler(ListToolsRequestSchema, () => ({ tools: wireTools })); @@ -1006,10 +1061,26 @@ export const createExecutorMcpServer = ( ), ); } - return runToolEffect(invokeSingleTool(name, args)); + if (name === "search") { + return runToolEffect( + engine + .searchTools({ + query: typeof args.query === "string" ? args.query : "", + limit: typeof args.limit === "number" ? args.limit : undefined, + offset: typeof args.offset === "number" ? 
args.offset : undefined, + }) + .pipe(Effect.map(renderSearchResult)), + ); + } + if (name === "invoke") { + const toolName = readArgString(args.name); + const toolArgs = isRecord(args.arguments) ? args.arguments : {}; + return runToolEffect(invokeSingleTool(toolName, toolArgs)); + } + return runToolEffect(Effect.succeed(unknownMetaToolResult(name))); }); }).pipe( - Effect.withSpan("mcp.host.register_transparent_tools", { + Effect.withSpan("mcp.host.register_search_invoke", { attributes: { "mcp.tool.count": wireTools.length }, }), ); From 175edf5b67d9bb79a2c21407ecb6abf7a3aa2ff9 Mon Sep 17 00:00:00 2001 From: Rhys Sullivan Date: Thu, 25 Jun 2026 10:40:53 -0700 Subject: [PATCH 4/4] Drop dead includeSchemas path left by the tool dump The non-code mode no longer dumps the catalog, so nothing enumerates tools with their schemas anymore. Revert `tools.list`'s `includeSchemas` branch (the bulk self-contained-schema enrichment) to the original projected-only listing, drop the `ToolListFilter.includeSchemas` field, and fold the now-single-use `ToolListing` type into `ToolSearchResult`. Also refresh the doc comments that still described the dump. 
--- e2e/src/surfaces/mcp.ts | 6 +- packages/core/execution/src/engine.ts | 18 ++--- packages/core/execution/src/index.ts | 1 - packages/core/sdk/src/executor.ts | 72 +++++-------------- packages/core/sdk/src/tool.ts | 5 -- packages/hosts/mcp/src/browser-approval.ts | 7 +- .../hosts/mcp/src/in-memory-session-store.ts | 2 +- 7 files changed, 32 insertions(+), 79 deletions(-) diff --git a/e2e/src/surfaces/mcp.ts b/e2e/src/surfaces/mcp.ts index a7607dfed..e01f76f07 100644 --- a/e2e/src/surfaces/mcp.ts +++ b/e2e/src/surfaces/mcp.ts @@ -163,7 +163,7 @@ export interface McpSurface { options?: { readonly elicitationMode?: McpElicitationMode; /** Pass `false` to add `?codemode=false`, switching the session into - * transparent mode (every tool registered directly instead of behind the + * non-code mode (the `search` and `invoke` meta-tools instead of the * single `execute` tool). Omitted/`true` keeps the default code mode. */ readonly codeMode?: boolean; }, @@ -276,8 +276,8 @@ export const makeMcpSurface = (target: Target, runDir?: string): McpSurface => ( const serverName = `${target.name}-${randomUUID().slice(0, 8)}`; // Session config rides query params on the MCP endpoint, per ecosystem // convention: `?elicitation_mode=` (a paused execution yields an approvalUrl - // instead of letting the model resume inline) and `?codemode=false` (every - // tool registered directly instead of behind the single `execute` tool). + // instead of letting the model resume inline) and `?codemode=false` (the + // `search`/`invoke` meta-tools instead of the single `execute` tool). 
const sessionUrl = (() => { const url = new URL(target.mcpUrl); if (options?.elicitationMode) { diff --git a/packages/core/execution/src/engine.ts b/packages/core/execution/src/engine.ts index 38c025cca..d362845ca 100644 --- a/packages/core/execution/src/engine.ts +++ b/packages/core/execution/src/engine.ts @@ -42,23 +42,17 @@ export type PausedExecution = { readonly elicitationContext: ElicitationContext; }; -/** One directly-callable tool, as enumerated for non-code-mode MCP. The - * `name` is the sandbox-callable path (`...` - * or a static fqid), which doubles as the wire tool name clients call back - * with. `inputSchema` is self-contained JSON Schema (shared `$defs` already - * inlined by `tools.list({ includeSchemas: true })`). */ -export type ToolListing = { +/** One ranked hit from the non-code-mode `search` tool. `name` is the + * sandbox-callable path (`...` or a + * static fqid), which doubles as the wire name a client passes to `invoke`. + * `inputSchema` is self-contained JSON Schema so the hit can be called + * directly without a second round-trip. */ +export type ToolSearchResult = { readonly name: string; readonly description?: string; readonly inputSchema: unknown; }; -/** One ranked search hit for the non-code-mode `search` tool: a directly - * invocable `name` plus enough schema to call it. Same shape as a - * {@link ToolListing}, returned for only the matched page rather than the - * whole catalog. */ -export type ToolSearchResult = ToolListing; - /** A page of {@link ToolSearchResult}s. `total` is the match count before * pagination so the caller can tell it was truncated; `nextOffset` is the * offset to pass back for the next page, or null at the end. 
*/ diff --git a/packages/core/execution/src/index.ts b/packages/core/execution/src/index.ts index 2097878f4..fa5c05a21 100644 --- a/packages/core/execution/src/index.ts +++ b/packages/core/execution/src/index.ts @@ -9,7 +9,6 @@ export { type ExecutionResult, type PausedExecution, type ResumeResponse, - type ToolListing, type ToolSearchPage, type ToolSearchResult, } from "./engine"; diff --git a/packages/core/sdk/src/executor.ts b/packages/core/sdk/src/executor.ts index 7a587aea0..38e12a3c6 100644 --- a/packages/core/sdk/src/executor.ts +++ b/packages/core/sdk/src/executor.ts @@ -132,7 +132,7 @@ import { import { ToolSchemaView, type IntegrationDetectionResult } from "./types"; import { type Tool, type ToolAnnotations, type ToolDef, type ToolListFilter } from "./tool"; import { buildToolTypeScriptPreview } from "./schema-types"; -import { collectReferencedDefinitions, reattachDefs } from "./schema-refs"; +import { collectReferencedDefinitions } from "./schema-refs"; import { refreshAccessToken, exchangeClientCredentials, @@ -2473,56 +2473,22 @@ export const createExecutor = => Effect.gen(function* () { yield* syncStaleConnectionTools; - const includeSchemas = filter?.includeSchemas ?? false; - // Projected by default: the list surface is metadata (address, - // description, annotations) — loading every tool's input/output - // schema JSON made an unbounded list scale with schema bytes, not - // tool count. Callers that enumerate tools as directly-callable - // definitions (non-code-mode MCP) opt in with `includeSchemas`, which - // loads the full rows and inlines each tool's referenced shared - // `$defs` so the returned schemas are self-contained. - const where = (b: AnyCb) => - b.and( - filter?.integration === undefined - ? true - : b("integration", "=", String(filter.integration)), - filter?.owner === undefined ? true : b("owner", "=", filter.owner), - filter?.connection === undefined - ? 
true - : b("connection", "=", String(filter.connection)), - ); - const rows = includeSchemas - ? yield* core.findMany("tool", { where }) - : yield* core.findMany("tool", { where, select: TOOL_INVOCATION_COLUMNS }); - - // Shared `$defs` grouped by connection — definition names are unique - // per connection, not globally, so each tool's `$ref`s must resolve - // against its own connection's defs. One bulk query keeps this from - // becoming an N+1 over `tools.schema`. - const defsByConnection = new Map>(); - if (includeSchemas) { - const definitionRows = yield* core.findMany("definition", { where }); - for (const def of definitionRows) { - const key = `${def.owner}|${def.integration}|${def.connection}`; - let defs = defsByConnection.get(key); - if (!defs) { - defs = new Map(); - defsByConnection.set(key, defs); - } - defs.set(def.name, decodeJsonColumn(def.schema)); - } - } - const selfContain = (tool: Tool): Tool => { - if (!includeSchemas) return tool; - const defs = defsByConnection.get(`${tool.owner}|${tool.integration}|${tool.connection}`); - if (!defs || defs.size === 0) return tool; - return { - ...tool, - inputSchema: reattachDefs(tool.inputSchema, defs), - outputSchema: reattachDefs(tool.outputSchema, defs), - }; - }; - + // Projected: the list surface is metadata (address, description, + // annotations) — loading every tool's input/output schema JSON made + // an unbounded list scale with schema bytes, not tool count. + const rows = yield* core.findMany("tool", { + where: (b: AnyCb) => + b.and( + filter?.integration === undefined + ? true + : b("integration", "=", String(filter.integration)), + filter?.owner === undefined ? true : b("owner", "=", filter.owner), + filter?.connection === undefined + ? true + : b("connection", "=", String(filter.connection)), + ), + select: TOOL_INVOCATION_COLUMNS, + }); const includeBlocked = filter?.includeBlocked ?? 
false; const policyRows = yield* core.findMany("tool_policy", {}); const tools: Tool[] = []; @@ -2538,10 +2504,8 @@ export const createExecutor = { /** * Read whether the session runs in code mode off an MCP request's `?codemode=` * query. Code mode (the default) exposes a single `execute` tool the agent - * drives with TypeScript; `?codemode=false` selects transparent mode, where - * every available tool is registered as a directly-callable MCP tool. Only the - * literal value `false` turns code mode off; anything else keeps the default. + * drives with TypeScript; `?codemode=false` selects non-code mode, where the + * agent discovers and runs tools through the `search` and `invoke` meta-tools. + * Only the literal value `false` turns code mode off; anything else keeps the + * default. */ export const readCodeMode = (request: Request): boolean => { const url = new URL(request.url); diff --git a/packages/hosts/mcp/src/in-memory-session-store.ts b/packages/hosts/mcp/src/in-memory-session-store.ts index 5749e1ea3..1269ea301 100644 --- a/packages/hosts/mcp/src/in-memory-session-store.ts +++ b/packages/hosts/mcp/src/in-memory-session-store.ts @@ -70,7 +70,7 @@ export interface McpBuildServerOptions { /** * Whether the session runs in code mode (the single `execute` tool the agent * drives with TypeScript). Defaults to `true`; `?codemode=false` selects - * transparent mode, where every available tool is registered directly. + * non-code mode, with the `search` and `invoke` meta-tools. */ readonly codeMode?: boolean; }