diff --git a/apps/cloud/src/api/protected.test.ts b/apps/cloud/src/api/protected.test.ts index 5a1a02283..79bd2bf82 100644 --- a/apps/cloud/src/api/protected.test.ts +++ b/apps/cloud/src/api/protected.test.ts @@ -18,6 +18,13 @@ const makeBaseEngine = (): ExecutionEngine => }), getPausedExecution: () => Effect.succeed(null), getDescription: Effect.succeed("desc"), + searchTools: () => Effect.succeed({ items: [], total: 0, hasMore: false, nextOffset: null }), + invokeTool: () => Effect.succeed({ result: "ok", logs: [] }), + invokeToolWithPause: () => + Effect.succeed({ + status: "completed", + result: { result: "ok", logs: [] }, + }), }) as ExecutionEngine; describe("withExecutionUsageTracking", () => { diff --git a/apps/cloud/src/engine/execution-usage.ts b/apps/cloud/src/engine/execution-usage.ts index cd4489984..20be7f4e7 100644 --- a/apps/cloud/src/engine/execution-usage.ts +++ b/apps/cloud/src/engine/execution-usage.ts @@ -20,4 +20,15 @@ export const withExecutionUsageTracking = ( resume: (executionId, response) => engine.resume(executionId, response), getPausedExecution: (executionId) => engine.getPausedExecution(executionId), getDescription: engine.getDescription, + // searchTools is discovery, not an execution, so it doesn't count as usage. + searchTools: (input) => engine.searchTools(input), + // A direct tool invocation is an execution, so it counts the same as execute. + invokeTool: (name, args, options) => + engine + .invokeTool(name, args, options) + .pipe(Effect.tap(() => Effect.sync(() => trackUsage(organizationId)))), + invokeToolWithPause: (name, args) => + engine + .invokeToolWithPause(name, args) + .pipe(Effect.tap(() => Effect.sync(() => trackUsage(organizationId)))), }); diff --git a/apps/cloud/src/mcp/session-durable-object.ts b/apps/cloud/src/mcp/session-durable-object.ts index 5513ddb90..b7e6f3f90 100644 --- a/apps/cloud/src/mcp/session-durable-object.ts +++ b/apps/cloud/src/mcp/session-durable-object.ts @@ -173,6 +173,7 @@ export class McpSessionDO extends McpSessionDOBase { organizationSlug: org.slug, userId: token.userId, elicitationMode: token.elicitationMode, + codeMode: token.codeMode, } satisfies SessionMeta; }).pipe( Effect.withSpan("McpSessionDO.resolveSessionMeta"), @@ -208,6 +209,7 @@ export class McpSessionDO extends McpSessionDOBase { parentSpan: () => self.currentParentSpan(), debug: env.EXECUTOR_MCP_DEBUG === "true", browserApprovalStore: self.browserApprovalStore, + codeMode: sessionMeta.codeMode, elicitationMode: sessionElicitationMode === "browser" ? { diff --git a/apps/host-cloudflare/src/mcp/session-durable-object.ts b/apps/host-cloudflare/src/mcp/session-durable-object.ts index 26caac4df..8c36474cf 100644 --- a/apps/host-cloudflare/src/mcp/session-durable-object.ts +++ b/apps/host-cloudflare/src/mcp/session-durable-object.ts @@ -62,6 +62,7 @@ export class McpSessionDO extends McpSessionDOBase { organizationSlug: this.cfConfig.organizationSlug, userId: token.userId, elicitationMode: token.elicitationMode, + codeMode: token.codeMode, } satisfies SessionMeta); } @@ -88,6 +89,7 @@ export class McpSessionDO extends McpSessionDOBase { const mcpServer = yield* createExecutorMcpServer({ engine, browserApprovalStore: self.browserApprovalStore, + codeMode: sessionMeta.codeMode, elicitationMode: elicitationMode === "browser" ? { diff --git a/apps/local/src/mcp.ts b/apps/local/src/mcp.ts index b26114a5f..4f1f02999 100644 --- a/apps/local/src/mcp.ts +++ b/apps/local/src/mcp.ts @@ -12,6 +12,7 @@ import { approvalUrlForRequest, decodeResumeResponse, formatResumeAcknowledgement, + readCodeMode, readElicitationMode, } from "@executor-js/host-mcp/browser-approval"; import { makeInProcessBrowserApprovalStore } from "@executor-js/host-mcp/browser-approval-store"; @@ -144,6 +145,7 @@ export const createMcpRequestHandler = (config: ExecutorMcpServerConfig): McpReq createExecutorMcpServer({ ...config, browserApprovalStore: approvals.store, + codeMode: readCodeMode(request), elicitationMode: elicitationMode === "browser" ? { diff --git a/e2e/scenarios/mcp-codemode-off.test.ts b/e2e/scenarios/mcp-codemode-off.test.ts new file mode 100644 index 000000000..c34b6cdee --- /dev/null +++ b/e2e/scenarios/mcp-codemode-off.test.ts @@ -0,0 +1,304 @@ +// Non-code connection mode (`?codemode=false`). By default an Executor MCP +// session runs in "code mode": one `execute` tool the model writes TypeScript +// against, discovering connections through `tools.search()` / +// `tools.describe.tool()` and calling them as `tools.<...>()` inside the +// sandbox. Some clients can't drive a code sandbox and instead want to discover +// and call tools through plain MCP tool calls, so the session accepts +// `?codemode=false` and exposes two meta-tools, `search` and `invoke`, instead +// of `execute`. +// +// Why not just dump every tool directly (the obvious reading of the Cloudflare +// `?codemode=false` switch)? Because a real catalog is enormous: the full +// Microsoft Graph connection alone is ~16.5k tools / hundreds of MB of inlined +// schema, which no client can load in one `tools/list`. `search`+`invoke` is the +// lazy-loading shape: the client searches for the handful of tools it needs and +// invokes them by name, so it scales to any catalog. (See mcp-codemode-scale.) +// +// The seam under test: the SAME connected identity, opened with the query param, +// advertises `search`/`invoke` instead of `execute`; `search` finds a seeded +// connection's tools, and `invoke` runs one and returns its real result. A +// default (code-mode) session of the same identity is the contrast: it still +// advertises only `execute`. +import { randomBytes, randomUUID } from "node:crypto"; + +import { expect } from "@effect/vitest"; +import { Effect } from "effect"; +import { composePluginApi } from "@executor-js/api/server"; +import { openApiHttpPlugin } from "@executor-js/plugin-openapi/api"; +import { + AuthTemplateSlug, + ConnectionName, + IntegrationSlug, + ProviderItemId, +} from "@executor-js/sdk/shared"; + +import { scenario } from "../src/scenario"; +import { Api, Mcp, Target } from "../src/services"; + +const api = composePluginApi([openApiHttpPlugin()] as const); + +// A built-in core tool present on every target. In non-code mode it is invoked +// by this wire name through the `invoke` meta-tool (a static core tool's address +// has no `tools.` prefix, so it survives `addressToPath` unchanged), and it +// returns real data (the policy listing) we can verify. +const CORE_TOOL = "executor.coreTools.policies.list"; + +// The approval-gated core tool used by the pause+resume scenario below. It gates +// on its own `requiresApproval` annotation (no policy needed), so invoking it +// pauses, and resuming exercises the non-code resume formatter. +const POLICY_CREATE_TOOL = "executor.coreTools.policies.create"; + +// Minimal three-operation spec: three operations become three connection tools. +// The baseUrl is never contacted; we only need the tools to exist in the +// catalog so `search` has something to find. +const ordersOpenApiSpec = (baseUrl: string): string => + JSON.stringify({ + openapi: "3.0.3", + info: { title: "Orders API", version: "1.0.0" }, + servers: [{ url: baseUrl }], + paths: { + "/orders/{orderId}": { + get: { + operationId: "getOrder", + summary: "Fetch a single order", + parameters: [{ name: "orderId", in: "path", required: true, schema: { type: "string" } }], + responses: { "200": { description: "An order." } }, + }, + }, + "/orders": { + get: { + operationId: "listOrders", + summary: "List orders", + responses: { "200": { description: "The orders." } }, + }, + post: { + operationId: "createOrder", + summary: "Create an order", + responses: { "201": { description: "The created order." } }, + }, + }, + }, + }); + +// `search`/`invoke` use the same `addressToPath(address)` the engine does: a +// leading proxy-root `tools.` is stripped, everything else is left as-is. +// Deriving the expected name from the same catalog keeps the assertion from +// drifting if the address format changes. +const wireName = (address: string): string => + address.startsWith("tools.") ? address.slice("tools.".length) : address; + +const apiKeyTemplate = [ + { + slug: "apiKey", + type: "apiKey", + headers: { "x-api-key": [{ type: "variable", name: "token" }] }, + }, +] as const; + +type SearchPage = { + readonly items?: ReadonlyArray<{ readonly name?: string; readonly inputSchema?: unknown }>; + readonly total?: number; +}; + +const searchPageOf = (raw: unknown): SearchPage => + ((raw as { structuredContent?: SearchPage }).structuredContent ?? {}) as SearchPage; + +scenario( + "MCP · ?codemode=false exposes search + invoke instead of `execute`", + { timeout: 120_000 }, + Effect.gen(function* () { + const target = yield* Target; + const { client } = yield* Api; + const mcp = yield* Mcp; + const identity = yield* target.newIdentity(); + const apiClient = yield* client(api, identity); + + // Unique slug per run keeps parallel/repeated runs out of each other's + // catalog (selfhost shares the bootstrap-admin identity). + const nonce = randomBytes(4).toString("hex"); + const slug = `codemode-orders-${nonce}`; + const specBaseUrl = "http://127.0.0.1:59999"; // never contacted + + const cleanup = Effect.gen(function* () { + yield* apiClient.connections + .remove({ + params: { + owner: "org", + integration: IntegrationSlug.make(slug), + name: ConnectionName.make("main"), + }, + }) + .pipe(Effect.ignore); + yield* apiClient.integrations + .remove({ params: { slug: IntegrationSlug.make(slug) } }) + .pipe(Effect.ignore); + }); + + yield* Effect.ensuring( + Effect.gen(function* () { + // Seed an integration + connection so `search` has tools to find. + const added = yield* apiClient.openapi.addSpec({ + payload: { + spec: { kind: "blob", value: ordersOpenApiSpec(specBaseUrl) }, + slug, + baseUrl: specBaseUrl, + authenticationTemplate: apiKeyTemplate, + }, + }); + expect(added.toolCount, "the orders fixture's operations became tools").toBe(3); + + const providers = yield* apiClient.providers.list(); + yield* apiClient.connections.create({ + payload: { + owner: "org", + name: ConnectionName.make("main"), + integration: IntegrationSlug.make(slug), + template: AuthTemplateSlug.make("apiKey"), + from: { provider: providers[0]!, id: ProviderItemId.make(randomUUID()) }, + }, + }); + + // The exact wire names `search` should surface, derived from the catalog. + const catalog = yield* apiClient.tools.list({ + query: { integration: IntegrationSlug.make(slug) }, + }); + const expectedConnectionTools = catalog.map((tool) => wireName(String(tool.address))); + expect( + expectedConnectionTools.length, + "the three connection tools are in the catalog", + ).toBe(3); + + // A policy with an unrelated pattern: it does NOT gate `policies.list`, + // so the invoke below runs ungated. Its id only has to appear in the + // listing to prove the tool actually executed and returned data. + const policy = yield* apiClient.policies.create({ + payload: { owner: "org", pattern: `codemode.gate.${nonce}`, action: "block" }, + }); + + yield* Effect.ensuring( + Effect.gen(function* () { + const noncode = mcp.session(identity, { codeMode: false }); + + // 1) The advertised tools are the meta-tools, NOT `execute` and NOT a + // dumped catalog. + const tools = yield* noncode.listTools(); + expect(tools, "search is advertised").toContain("search"); + expect(tools, "invoke is advertised").toContain("invoke"); + expect(tools, "code mode's `execute` is gone").not.toContain("execute"); + expect( + tools, + "the catalog is NOT dumped directly (that is the whole point)", + ).not.toContain(expectedConnectionTools[0]!); + + // 2) `search` finds the seeded connection's tools, each with a schema. + const search = yield* noncode.call("search", { query: slug }); + expect(search.ok, "search completed without error").toBe(true); + const page = searchPageOf(search.raw); + const found = (page.items ?? []).map((item) => item.name); + for (const name of expectedConnectionTools) { + expect(found, `search surfaced connection tool ${name}`).toContain(name); + } + expect( + (page.items ?? []).every((item) => item.inputSchema != null), + "each search hit carries its input schema, so it can be invoked directly", + ).toBe(true); + + // 3) `invoke` runs a tool by name and returns its real result. + const invoked = yield* noncode.call("invoke", { name: CORE_TOOL, arguments: {} }); + expect(invoked.ok, "the invoke completed without error").toBe(true); + expect( + invoked.text, + "the listing the tool returned includes the policy we created", + ).toContain(policy.id); + + // 4) Contrast: the same identity in default (code) mode still gets the + // single `execute` tool and not the meta-tools. + const codeModeSession = mcp.session(identity); + const codeModeTools = yield* codeModeSession.listTools(); + expect(codeModeTools, "code mode still advertises `execute`").toContain("execute"); + expect(codeModeTools, "code mode does not advertise `search`").not.toContain("search"); + }), + apiClient.policies + .remove({ params: { policyId: policy.id }, payload: { owner: "org" } }) + .pipe(Effect.ignore), + ); + }), + cleanup, + ); + }), +); + +// Result-shape parity across the pause boundary. An `invoke`d tool that pauses +// for approval and then resumes must return the SAME shape it would have +// returned without pausing: the tool's own result, unwrapped from the +// `ToolResult` envelope. The `resume` machinery is shared with code mode, where a +// completion is an `execute` envelope (`{ status, result, logs }`); a regression +// here formatted the resumed direct-tool result that same way, so a non-code +// client got the code-mode envelope instead of the tool's fields. This drives +// the approval-gated `policies.create` through invoke -> pause -> approve -> +// resume and asserts the resumed structured content is the policy itself. +scenario( + "MCP · ?codemode=false keeps the unwrapped tool result across an approval pause+resume", + { timeout: 120_000 }, + Effect.gen(function* () { + const target = yield* Target; + const { client } = yield* Api; + const mcp = yield* Mcp; + const identity = yield* target.newIdentity(); + const apiClient = yield* client(api, identity); + + // Unique, non-matching pattern: the rule the gated tool creates is inert and + // cannot gate any other scenario's tools. Removed in the finalizer. + const nonce = randomBytes(4).toString("hex"); + const pattern = `codemode-resume-${nonce}.gate`; + + const cleanup = apiClient.policies.list().pipe( + Effect.flatMap((list) => + Effect.forEach( + list.filter((p) => p.pattern === pattern), + (p) => + apiClient.policies + .remove({ params: { policyId: p.id }, payload: { owner: "org" } }) + .pipe(Effect.ignore), + ), + ), + Effect.ignore, + ); + + yield* Effect.ensuring( + Effect.gen(function* () { + const noncode = mcp.session(identity, { codeMode: false }); + yield* noncode.listTools(); + + // Invoke the approval-gated tool by name. No policy is in play, so the + // only thing that can pause it is its own `requiresApproval` annotation. + // The paused result carries the executionId to resume. + const paused = yield* noncode.call("invoke", { + name: POLICY_CREATE_TOOL, + arguments: { owner: "org", pattern, action: "block" }, + }); + expect(paused.text, "the gated tool paused for approval").toContain("Execution paused"); + expect(paused.text, "the paused result carries an executionId").toContain("executionId:"); + + // Approve and resume. + const resumed = yield* noncode.approvePaused(paused.text); + expect(resumed.ok, "the resumed call completed without error").toBe(true); + + const structured = (resumed.raw as { structuredContent?: Record }) + .structuredContent; + // Fixed shape: the tool's own result, so the policy fields sit at the top + // level. Buggy shape: the code-mode `execute` envelope, where the policy + // would be nested under `result` and `pattern` absent at the top level. + expect( + structured?.pattern, + "the resumed result is the unwrapped tool result (policy fields at the top level)", + ).toBe(pattern); + expect( + structured?.result, + "the code-mode execute envelope (status/result/logs) is not used in non-code mode", + ).toBeUndefined(); + }), + cleanup, + ); + }), +); diff --git a/e2e/scenarios/mcp-codemode-scale.test.ts b/e2e/scenarios/mcp-codemode-scale.test.ts new file mode 100644 index 000000000..ee5c26aea --- /dev/null +++ b/e2e/scenarios/mcp-codemode-scale.test.ts @@ -0,0 +1,203 @@ +// Non-code mode (`?codemode=false`) at real catalog scale. This is the scenario +// that justified the design: dumping every tool directly does not scale (the +// full Microsoft Graph catalog is ~16.5k tools / hundreds of MB of inlined +// schema, far too big for any client to load in one `tools/list`). `search` + +// `invoke` is the lazy-loading answer: a fixed two-tool surface that never +// returns more than a bounded page, no matter how large the catalog is. +// +// What it asserts, against the full Graph catalog and the suite's trace store: +// - the non-code session advertises only the meta-tools, NOT the 16.5k-tool +// catalog (the dump is gone); +// - `search` over the whole catalog returns a small bounded page, each hit +// carrying its own schema; +// - each invocation dispatches the tool exactly once (`executor.tool.execute`), +// with no fan-out; +// - a single invocation's trace neither searches nor rebuilds the catalog +// (no `executor.tools.search`, no `executor.tools.sync_stale`) — resolving +// one tool out of 16.5k is O(1), it does not touch the rest of the catalog; +// - the catalog is served from persisted bindings, not re-parsed on every read +// (`executor.tools.sync_stale`, scoped to this run, fires at most once). +// +// Telemetry is only wired on targets that boot motel (cloud today), so this +// scenario yields `Telemetry` up front and skips cleanly elsewhere. It drives +// only public surfaces (typed API + MCP), so a green run is real evidence. +import { randomBytes } from "node:crypto"; + +import { expect } from "@effect/vitest"; +import { Effect, Schedule } from "effect"; +import { composePluginApi } from "@executor-js/api/server"; +import { + MICROSOFT_AUTH_TEMPLATE_SLUG, + MICROSOFT_GRAPH_ALL_PRESET_IDS, +} from "@executor-js/plugin-microsoft"; +import { microsoftHttpPlugin } from "@executor-js/plugin-microsoft/api"; +import { AuthTemplateSlug, ConnectionName, IntegrationSlug } from "@executor-js/sdk/shared"; + +import { scenario } from "../src/scenario"; +import type { ExportedSpan, SpanQuery, TelemetrySurface } from "../src/surfaces/telemetry"; +import { Api, Mcp, Target, Telemetry } from "../src/services"; + +const api = composePluginApi([microsoftHttpPlugin()] as const); + +const SEARCH_LIMIT = 5; +const HOW_MANY_INVOCATIONS = 3; + +type SearchPage = { + readonly items?: ReadonlyArray<{ readonly name?: string; readonly inputSchema?: unknown }>; + readonly total?: number; +}; + +const searchPageOf = (raw: unknown): SearchPage => + ((raw as { structuredContent?: SearchPage }).structuredContent ?? {}) as SearchPage; + +// Spans flush ~1s after the request (BatchSpanProcessor, drained on waitUntil). +// Poll the store until at least `n` matching spans have arrived, then hand the +// set back so the caller can assert the exact count. ~20s ceiling: slower is a +// real export bug, and the test should fail rather than hang. +const searchUntilCount = ( + telemetry: TelemetrySurface, + query: SpanQuery, + n: number, +): Effect.Effect => + telemetry.searchSpans(query).pipe( + Effect.filterOrFail( + (spans) => spans.length >= n, + (spans) => `expected >= ${n} spans for ${JSON.stringify(query)}, saw ${spans.length}`, + ), + Effect.retry(Schedule.both(Schedule.spaced("500 millis"), Schedule.recurs(40))), + ); + +scenario( + "MCP · ?codemode=false searches a 16k-tool catalog and invokes without dumping it", + { timeout: 300_000 }, + Effect.gen(function* () { + const target = yield* Target; + const { client } = yield* Api; + const mcp = yield* Mcp; + // Skips on any target without a trace store (selfhost, cloudflare today). + const telemetry = yield* Telemetry; + const identity = yield* target.newIdentity(); + const apiClient = yield* client(api, identity); + + const slug = `codemode-scale-${randomBytes(4).toString("hex")}`; + const connection = ConnectionName.make("main"); + + const cleanup = Effect.gen(function* () { + yield* apiClient.connections + .remove({ + params: { owner: "org", integration: IntegrationSlug.make(slug), name: connection }, + }) + .pipe(Effect.ignore); + yield* apiClient.microsoft + .removeGraph({ params: { slug: IntegrationSlug.make(slug) } }) + .pipe(Effect.ignore); + }); + + yield* Effect.ensuring( + Effect.gen(function* () { + // Seed the full Graph catalog: every workload, ~16.5k operations. + const added = yield* apiClient.microsoft.addGraph({ + payload: { + presetIds: [...MICROSOFT_GRAPH_ALL_PRESET_IDS], + customScopes: [], + slug, + name: "Microsoft Graph (codemode scale)", + }, + }); + expect( + added.toolCount, + "the full Graph catalog extracts thousands of tools", + ).toBeGreaterThan(5_000); + + // A static token is enough to exercise resolve+invoke; the upstream 401 + // surfaces as a tool failure, which still emits the spans we assert on. + yield* apiClient.connections.create({ + payload: { + owner: "org", + name: connection, + integration: IntegrationSlug.make(slug), + template: AuthTemplateSlug.make(MICROSOFT_AUTH_TEMPLATE_SLUG), + value: "token-xyz", + }, + }); + + const noncode = mcp.session(identity, { codeMode: false }); + + // 1) The non-code session advertises the meta-tools, NOT the 16.5k-tool + // catalog. This is the whole point: the catalog is never dumped. + const tools = yield* noncode.listTools(); + expect(tools, "search is advertised").toContain("search"); + expect(tools, "invoke is advertised").toContain("invoke"); + expect( + tools.length, + "the giant catalog is not dumped — only the fixed meta-tools are advertised", + ).toBeLessThan(5); + + // 2) `search` ranks over the whole catalog but returns a small bounded + // page, each hit with its own schema. + const search = yield* noncode.call("search", { query: "user", limit: SEARCH_LIMIT }); + expect(search.ok, "search completed without error").toBe(true); + const page = searchPageOf(search.raw); + const hits = page.items ?? []; + expect(hits.length, "search returns a bounded page, not the catalog").toBeLessThanOrEqual( + SEARCH_LIMIT, + ); + expect(hits.length, "search found matching tools").toBeGreaterThan(0); + expect( + hits.every((hit) => hit.inputSchema != null), + "each hit carries its input schema", + ).toBe(true); + + const targetTool = hits[0]!.name!; + // The `executor.tool.execute` span stamps the full address, which is + // `tools.` (the proxy-root prefix the wire name strips). + const executeToolName = `tools.${targetTool}`; + + // 3) Invoke the found tool several times. Each hits Graph with the fake + // token (401 -> tool failure) but exercises the full resolve+invoke + // path and emits one execute span per call. + for (let i = 0; i < HOW_MANY_INVOCATIONS; i++) { + yield* noncode.call("invoke", { name: targetTool, arguments: {} }); + } + + // (a) Every invocation dispatched the tool exactly once: no fan-out. + const executes = yield* searchUntilCount( + telemetry, + { + operation: "executor.tool.execute", + attributes: { "mcp.tool.name": executeToolName }, + }, + HOW_MANY_INVOCATIONS, + ); + expect(executes.length, "each invocation dispatches the tool exactly once").toBe( + HOW_MANY_INVOCATIONS, + ); + + // (b) A single invocation is O(1) in the catalog: its whole trace neither + // searches nor rebuilds the catalog. Resolving one tool out of 16.5k + // must not touch the rest. + const invokeTrace = yield* telemetry.searchSpans({ traceId: executes[0]!.traceId }); + const operations = invokeTrace.map((entry) => entry.span.operationName); + expect(operations, "an invocation does not search the whole catalog").not.toContain( + "executor.tools.search", + ); + expect(operations, "an invocation does not rebuild the catalog").not.toContain( + "executor.tools.sync_stale", + ); + + // (c) The catalog is served from persisted bindings, not re-parsed on + // every read: the per-connection rebuild for THIS integration fires + // at most once across the search above. + const rebuilds = yield* telemetry.searchSpans({ + operation: "executor.tools.sync_stale", + attributes: { "executor.integration": slug }, + }); + expect( + rebuilds.length, + "the catalog is not rebuilt/re-parsed on every read", + ).toBeLessThanOrEqual(1); + }), + cleanup, + ); + }), +); diff --git a/e2e/src/surfaces/mcp.ts b/e2e/src/surfaces/mcp.ts index 0e47555bb..e01f76f07 100644 --- a/e2e/src/surfaces/mcp.ts +++ b/e2e/src/surfaces/mcp.ts @@ -160,7 +160,13 @@ export interface McpSurface { readonly url: string; readonly session: ( identity: Identity, - options?: { readonly elicitationMode?: McpElicitationMode }, + options?: { + readonly elicitationMode?: McpElicitationMode; + /** Pass `false` to add `?codemode=false`, switching the session into + * non-code mode (the `search` and `invoke` meta-tools instead of the + * single `execute` tool). Omitted/`true` keeps the default code mode. */ + readonly codeMode?: boolean; + }, ) => McpSession; /** * Mint a real MCP bearer headlessly: protected-resource discovery → @@ -268,12 +274,18 @@ export const makeMcpSurface = (target: Target, runDir?: string): McpSurface => ( // identity's OAuth isolated. The traceparent ledger keys off the URL, not // this name, so it is unaffected. const serverName = `${target.name}-${randomUUID().slice(0, 8)}`; - // `browser` mode is selected per the ecosystem convention — an - // `?elicitation_mode=` query on the MCP endpoint — so a paused execution - // yields an approvalUrl instead of letting the model resume inline. - const sessionUrl = options?.elicitationMode - ? `${target.mcpUrl}?elicitation_mode=${options.elicitationMode}` - : target.mcpUrl; + // Session config rides query params on the MCP endpoint, per ecosystem + // convention: `?elicitation_mode=` (a paused execution yields an approvalUrl + // instead of letting the model resume inline) and `?codemode=false` (the + // `search`/`invoke` meta-tools instead of the single `execute` tool). + const sessionUrl = (() => { + const url = new URL(target.mcpUrl); + if (options?.elicitationMode) { + url.searchParams.set("elicitation_mode", options.elicitationMode); + } + if (options?.codeMode === false) url.searchParams.set("codemode", "false"); + return url.toString(); + })(); let runtimePromise: Promise | undefined; let connected = false; diff --git a/e2e/vitest.config.ts b/e2e/vitest.config.ts index 6e3d5a56b..88f7e184b 100644 --- a/e2e/vitest.config.ts +++ b/e2e/vitest.config.ts @@ -49,6 +49,7 @@ export default defineConfig({ include: [ "scenarios/browser-approval.test.ts", "scenarios/microsoft-graph-full.test.ts", + "scenarios/mcp-codemode-off.test.ts", "cloudflare/**/*.test.ts", ], fileParallelism: false, diff --git a/packages/core/execution/src/engine.ts b/packages/core/execution/src/engine.ts index 665548852..d362845ca 100644 --- a/packages/core/execution/src/engine.ts +++ b/packages/core/execution/src/engine.ts @@ -13,6 +13,7 @@ import { CodeExecutionError } from "@executor-js/codemode-core"; import type { CodeExecutor, ExecuteResult, SandboxToolInvoker } from "@executor-js/codemode-core"; import { + pathToAddress, defaultToolDiscoveryProvider, makeExecutorToolInvoker, listExecutorSources, @@ -41,6 +42,33 @@ export type PausedExecution = { readonly elicitationContext: ElicitationContext; }; +/** One ranked hit from the non-code-mode `search` tool. `name` is the + * sandbox-callable path (`...` or a + * static fqid), which doubles as the wire name a client passes to `invoke`. + * `inputSchema` is self-contained JSON Schema so the hit can be called + * directly without a second round-trip. */ +export type ToolSearchResult = { + readonly name: string; + readonly description?: string; + readonly inputSchema: unknown; +}; + +/** A page of {@link ToolSearchResult}s. `total` is the match count before + * pagination so the caller can tell it was truncated; `nextOffset` is the + * offset to pass back for the next page, or null at the end. */ +export type ToolSearchPage = { + readonly items: readonly ToolSearchResult[]; + readonly total: number; + readonly hasMore: boolean; + readonly nextOffset: number | null; +}; + +/** Default and ceiling for `search` page size. Search returns each hit's full + * self-contained schema, so the page is bounded to keep the response small + * even when the catalog has tens of thousands of tools. */ +export const DEFAULT_SEARCH_LIMIT = 10; +export const MAX_SEARCH_LIMIT = 25; + /** Internal representation with Effect runtime state for pause/resume. */ type InternalPausedExecution = PausedExecution & { readonly response: Deferred.Deferred; @@ -210,6 +238,39 @@ const readOptionalOffset = (value: unknown, toolName: string): number | Executio return Math.floor(value); }; +/** Pull a human-readable message off an engine-level invoke failure (always an + * `ExecutionToolError` once the base invoker has mapped expected failures into + * the success channel), falling back to a string render. */ +// oxlint-disable executor/no-unknown-error-message -- boundary: SandboxToolInvoker.invoke declares an `unknown` error channel (kernel contract); the tag guard narrows it to ExecutionToolError before rendering, with a String() fallback for the impossible-defect case +const toolErrorMessage = (error: unknown): string => + Predicate.isTagged(error, "ExecutionToolError") && + "message" in error && + typeof error.message === "string" + ? error.message + : String(error); +// oxlint-enable executor/no-unknown-error-message + +/** + * Invoke a single tool through the base invoker and normalize the outcome into + * an `ExecuteResult`. The base invoker already routes expected tool failures + * (HTTP errors, auth walls, not-found, bad args) into the success channel as a + * `ToolResult` envelope; only engine-level failures (user-declined approval, + * opaque defects) reach the error channel, and those become `result.error`. + * Shared by the native and pause-mode non-code-mode paths so both render the + * same way. + */ +const invokeToolAsExecuteResult = ( + invoker: SandboxToolInvoker, + name: string, + args: unknown, +): Effect.Effect => + invoker.invoke({ path: name, args }).pipe( + Effect.map((result): ExecuteResult => ({ result })), + Effect.catch((error: unknown) => + Effect.succeed({ result: undefined, error: toolErrorMessage(error) }), + ), + ); + const makeFullInvoker = ( executor: Executor, invokeOptions: InvokeOptions, @@ -392,6 +453,34 @@ export type ExecutionEngine * Get the dynamic tool description (workflow + namespaces). */ readonly getDescription: Effect.Effect; + + /** + * Ranked, paginated tool search backing the non-code-mode `search` tool. + * Returns only the matched page (each hit with its self-contained input + * schema), so it scales to catalogs far too large to enumerate in one + * `listTools`. The lazy-loading counterpart to {@link listTools}. + */ + readonly searchTools: (input: { + readonly query: string; + readonly limit?: number; + readonly offset?: number; + }) => Effect.Effect; + + /** + * Invoke a single tool by its wire name with elicitation handled inline by + * the provided handler. The non-code-mode counterpart to {@link execute}. + */ + readonly invokeTool: ( + name: string, + args: unknown, + options: { readonly onElicitation: ElicitationHandler }, + ) => Effect.Effect; + + /** + * Invoke a single tool by its wire name, intercepting an approval gate as a + * pause point. The non-code-mode counterpart to {@link executeWithPause}. + */ + readonly invokeToolWithPause: (name: string, args: unknown) => Effect.Effect; }; export const createExecutionEngine = ( @@ -450,84 +539,109 @@ export const createExecutionEngine = Effect.Effect, + attributes: Record, + ): Effect.Effect => + Effect.gen(function* () { + // Queue preserves pauses that arrive before the previous approval has + // returned to the caller, which can happen with concurrent tool calls. + const pauseQueue = yield* Queue.unbounded>(); - // Queue preserves pauses that arrive before the previous approval has - // returned to the caller, which can happen with concurrent tool calls. - const pauseQueue = yield* Queue.unbounded>(); - - // Will be set once the fiber is forked. - let fiber: Fiber.Fiber; - - const elicitationHandler: ElicitationHandler = (ctx) => - Effect.gen(function* () { - const responseDeferred = yield* Deferred.make(); - // Globally unique — engine instances are rebuilt on host restarts - // (Durable Object cold restores, redeploys), so a counter would - // re-mint the same ids and let a stale client resume bind to a - // different execution's pause. - const id = `exec_${crypto.randomUUID()}`; - - const paused: InternalPausedExecution = { - id, - elicitationContext: ctx, - response: responseDeferred, - fiber: fiber!, - pauseQueue, - }; - pausedExecutions.set(id, paused); - - yield* Queue.offer(pauseQueue, paused); - - // Suspend until resume() completes responseDeferred. - return yield* Deferred.await(responseDeferred); - }); + // Will be set once the fiber is forked. + let fiber: Fiber.Fiber; - const invoker = makeFullInvoker( - executor, - { onElicitation: elicitationHandler }, - toolDiscoveryProvider, - ); - fiber = yield* Effect.forkDetach( - codeExecutor.execute(code, invoker).pipe(Effect.withSpan("executor.code.exec")), + const elicitationHandler: ElicitationHandler = (ctx) => + Effect.gen(function* () { + const responseDeferred = yield* Deferred.make(); + // Globally unique — engine instances are rebuilt on host restarts + // (Durable Object cold restores, redeploys), so a counter would + // re-mint the same ids and let a stale client resume bind to a + // different execution's pause. + const id = `exec_${crypto.randomUUID()}`; + + const paused: InternalPausedExecution = { + id, + elicitationContext: ctx, + response: responseDeferred, + fiber: fiber!, + pauseQueue, + }; + pausedExecutions.set(id, paused); + + yield* Queue.offer(pauseQueue, paused); + + // Suspend until resume() completes responseDeferred. + return yield* Deferred.await(responseDeferred); + }); + + fiber = yield* Effect.forkDetach(run(elicitationHandler)); + + // When the fiber settles on its own (sandbox timeout, failure) while + // pauses are still outstanding, drop them: getPausedExecution must not + // report a pause whose fiber can no longer consume a response, and the + // map must not grow forever. A resume retry still finds the terminal + // outcome via the settled-outcome cache. + const sandboxFiber = fiber; + yield* Effect.forkDetach( + Fiber.await(sandboxFiber).pipe( + Effect.flatMap((exit) => + Effect.sync(() => { + const outcome = Exit.map( + exit, + (result): ExecutionResult => ({ status: "completed", result }), + ); + for (const [id, paused] of pausedExecutions) { + if (paused.fiber !== sandboxFiber) continue; + pausedExecutions.delete(id); + recordSettledOutcome(id, outcome); + } + }), + ), + ), + ); + + return (yield* awaitCompletionOrPause(fiber, pauseQueue)) as ExecutionResult; + }).pipe(Effect.withSpan("mcp.execute", { attributes })); + + /** Code-mode pause/resume: run the dynamic worker over the full invoker. */ + const startPausableExecution = (code: string): Effect.Effect => + runWithPause( + (onElicitation) => + codeExecutor + .execute(code, makeFullInvoker(executor, { onElicitation }, toolDiscoveryProvider)) + .pipe(Effect.withSpan("executor.code.exec")), + { "mcp.execute.mode": "pausable", "mcp.execute.code_length": code.length }, ); - // When the fiber settles on its own (sandbox timeout, failure) while - // pauses are still outstanding, drop them: getPausedExecution must not - // report a pause whose fiber can no longer consume a response, and the - // map must not grow forever. A resume retry still finds the terminal - // outcome via the settled-outcome cache. - const sandboxFiber = fiber; - yield* Effect.forkDetach( - Fiber.await(sandboxFiber).pipe( - Effect.flatMap((exit) => - Effect.sync(() => { - const outcome = Exit.map( - exit, - (result): ExecutionResult => ({ status: "completed", result }), - ); - for (const [id, paused] of pausedExecutions) { - if (paused.fiber !== sandboxFiber) continue; - pausedExecutions.delete(id); - recordSettledOutcome(id, outcome); - } - }), + /** + * Non-code-mode pause/resume: invoke a single tool by its wire name. The + * tool's `ToolResult` envelope (or an engine-level error) is carried back as + * an `ExecuteResult` so the host renders it the same way it renders a + * code-mode result. Approval-gated tools pause through the same machinery. + */ + const invokeToolWithPause = (name: string, args: unknown): Effect.Effect => + runWithPause( + (onElicitation) => + invokeToolAsExecuteResult( + makeExecutorToolInvoker(executor, { invokeOptions: { onElicitation } }), + name, + args, ), - ), + { "mcp.execute.mode": "pausable_tool", "mcp.tool.name": name }, ); - return (yield* awaitCompletionOrPause(fiber, pauseQueue)) as ExecutionResult; - }); - /** * Resume a paused execution. Completes the response Deferred to unblock the * fiber, then races completion against the next queued or future pause. @@ -602,6 +716,73 @@ export const createExecutionEngine = => + Effect.gen(function* () { + const limit = Math.min(Math.max(input.limit ?? DEFAULT_SEARCH_LIMIT, 1), MAX_SEARCH_LIMIT); + const offset = Math.max(input.offset ?? 0, 0); + const page = yield* toolDiscoveryProvider.searchTools({ + executor, + query: input.query, + limit, + offset, + }); + const items = yield* Effect.forEach( + page.items, + (hit) => + executor.tools.schema(pathToAddress(hit.path)).pipe( + Effect.map( + (schema): ToolSearchResult => ({ + name: hit.path, + description: hit.description, + inputSchema: schema?.inputSchema ?? { type: "object" }, + }), + ), + ), + { concurrency: "unbounded" }, + ); + return { items, total: page.total, hasMore: page.hasMore, nextOffset: page.nextOffset }; + }).pipe( + // oxlint-disable-next-line executor/no-effect-escape-hatch -- boundary: ExecutionEngine.searchTools exposes no error channel; a catalog read the search surface can't recover from dies rather than forcing every caller to thread a typed error + Effect.orDie, + Effect.withSpan("mcp.search_tools", { attributes: { "mcp.search.query": input.query } }), + ); + return { execute: runInlineExecution, executeWithPause: startPausableExecution, @@ -609,5 +790,8 @@ export const createExecutionEngine = Effect.sync(() => pausedExecutions.get(executionId) ?? null), getDescription: buildExecuteDescription(executor), + searchTools, + invokeTool: invokeToolInline, + invokeToolWithPause, }; }; diff --git a/packages/core/execution/src/index.ts b/packages/core/execution/src/index.ts index 18955fc67..fa5c05a21 100644 --- a/packages/core/execution/src/index.ts +++ b/packages/core/execution/src/index.ts @@ -2,11 +2,15 @@ export { createExecutionEngine, formatExecuteResult, formatPausedExecution, + DEFAULT_SEARCH_LIMIT, + MAX_SEARCH_LIMIT, type ExecutionEngine, type ExecutionEngineConfig, type ExecutionResult, type PausedExecution, type ResumeResponse, + type ToolSearchPage, + type ToolSearchResult, } from "./engine"; export { buildExecuteDescription } from "./description"; diff --git a/packages/core/execution/src/tool-invoker.ts b/packages/core/execution/src/tool-invoker.ts index 8ce6d31bc..1bbabb0f9 100644 --- a/packages/core/execution/src/tool-invoker.ts +++ b/packages/core/execution/src/tool-invoker.ts @@ -56,7 +56,7 @@ const ADDRESS_PREFIX = "tools."; * namespaces) are addressed by their fqid with no prefix; the executor resolves * those from its static map directly, so leave them untouched. */ -const pathToAddress = (path: string): ToolAddress => { +export const pathToAddress = (path: string): ToolAddress => { if (path.startsWith(ADDRESS_PREFIX)) return ToolAddress.make(path); if (parseToolAddress(`${ADDRESS_PREFIX}${path}`)) { return ToolAddress.make(`${ADDRESS_PREFIX}${path}`); @@ -66,7 +66,7 @@ const pathToAddress = (path: string): ToolAddress => { /** Strip the proxy-root `tools.` prefix from a full address so it becomes the * sandbox-callable path the model writes after `tools.`. */ -const addressToPath = (address: string): string => +export const addressToPath = (address: string): string => address.startsWith(ADDRESS_PREFIX) ? address.slice(ADDRESS_PREFIX.length) : address; type DescribedTool = { diff --git a/packages/hosts/cloudflare/src/mcp/do-headers.ts b/packages/hosts/cloudflare/src/mcp/do-headers.ts index 822893f31..04e2a2a6c 100644 --- a/packages/hosts/cloudflare/src/mcp/do-headers.ts +++ b/packages/hosts/cloudflare/src/mcp/do-headers.ts @@ -89,10 +89,12 @@ export const withMcpResponseHeaders = (response: Response): Response => { }; // The elicitation-mode query contract (`?elicitation_mode=` plus the legacy -// `?allow_model_resume` alias) is shared with every host that serves the -// browser-approval flow. Re-exported here so the worker dispatcher's existing -// import site (`./do-headers`) is unchanged. +// `?allow_model_resume` alias) and the code-mode query contract (`?codemode=`) +// are shared with every host that serves the browser-approval flow. Re-exported +// here so the worker dispatcher's existing import site (`./do-headers`) is +// unchanged. export { + readCodeMode, readElicitationMode, type McpElicitationMode, } from "@executor-js/host-mcp/browser-approval"; diff --git a/packages/hosts/cloudflare/src/mcp/seams.ts b/packages/hosts/cloudflare/src/mcp/seams.ts index 330f78200..fb1b25a03 100644 --- a/packages/hosts/cloudflare/src/mcp/seams.ts +++ b/packages/hosts/cloudflare/src/mcp/seams.ts @@ -11,6 +11,9 @@ export interface McpSessionInit { readonly organizationId: string; readonly userId: string; readonly elicitationMode: McpElicitationMode; + /** Whether the session runs in code mode (single `execute` tool). Defaults to + * `true`; `false` selects transparent mode (every tool registered directly). */ + readonly codeMode?: boolean; /** Public origin of the create request (`https://host`), so the DO derives a * web base URL zero-config when the host configures no static one. */ readonly webOrigin?: string; diff --git a/packages/hosts/cloudflare/src/mcp/session-durable-object.ts b/packages/hosts/cloudflare/src/mcp/session-durable-object.ts index e23a99c7b..64904368a 100644 --- a/packages/hosts/cloudflare/src/mcp/session-durable-object.ts +++ b/packages/hosts/cloudflare/src/mcp/session-durable-object.ts @@ -127,6 +127,9 @@ export interface SessionMeta { readonly organizationSlug?: string; readonly userId: string; readonly elicitationMode?: "browser" | "model" | "native"; + /** Whether the session runs in code mode (single `execute` tool). Defaults to + * `true`; `false` selects transparent mode (every tool registered directly). */ + readonly codeMode?: boolean; /** Public origin captured at session create — used to derive the runtime's * web base URL when the host configures no static one. */ readonly webOrigin?: string; diff --git a/packages/hosts/cloudflare/src/mcp/session-store.ts b/packages/hosts/cloudflare/src/mcp/session-store.ts index ff9fdf444..18970d106 100644 --- a/packages/hosts/cloudflare/src/mcp/session-store.ts +++ b/packages/hosts/cloudflare/src/mcp/session-store.ts @@ -29,6 +29,7 @@ import { import { currentPropagationHeaders, + readCodeMode, readElicitationMode, withMcpResponseHeaders, withPropagationHeaders, @@ -178,6 +179,7 @@ const createSession = ( organizationId: token.organizationId, userId: token.accountId, elicitationMode: readElicitationMode(request), + codeMode: readCodeMode(request), // The public origin the client reached us at — lets the DO derive a web // base URL with no static config (we read the real URL, not a spoofable // forwarded host). diff --git a/packages/hosts/mcp/src/browser-approval.ts b/packages/hosts/mcp/src/browser-approval.ts index 0bea2ea3b..d1dffb406 100644 --- a/packages/hosts/mcp/src/browser-approval.ts +++ b/packages/hosts/mcp/src/browser-approval.ts @@ -51,6 +51,19 @@ export const readElicitationMode = (request: Request): McpElicitationMode => { return "model"; }; +/** + * Read whether the session runs in code mode off an MCP request's `?codemode=` + * query. Code mode (the default) exposes a single `execute` tool the agent + * drives with TypeScript; `?codemode=false` selects non-code mode, where the + * agent discovers and runs tools through the `search` and `invoke` meta-tools. + * Only the literal value `false` turns code mode off; anything else keeps the + * default. + */ +export const readCodeMode = (request: Request): boolean => { + const url = new URL(request.url); + return url.searchParams.get("codemode") !== "false"; +}; + /** * Build the console approval URL for a paused execution: * `/resume/?mcp_session_id=`. The diff --git a/packages/hosts/mcp/src/in-memory-session-store.ts b/packages/hosts/mcp/src/in-memory-session-store.ts index 5c67ecbe5..1269ea301 100644 --- a/packages/hosts/mcp/src/in-memory-session-store.ts +++ b/packages/hosts/mcp/src/in-memory-session-store.ts @@ -8,6 +8,7 @@ import { buildResumeApprovalUrl, decodeResumeResponse, formatResumeAcknowledgement, + readCodeMode, readElicitationMode, } from "./browser-approval"; import { @@ -66,6 +67,12 @@ export interface McpBuildServerOptions { | { readonly mode: "model" } | { readonly mode: "native" }; readonly browserApprovalStore?: BrowserApprovalStore; + /** + * Whether the session runs in code mode (the single `execute` tool the agent + * drives with TypeScript). Defaults to `true`; `?codemode=false` selects + * non-code mode, with the `search` and `invoke` meta-tools. + */ + readonly codeMode?: boolean; } /** Build the per-session `McpServer` + engine for a principal (the host's engine + tools). */ @@ -200,7 +207,10 @@ export const makeInMemoryMcpSessionStore = ( request: Request, sessionId: () => string | null, ): McpBuildServerOptions => { - if (readElicitationMode(request) !== "browser") return { elicitationMode: { mode: "model" } }; + const codeMode = readCodeMode(request); + if (readElicitationMode(request) !== "browser") { + return { elicitationMode: { mode: "model" }, codeMode }; + } return { elicitationMode: { mode: "browser", @@ -214,6 +224,7 @@ export const makeInMemoryMcpSessionStore = ( }), }, browserApprovalStore: approvals.store, + codeMode, }; }; diff --git a/packages/hosts/mcp/src/tool-server.test.ts b/packages/hosts/mcp/src/tool-server.test.ts index d42b5f741..14675a99e 100644 --- a/packages/hosts/mcp/src/tool-server.test.ts +++ b/packages/hosts/mcp/src/tool-server.test.ts @@ -30,6 +30,9 @@ const makeStubEngine = (overrides: { execute?: ExecutionEngine["execute"]; executeWithPause?: ExecutionEngine["executeWithPause"]; resume?: ExecutionEngine["resume"]; + searchTools?: ExecutionEngine["searchTools"]; + invokeTool?: ExecutionEngine["invokeTool"]; + invokeToolWithPause?: ExecutionEngine["invokeToolWithPause"]; description?: string; }): ExecutionEngine => ({ execute: overrides.execute ?? (() => Effect.succeed({ result: "default" })), @@ -39,6 +42,13 @@ const makeStubEngine = (overrides: { resume: overrides.resume ?? (() => Effect.succeed(null)), getPausedExecution: () => Effect.succeed(null), getDescription: Effect.succeed(overrides.description ?? "test executor"), + searchTools: + overrides.searchTools ?? + (() => Effect.succeed({ items: [], total: 0, hasMore: false, nextOffset: null })), + invokeTool: overrides.invokeTool ?? (() => Effect.succeed({ result: "default" })), + invokeToolWithPause: + overrides.invokeToolWithPause ?? + (() => Effect.succeed({ status: "completed", result: { result: "default" } })), }); /** Connect a real MCP Client to our executor MCP server over in-memory transports. */ diff --git a/packages/hosts/mcp/src/tool-server.ts b/packages/hosts/mcp/src/tool-server.ts index ff7989b55..938fec54c 100644 --- a/packages/hosts/mcp/src/tool-server.ts +++ b/packages/hosts/mcp/src/tool-server.ts @@ -1,7 +1,12 @@ import { Effect, Match, Option, Schema } from "effect"; import * as Cause from "effect/Cause"; import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js"; -import { ContentBlockSchema, type ContentBlock } from "@modelcontextprotocol/sdk/types.js"; +import { + CallToolRequestSchema, + ContentBlockSchema, + ListToolsRequestSchema, + type ContentBlock, +} from "@modelcontextprotocol/sdk/types.js"; import type { jsonSchemaValidator, JsonSchemaType, @@ -10,12 +15,13 @@ import type { import { Validator } from "@cfworker/json-schema"; import * as z from "zod/v4"; -import { isToolFile } from "@executor-js/sdk"; +import { isToolFile, isToolResult } from "@executor-js/sdk"; import type { ElicitationResponse, ElicitationHandler, ElicitationContext, ElicitationRequest, + ToolError, ToolFileValue, } from "@executor-js/sdk"; import type * as Tracer from "effect/Tracer"; @@ -23,9 +29,12 @@ import { createExecutionEngine, formatExecuteResult, formatPausedExecution, + DEFAULT_SEARCH_LIMIT, + MAX_SEARCH_LIMIT, type ExecutionEngine, type ExecutionEngineConfig, type ResumeResponse, + type ToolSearchPage, } from "@executor-js/execution"; // --------------------------------------------------------------------------- @@ -91,6 +100,16 @@ type SharedMcpServerConfig = { readonly mode: "native"; }; readonly browserApprovalStore?: BrowserApprovalStore; + /** + * When `false`, run in non-code mode: instead of the single `execute` code + * tool, expose two meta-tools, `search` (find tools, ranked + paginated) and + * `invoke` (call a tool by name). This is the lazy-loading surface for clients + * that can't drive a code sandbox; it scales to any catalog size because + * `search` only ever returns a bounded page (dumping a large catalog directly + * does not). Defaults to `true` (code mode). Selected by a `?codemode=false` + * query param on the MCP endpoint. + */ + readonly codeMode?: boolean; }; export type ExecutorMcpServerConfig = @@ -419,6 +438,83 @@ const toMcpPausedResult = (formatted: ReturnType): structuredContent: formatted.structured, }); +// --------------------------------------------------------------------------- +// Non-code-mode result formatting +// +// In non-code mode each tool is called directly, so the execution result's +// `result` field is the tool's own `ToolResult` envelope rather than a +// script's return value. Unwrap it: render `data` on success (a `ToolFile` +// becomes native MCP content), surface the `ToolError` on an expected +// failure, and drop transport `http` metadata (a non-code client wants the +// payload, not pagination headers). +// --------------------------------------------------------------------------- + +const renderToolValueText = (value: unknown): string => + typeof value === "string" ? value : JSON.stringify(value ?? null, null, 2); + +const toolErrorText = (error: ToolError): string => { + const status = error.status != null ? ` (status ${error.status})` : ""; + // oxlint-disable-next-line executor/no-unknown-error-message -- boundary: ToolError is a typed struct whose `message` is a schema field, not an unknown error + return `Error [${error.code}]${status}: ${error.message}`; +}; + +const renderToolData = (data: unknown): McpToolResult => { + if (isToolFile(data)) return { content: toolFileContent(data) }; + return { + content: [{ type: "text", text: renderToolValueText(data) }], + ...(isRecord(data) ? { structuredContent: data } : {}), + }; +}; + +// A `search` result page: render the matches as text plus structured content so +// the model can read either. The page (items + total + nextOffset) is a record, +// so it rides `structuredContent` directly. +const renderSearchResult = (page: ToolSearchPage): McpToolResult => ({ + content: [{ type: "text", text: JSON.stringify(page, null, 2) }], + structuredContent: { ...page }, +}); + +// A call for a name that is neither `search`, `invoke`, nor `resume`. In +// search+invoke mode only those meta-tools are advertised; everything else is +// reached by name through `invoke`, so a direct call to a tool name is a client +// mistake worth naming explicitly. +const unknownMetaToolResult = (name: string): McpToolResult => ({ + content: [ + { + type: "text", + text: `Error: unknown tool "${name}". This connection exposes "search" (find tools) and "invoke" (call a tool by name).`, + }, + ], + structuredContent: { status: "error", error: `unknown tool: ${name}` }, + isError: true, +}); + +const toNonCodeMcpResult = (result: FormattedExecuteInput): McpToolResult => { + // Engine-level failure (declined approval, opaque defect surfaced as a + // string) — not a tool-domain failure, but still an error for the client. + if (result.error) { + return { + content: [{ type: "text", text: `Error: ${result.error}` }], + structuredContent: { status: "error", error: result.error }, + isError: true, + }; + } + const value = result.result; + if (isToolResult(value)) { + if (!value.ok) { + return { + content: [{ type: "text", text: toolErrorText(value.error) }], + structuredContent: { status: "error", error: value.error }, + isError: true, + }; + } + return renderToolData(value.data); + } + // Defensive: a direct invoke always yields a `ToolResult`, but render any + // bare value rather than dropping it. + return renderToolData(value); +}; + // `execute` failures reaching the MCP host are infra defects — domain // failures from tools are now expressed as `ToolResult` values (success // channel) and flow through `formatExecuteResult`. Emit an opaque @@ -512,6 +608,15 @@ const parseJsonContent = (raw: string): Record | undefined => { return Option.isSome(parsed) ? parsed.value : undefined; }; +// The non-code-mode dispatch reads `resume` arguments off the raw CallTool +// payload (no Zod layer in the low-level handler), so coerce defensively: an +// unknown executionId becomes "" (resolved to a not-found result) and an +// unknown action falls back to "cancel". +const readResumeAction = (value: unknown): "accept" | "decline" | "cancel" => + value === "accept" || value === "decline" || value === "cancel" ? value : "cancel"; + +const readArgString = (value: unknown): string => (typeof value === "string" ? value : ""); + // --------------------------------------------------------------------------- // Server factory // --------------------------------------------------------------------------- @@ -544,6 +649,7 @@ export const createExecutorMcpServer = ( ({ mode: "model", } as const); + const codeMode = config.codeMode ?? true; const resolveParentSpan = (): Tracer.AnySpan | undefined => { const ps = config.parentSpan; @@ -608,6 +714,16 @@ export const createExecutorMcpServer = ( }), ); + // `resume` is shared by both modes, but a paused execution can only have + // originated from the tool that this session registered: `execute` in code + // mode, a direct single-tool invoke in transparent mode. Format the resumed + // completion the same way that origin tool formats a non-paused completion, + // so a tool returns an identically-shaped result whether or not it paused. + // In transparent mode that means unwrapping the `ToolResult` envelope (so + // `data` renders natively and a failed `ToolResult` carries `isError`) + // rather than emitting the code-mode `execute` envelope. + const formatResumeCompletion = codeMode ? toMcpResult : toNonCodeMcpResult; + const resumeExecution = ( executionId: string, action: "accept" | "decline" | "cancel", @@ -635,7 +751,7 @@ export const createExecutorMcpServer = ( : undefined, }); return outcome.status === "completed" - ? toMcpResult(outcome.result) + ? formatResumeCompletion(outcome.result) : toMcpPausedResult(formatPausedExecution(outcome.execution)); }).pipe( Effect.withSpan("mcp.host.tool.resume", { @@ -698,7 +814,7 @@ export const createExecutorMcpServer = ( return missingExecutionResult(executionId); } return outcome.status === "completed" - ? toMcpResult(outcome.result) + ? formatResumeCompletion(outcome.result) : yield* requireUserResumeApproval(outcome.execution.id); }).pipe( Effect.withSpan("mcp.host.tool.resume.browser_approval", { @@ -709,77 +825,273 @@ export const createExecutorMcpServer = ( }), ); + // Non-code mode: invoke one named tool directly. Reuses the same + // elicitation/pause machinery as `executeCode`, so an approval-gated tool + // pauses and resumes identically whether the model reached it through + // `execute` or called it by name. + const invokeSingleTool = (name: string, args: unknown): Effect.Effect => + Effect.gen(function* () { + debugLog("invoke_tool.call", { + name, + elicitationMode: elicitationMode.mode, + clientCapabilities: server.server.getClientCapabilities() ?? null, + }); + if (elicitationMode.mode === "native") { + const result = yield* engine.invokeTool(name, args, { + onElicitation: makeMcpElicitationHandler(server, debugLog), + }); + return toNonCodeMcpResult(result); + } + const outcome = yield* engine.invokeToolWithPause(name, args); + debugLog("invoke_tool.paused_flow_result", { + name, + status: outcome.status, + executionId: outcome.status === "paused" ? outcome.execution.id : undefined, + interactionKind: + outcome.status === "paused" + ? pausedInteractionKind(outcome.execution.elicitationContext.request) + : undefined, + }); + return outcome.status === "completed" + ? toNonCodeMcpResult(outcome.result) + : elicitationMode.mode === "browser" + ? yield* requireUserResumeApproval(outcome.execution.id) + : toMcpPausedResult(formatPausedExecution(outcome.execution)); + }).pipe( + Effect.withSpan("mcp.host.tool.invoke", { + attributes: { "mcp.tool.name": name }, + }), + ); + // --- tools --- + // Code mode registers the single `execute` tool (plus mode-specific + // `resume`) via the high-level wrapper. Transparent mode skips that and + // serves every tool through the low-level request handlers instead — the + // two registration styles are mutually exclusive on one server. + + if (codeMode) { + yield* Effect.sync(() => + server.registerTool( + "execute", + { + description, + inputSchema: { code: z.string().trim().min(1) }, + }, + ({ code }) => runToolEffect(executeCode(code)), + ), + ).pipe( + Effect.withSpan("mcp.host.register_tool", { + attributes: { "mcp.tool.name": "execute" }, + }), + ); - yield* Effect.sync(() => - server.registerTool( - "execute", - { - description, - inputSchema: { code: z.string().trim().min(1) }, - }, - ({ code }) => runToolEffect(executeCode(code)), - ), - ).pipe( - Effect.withSpan("mcp.host.register_tool", { - attributes: { "mcp.tool.name": "execute" }, - }), - ); + yield* Effect.sync(() => { + if (elicitationMode.mode === "native") { + return undefined; + } - yield* Effect.sync(() => { - if (elicitationMode.mode === "native") { - return undefined; - } + if (elicitationMode.mode === "model") { + return server.registerTool( + "resume", + { + description: [ + "Resume a paused execution using the executionId returned by execute.", + "This connection explicitly allows model-side resume via elicitation_mode=model.", + ].join("\n"), + inputSchema: { + executionId: z.string().describe("The execution ID from the paused result"), + action: z + .enum(["accept", "decline", "cancel"]) + .describe("How to respond to the interaction"), + content: z + .string() + .describe("Optional JSON-encoded response content for form elicitations") + .default("{}"), + }, + }, + ({ executionId, action, content: rawContent }) => + runToolEffect(resumeExecution(executionId, action, parseJsonContent(rawContent))), + ); + } - if (elicitationMode.mode === "model") { return server.registerTool( "resume", { description: [ - "Resume a paused execution using the executionId returned by execute.", - "This connection explicitly allows model-side resume via elicitation_mode=model.", + "Request user approval to resume a paused execution.", + "Call this with the executionId returned by execute. If the user has not approved in the browser yet, tell them to open the returned approval URL. If they have approved, this returns the resumed execution result.", + "This connection does not allow the model to choose accept, decline, cancel, or content.", ].join("\n"), inputSchema: { executionId: z.string().describe("The execution ID from the paused result"), - action: z - .enum(["accept", "decline", "cancel"]) - .describe("How to respond to the interaction"), - content: z - .string() - .describe("Optional JSON-encoded response content for form elicitations") - .default("{}"), }, }, - ({ executionId, action, content: rawContent }) => - runToolEffect(resumeExecution(executionId, action, parseJsonContent(rawContent))), + ({ executionId }) => runToolEffect(resumeAfterBrowserApproval(executionId)), ); - } - - return server.registerTool( - "resume", - { - description: [ - "Request user approval to resume a paused execution.", - "Call this with the executionId returned by execute. If the user has not approved in the browser yet, tell them to open the returned approval URL. If they have approved, this returns the resumed execution result.", - "This connection does not allow the model to choose accept, decline, cancel, or content.", - ].join("\n"), - inputSchema: { - executionId: z.string().describe("The execution ID from the paused result"), + }).pipe( + Effect.withSpan("mcp.host.register_tool", { + attributes: { "mcp.tool.name": "resume" }, + }), + ); + } else { + // Non-code mode: instead of dumping the whole catalog (a large catalog + // produces a tools/list far too big for clients to load or the runtime to + // hold), expose two meta-tools, `search` and `invoke`. The client searches + // for the handful of tools it needs and invokes them by name. This is the + // lazy-loading counterpart to code mode's `execute`, and it scales to any + // catalog size because `search` only ever returns a bounded page. + const searchWireTool = { + name: "search", + description: [ + "Search the available tools by keyword. Returns ranked matches, each with its input schema, so you can call it with `invoke`.", + `Page with \`limit\` (default ${DEFAULT_SEARCH_LIMIT}, max ${MAX_SEARCH_LIMIT}) and \`offset\`; \`total\` and \`nextOffset\` in the result tell you whether there is more.`, + ].join("\n"), + inputSchema: { + type: "object" as const, + properties: { + query: { + type: "string", + description: + "Keywords matched against tool names and descriptions. Empty returns the top tools.", + }, + limit: { + type: "number", + description: `Maximum matches to return (default ${DEFAULT_SEARCH_LIMIT}, max ${MAX_SEARCH_LIMIT}).`, + }, + offset: { + type: "number", + description: "Offset into the ranked results, for pagination.", + }, }, }, - ({ executionId }) => runToolEffect(resumeAfterBrowserApproval(executionId)), + }; + const invokeWireTool = { + name: "invoke", + description: [ + "Invoke a tool by name with its arguments.", + "Get the tool `name` and its input schema from `search` first, then pass `arguments` matching that schema.", + ].join("\n"), + inputSchema: { + type: "object" as const, + properties: { + name: { type: "string", description: "The tool name exactly as returned by `search`." }, + arguments: { + type: "object", + description: "Arguments object matching the tool's input schema.", + additionalProperties: true, + }, + }, + required: ["name"], + }, + }; + const resumeWireTool = + elicitationMode.mode === "native" + ? undefined + : elicitationMode.mode === "model" + ? { + name: "resume", + description: [ + "Resume a paused tool call using the executionId returned by a paused result.", + "This connection explicitly allows model-side resume via elicitation_mode=model.", + ].join("\n"), + inputSchema: { + type: "object" as const, + properties: { + executionId: { + type: "string", + description: "The execution ID from the paused result", + }, + action: { + type: "string", + enum: ["accept", "decline", "cancel"], + description: "How to respond to the interaction", + }, + content: { + type: "string", + description: "Optional JSON-encoded response content for form elicitations", + default: "{}", + }, + }, + required: ["executionId", "action"], + }, + } + : { + name: "resume", + description: [ + "Request user approval to resume a paused tool call.", + "Call this with the executionId returned by a paused result. If the user has not approved in the browser yet, tell them to open the returned approval URL. If they have approved, this returns the resumed result.", + "This connection does not allow the model to choose accept, decline, cancel, or content.", + ].join("\n"), + inputSchema: { + type: "object" as const, + properties: { + executionId: { + type: "string", + description: "The execution ID from the paused result", + }, + }, + required: ["executionId"], + }, + }; + + const wireTools = [ + searchWireTool, + invokeWireTool, + ...(resumeWireTool ? [resumeWireTool] : []), + ]; + + yield* Effect.sync(() => { + // `registerTool` normally declares this; the low-level handlers bypass it. + server.server.registerCapabilities({ tools: { listChanged: false } }); + + server.server.setRequestHandler(ListToolsRequestSchema, () => ({ tools: wireTools })); + + server.server.setRequestHandler(CallToolRequestSchema, (request) => { + const { name } = request.params; + const args = request.params.arguments ?? {}; + if (name === "resume" && elicitationMode.mode !== "native") { + if (elicitationMode.mode === "browser") { + return runToolEffect(resumeAfterBrowserApproval(readArgString(args.executionId))); + } + return runToolEffect( + resumeExecution( + readArgString(args.executionId), + readResumeAction(args.action), + parseJsonContent(typeof args.content === "string" ? args.content : "{}"), + ), + ); + } + if (name === "search") { + return runToolEffect( + engine + .searchTools({ + query: typeof args.query === "string" ? args.query : "", + limit: typeof args.limit === "number" ? args.limit : undefined, + offset: typeof args.offset === "number" ? args.offset : undefined, + }) + .pipe(Effect.map(renderSearchResult)), + ); + } + if (name === "invoke") { + const toolName = readArgString(args.name); + const toolArgs = isRecord(args.arguments) ? args.arguments : {}; + return runToolEffect(invokeSingleTool(toolName, toolArgs)); + } + return runToolEffect(Effect.succeed(unknownMetaToolResult(name))); + }); + }).pipe( + Effect.withSpan("mcp.host.register_search_invoke", { + attributes: { "mcp.tool.count": wireTools.length }, + }), ); - }).pipe( - Effect.withSpan("mcp.host.register_tool", { - attributes: { "mcp.tool.name": "resume" }, - }), - ); + } yield* Effect.sync(() => { console.error( "[executor] MCP session mode", JSON.stringify({ ...capabilitySnapshot(server), + codeMode, elicitationMode: elicitationMode.mode, resumeEnabled: elicitationMode.mode !== "native", }), @@ -787,6 +1099,7 @@ export const createExecutorMcpServer = ( debugLog("tool.visibility", { clientCapabilities: server.server.getClientCapabilities() ?? null, elicitationSupport: getElicitationSupport(server), + codeMode, elicitationMode: elicitationMode.mode, resumeEnabled: elicitationMode.mode !== "native", });