diff --git a/e2e/scenarios/mcp-execute.test.ts b/e2e/scenarios/mcp-execute.test.ts index 811f342cc..925bd8efa 100644 --- a/e2e/scenarios/mcp-execute.test.ts +++ b/e2e/scenarios/mcp-execute.test.ts @@ -23,3 +23,28 @@ scenario( expect(result.text, "the sandbox returns the value").toBe("42"); }), ); + +scenario( + "MCP · a syntax error returns a descriptive message, not an opaque internal error", + {}, + Effect.gen(function* () { + const target = yield* Target; + const mcp = yield* Mcp; + const identity = yield* target.newIdentity(); + const session = mcp.session(identity); + + // A genuine parse error (a `const` with no binding) is the user's own + // mistake, surfaced before the code ever runs. The model needs the real + // reason to self-correct, so the sandbox must report it descriptively + // instead of collapsing it to "Internal tool error [id]". + const result = yield* session.call("execute", { + code: "const = 5; return 1;", + }); + + expect(result.ok, "a syntax error is reported as an error result").toBe(false); + expect(result.text, "the parser's reason reaches the model").toContain("Unexpected"); + expect(result.text, "the opaque mask is not used for syntax errors").not.toContain( + "Internal tool error", + ); + }), +); diff --git a/packages/hosts/mcp/src/tool-server.test.ts b/packages/hosts/mcp/src/tool-server.test.ts index d42b5f741..a178a2f54 100644 --- a/packages/hosts/mcp/src/tool-server.test.ts +++ b/packages/hosts/mcp/src/tool-server.test.ts @@ -815,6 +815,33 @@ describe("MCP host server — native elicitation mode", () => { }); }); + it("surfaces a code syntax error descriptively, not as an opaque internal error", async () => { + // The runtime classifies a genuine parse error (a compile failure in + // the user's own code) onto the success channel as ExecuteResult.error. + // It must reach the model verbatim so it can self-correct, in contrast + // to a genuine Effect failure, which stays opaque (see the + // "execution failure stays opaque" test that asserts the masked form). + const engine = makeStubEngine({ + execute: () => + Effect.succeed({ + result: null, + error: 'Unexpected token, expected "," (1:54)', + }), + }); + + await withNativeClient(engine, ELICITATION_CAPS, async (client) => { + const result = await client.callTool({ + name: "execute", + arguments: { code: "const { items } = await tools.search({ query: 'x' }" }, + }); + expect(result.isError).toBe(true); + const text = textOf(result); + expect(text).toContain("Unexpected token"); + expect(text).toContain("(1:54)"); + expect(text).not.toContain("Internal tool error"); + }); + }); + it("resume tool is hidden in native elicitation mode", async () => { await withNativeClient(makeStubEngine({}), ELICITATION_CAPS, async (client) => { const { tools } = await client.listTools(); diff --git a/packages/kernel/core/src/effect-errors.ts b/packages/kernel/core/src/effect-errors.ts index 8e22d2406..2b391a372 100644 --- a/packages/kernel/core/src/effect-errors.ts +++ b/packages/kernel/core/src/effect-errors.ts @@ -19,3 +19,40 @@ export class CodeExecutionError extends Data.TaggedError("CodeExecutionError")<{ readonly message: string; readonly cause?: unknown; }> {} + +/** + * Raised when user code fails to compile before it ever runs: a genuine + * syntax/parse error (smart quotes from a copy-paste, an unbalanced + * brace, `const = 5`) caught while stripping TypeScript ahead of the + * JS-only sandbox. Unlike `CodeExecutionError` this is the user's + * mistake, not a sandbox defect, so runtimes surface its `message` + * through the descriptive `ExecuteResult.error` channel instead of + * collapsing it to an opaque internal-error string. The original parser + * message (e.g. "Unexpected token (1:54)") is carried verbatim so the + * model can see and fix it. + */ +export class CodeCompilationError extends Data.TaggedError("CodeCompilationError")<{ + readonly runtime: string; + readonly message: string; + readonly cause?: unknown; +}> {} + +/** + * Raised when the sandbox cannot run the user's code to completion for a + * reason that is not a syntax error but is still safe and useful to + * report back: the returned value is not serializable across the sandbox + * boundary (a Symbol, a host object like `Cloudflare`), the code exceeded + * the isolate's CPU or memory limit, or the sandbox is momentarily at + * capacity. Like `CodeCompilationError`, and unlike `CodeExecutionError`, + * this describes the user's own code or a transient, retryable condition + * rather than an executor defect, so runtimes surface its verbatim + * `message` through the descriptive `ExecuteResult.error` channel instead + * of collapsing it to an opaque internal-error string. Genuinely + * unexpected sandbox defects stay on `CodeExecutionError` and remain + * opaque, preserving the host's failure-channel boundary. + */ +export class SandboxRuntimeError extends Data.TaggedError("SandboxRuntimeError")<{ + readonly runtime: string; + readonly message: string; + readonly cause?: unknown; +}> {} diff --git a/packages/kernel/runtime-dynamic-worker/src/executor.ts b/packages/kernel/runtime-dynamic-worker/src/executor.ts index b42727e24..4c0715364 100644 --- a/packages/kernel/runtime-dynamic-worker/src/executor.ts +++ b/packages/kernel/runtime-dynamic-worker/src/executor.ts @@ -14,7 +14,9 @@ import * as Data from "effect/Data"; import * as Effect from "effect/Effect"; import { + CodeCompilationError, recoverExecutionBody, + SandboxRuntimeError, stripTypeScript, type CodeExecutor, type ExecuteOutputItem, @@ -161,6 +163,103 @@ const renderTransportMessage = (value: unknown): string => { return String(value); }; +const serializedErrorName = (value: SerializedWorkerErrorValue): string | null => + typeof value === "object" && + value !== null && + "name" in value && + typeof (value as { name?: unknown }).name === "string" + ? (value as { name: string }).name + : null; + +/** + * Signatures of a compile error in the user's own code. Not all syntax + * errors are caught while stripping TypeScript: smart quotes from a + * paste, an unbalanced brace, and other plain-JS parse errors slip past + * sucrase and only fail when workerd compiles the generated module, + * surfacing as "Failed to start Worker: Uncaught SyntaxError: ...". The + * bare V8 phrasings ("Unexpected token ...", "Invalid or unexpected + * token") are matched too, since some surfaces report the inner message + * without the wrapper. + */ +const COMPILE_SIGNATURES = [ + "Failed to start Worker", + "SyntaxError", + "Unexpected token", + "Invalid or unexpected token", +] as const; + +/** + * Signatures of a sandbox runtime condition that is the user's own + * concern (or transient and retryable) rather than an executor defect: a + * non-serializable return value, the isolate's CPU or memory limit, and + * being momentarily at worker capacity. These are the real categories + * seen in production on the `executor.runtime.*` spans, all of which were + * being collapsed to an opaque internal error before the model could act + * on them. + */ +const RUNTIME_SIGNATURES = [ + "could not be cloned", + "does not support serialization", + "Could not serialize", + "exceeded CPU", + "exceeded memory", + "Too many concurrent dynamic workers", +] as const; + +export type SandboxFailureKind = "compilation" | "runtime" | "internal"; + +/** + * Classify a sandbox rejection so the runtime knows whether to surface + * its message descriptively (the user's mistake or a transient, + * safe-to-report condition) or collapse it to an opaque internal error (a + * genuine, unexpected sandbox defect). Tool-invocation failures never + * reach here: the sandbox reports those through its own result envelope, + * so this only sees module compile failures, return-value serialization + * failures, isolate resource limits, and capacity rejections. Anything + * unrecognized stays "internal" and opaque, preserving the host's + * failure-channel boundary. + */ +export const classifySandboxFailure = ( + serialized: SerializedWorkerErrorValue, + message: string, +): SandboxFailureKind => { + const name = serializedErrorName(serialized); + if ( + name === "SyntaxError" || + COMPILE_SIGNATURES.some((signature) => message.includes(signature)) + ) { + return "compilation"; + } + if ( + name === "DataCloneError" || + RUNTIME_SIGNATURES.some((signature) => message.includes(signature)) + ) { + return "runtime"; + } + return "internal"; +}; + +/** + * Map a raw sandbox rejection (a thrown value from the worker loader or + * the `evaluate` RPC) to the typed error its classification calls for. + * Compilation and runtime conditions carry the verbatim message through + * the descriptive channel; unrecognized defects stay opaque. + */ +const toSandboxFailure = ( + cause: unknown, +): CodeCompilationError | SandboxRuntimeError | DynamicWorkerExecutionError => { + const serialized = serializeWorkerErrorValue(cause); + const message = renderTransportMessage(serialized); + switch (classifySandboxFailure(serialized, message)) { + case "compilation": + return new CodeCompilationError({ runtime: "dynamic-worker", message, cause }); + case "runtime": + return new SandboxRuntimeError({ runtime: "dynamic-worker", message, cause }); + default: + return new DynamicWorkerExecutionError({ message }); + } +}; + export const serializeWorkerCause = (cause: Cause.Cause): SerializedWorkerError => { const failures = cause.reasons .filter(Cause.isFailReason) @@ -433,36 +532,55 @@ const startDynamicWorker = ( options: DynamicWorkerExecutorOptions, code: string, timeoutMs: number, -): Effect.Effect => - Effect.try({ - try: (): DynamicWorkerEntrypoint => { - const recoveredBody = recoverExecutionBody(code); - // The dynamic Worker isolate only accepts plain JavaScript; TS type - // syntax in user code (`: T`, `as T`, generics) would otherwise - // surface as "Unexpected token ':'" inside `evaluate()` and bubble - // out via DynamicWorkerExecutionError. Stripping here gives the - // model a clear syntax-error message at the front door instead. - const strippedBody = stripTypeScript(recoveredBody); - const executorModule = buildExecutorModule(strippedBody, timeoutMs); - const { [ENTRY_MODULE]: _, ...safeModules } = options.modules ?? {}; - - const worker = options.loader.get(`executor-${crypto.randomUUID()}`, () => ({ - compatibilityDate: "2025-06-01", - compatibilityFlags: ["nodejs_compat"], - mainModule: ENTRY_MODULE, - modules: { - ...safeModules, - [ENTRY_MODULE]: executorModule, - }, - globalOutbound: options.globalOutbound ?? null, - })); - - return asDynamicWorkerEntrypoint(worker.getEntrypoint()); - }, - catch: (cause) => - new DynamicWorkerExecutionError({ - message: renderTransportMessage(serializeWorkerErrorValue(cause)), - }), +): Effect.Effect< + DynamicWorkerEntrypoint, + DynamicWorkerExecutionError | CodeCompilationError | SandboxRuntimeError +> => + Effect.gen(function* () { + // The dynamic Worker isolate only accepts plain JavaScript; TS type + // syntax in user code (`: T`, `as T`, generics) would otherwise + // surface as "Unexpected token ':'" inside `evaluate()`. Stripping + // here means valid TS just works. But this step is also where a + // genuine syntax error (smart quotes from a paste, an unbalanced + // brace, `const = 5`) first surfaces, with the parser's precise + // "Unexpected token (line:col)" message. That is the user's mistake, + // not a sandbox defect, so it gets its own `CodeCompilationError` + // and flows back through the descriptive `ExecuteResult.error` + // channel rather than collapsing to an opaque internal error. + const strippedBody = yield* Effect.try({ + try: () => stripTypeScript(recoverExecutionBody(code)), + catch: (cause) => + new CodeCompilationError({ + runtime: "dynamic-worker", + message: renderTransportMessage(serializeWorkerErrorValue(cause)), + cause, + }), + }); + + return yield* Effect.try({ + try: (): DynamicWorkerEntrypoint => { + const executorModule = buildExecutorModule(strippedBody, timeoutMs); + const { [ENTRY_MODULE]: _, ...safeModules } = options.modules ?? {}; + + const worker = options.loader.get(`executor-${crypto.randomUUID()}`, () => ({ + compatibilityDate: "2025-06-01", + compatibilityFlags: ["nodejs_compat"], + mainModule: ENTRY_MODULE, + modules: { + ...safeModules, + [ENTRY_MODULE]: executorModule, + }, + globalOutbound: options.globalOutbound ?? null, + })); + + return asDynamicWorkerEntrypoint(worker.getEntrypoint()); + }, + // A compile error that escaped the strip step, or a capacity + // rejection, can surface here at worker startup rather than at + // `evaluate`. Classify it so the user-actionable reason reaches the + // model instead of an opaque internal error. + catch: toSandboxFailure, + }); }).pipe( Effect.withSpan("executor.runtime.startup", { attributes: { @@ -478,7 +596,10 @@ const evaluate = ( options: DynamicWorkerExecutorOptions, code: string, toolInvoker: SandboxToolInvoker, -): Effect.Effect => { +): Effect.Effect< + ExecuteResult, + DynamicWorkerExecutionError | CodeCompilationError | SandboxRuntimeError +> => { const timeoutMs = Math.max(100, options.timeoutMs ?? DEFAULT_TIMEOUT_MS); return Effect.gen(function* () { @@ -487,10 +608,11 @@ const evaluate = ( const entrypoint = yield* startDynamicWorker(options, code, timeoutMs); const response = yield* Effect.tryPromise({ try: () => entrypoint.evaluate(dispatcher), - catch: (cause) => - new DynamicWorkerExecutionError({ - message: renderTransportMessage(serializeWorkerErrorValue(cause)), - }), + // The evaluate RPC rejects for module compile failures that escaped + // the strip step, non-serializable return values, isolate resource + // limits, and capacity. All are the user's concern or transient, so + // classify and surface them descriptively rather than opaquely. + catch: toSandboxFailure, }).pipe( Effect.withSpan("executor.runtime.evaluate", { attributes: { "executor.runtime": "dynamic-worker" }, @@ -517,6 +639,20 @@ const runInDynamicWorker = ( toolInvoker: SandboxToolInvoker, ): Effect.Effect => evaluate(options, code, toolInvoker).pipe( + // A compile error or a reportable sandbox runtime condition (a + // non-serializable result, a CPU/memory limit, capacity) is the + // user's own concern or transient, not a sandbox defect. Fold both + // into the success channel as a descriptive `ExecuteResult.error` so + // the precise reason reaches the model, exactly as a thrown runtime + // error does, instead of being collapsed to an opaque internal error + // by the host failure path. Unrecognized defects stay on + // `DynamicWorkerExecutionError` and remain opaque. + Effect.catchTags({ + CodeCompilationError: (error) => + Effect.succeed({ result: null, error: error.message } satisfies ExecuteResult), + SandboxRuntimeError: (error) => + Effect.succeed({ result: null, error: error.message } satisfies ExecuteResult), + }), Effect.withSpan("executor.code.exec.dynamic_worker", { attributes: { "executor.runtime": "dynamic-worker" }, }), diff --git a/packages/kernel/runtime-dynamic-worker/src/invocation.test.ts b/packages/kernel/runtime-dynamic-worker/src/invocation.test.ts index 1a8869f64..41e0cb65a 100644 --- a/packages/kernel/runtime-dynamic-worker/src/invocation.test.ts +++ b/packages/kernel/runtime-dynamic-worker/src/invocation.test.ts @@ -6,6 +6,7 @@ import * as Effect from "effect/Effect"; import type { SandboxToolInvoker } from "@executor-js/codemode-core"; import { ExecutionToolError } from "@executor-js/execution"; import { + classifySandboxFailure, ToolDispatcher, makeDynamicWorkerExecutor, renderWorkerError, @@ -156,6 +157,52 @@ describe("serializeWorkerCause", () => { }); }); +describe("classifySandboxFailure", () => { + // Each string here is a real `status.message` seen on the + // executor.runtime.* spans in production. They reject the worker loader + // or the evaluate RPC and were collapsed to an opaque "Internal tool + // error" before the model could act on them. CPU/memory/capacity limits + // can't be triggered deterministically inside a unit-test isolate, so we + // pin their classification here; the syntax and serialization paths also + // have live WorkerLoader coverage above. + it.each([ + ["Failed to start Worker:\nUncaught SyntaxError: Unexpected token '='", "compilation"], + ["Unexpected token '{'", "compilation"], + ["Invalid or unexpected token", "compilation"], + ["Symbol(nope) could not be cloned.", "runtime"], + [ + 'Could not serialize object of type "Cloudflare". This type does not support serialization.', + "runtime", + ], + ["Worker exceeded CPU time limit.", "runtime"], + ["Worker exceeded memory limit.", "runtime"], + ["Too many concurrent dynamic workers", "runtime"], + ] as const)("classifies %j as %s", (message, expected) => { + expect(classifySandboxFailure({ __type: "Error", name: "Error", message }, message)).toBe( + expected, + ); + }); + + it("classifies a SyntaxError by name even when the message is bare", () => { + expect( + classifySandboxFailure({ __type: "Error", name: "SyntaxError", message: "boom" }, "boom"), + ).toBe("compilation"); + }); + + it("classifies a DataCloneError by name even when the message is bare", () => { + expect( + classifySandboxFailure({ __type: "Error", name: "DataCloneError", message: "boom" }, "boom"), + ).toBe("runtime"); + }); + + it("leaves an unrecognized defect opaque (internal)", () => { + const message = "the RPC receiver does not implement the method"; + expect(classifySandboxFailure({ __type: "Error", name: "TypeError", message }, message)).toBe( + "internal", + ); + }); +}); + describe("makeDynamicWorkerExecutor", () => { const loader = (env as { LOADER: WorkerLoader }).LOADER; @@ -271,6 +318,59 @@ describe("makeDynamicWorkerExecutor", () => { expect(result.error).toBe("Internal tool error"); }); + it("surfaces a syntax error with the parser's descriptive message, not an opaque generic", async () => { + const executor = makeDynamicWorkerExecutor({ loader }); + const invoker = makeInvoker(() => null); + + // A genuine parse error: `const` with no binding name. Before the fix + // this threw in stripTypeScript, became a DynamicWorkerExecutionError + // on the failure channel, and the host collapsed it to the opaque + // "Internal tool error". The model needs the real reason to self-correct. + const result = await Effect.runPromise( + executor.execute("async () => { const = 5; return 1; }", invoker), + ); + + expect(result.result).toBeNull(); + expect(result.error).toBeDefined(); + expect(result.error).not.toBe("Internal tool error"); + expect(result.error).not.toContain("Internal tool error"); + expect(result.error?.toLowerCase()).toContain("unexpected"); + }); + + it("surfaces smart-quote paste errors descriptively", async () => { + const executor = makeDynamicWorkerExecutor({ loader }); + const invoker = makeInvoker(() => null); + + // Curly quotes from a copy-paste are the most common real-world cause: + // the snippet looks fine to a human but is invalid JavaScript. + const result = await Effect.runPromise( + executor.execute("async () => { return “hello”; }", invoker), + ); + + expect(result.result).toBeNull(); + expect(result.error).toBeDefined(); + expect(result.error).not.toBe("Internal tool error"); + }); + + it("surfaces a non-serializable return value descriptively, not opaquely", async () => { + const executor = makeDynamicWorkerExecutor({ loader }); + const invoker = makeInvoker(() => null); + + // Returning a value that can't cross the sandbox boundary (a Symbol, a + // host object) rejects the evaluate RPC with a DataCloneError. That is + // the user's own code, not a sandbox defect, so the model needs to be + // told what it returned can't be serialized rather than getting an + // opaque "Internal tool error". Production analog: "Could not serialize + // object of type Cloudflare". + const result = await Effect.runPromise(executor.execute("async () => Symbol('nope')", invoker)); + + expect(result.result).toBeNull(); + expect(result.error).toBeDefined(); + expect(result.error).not.toBe("Internal tool error"); + expect(result.error).not.toContain("Internal tool error"); + expect(result.error?.toLowerCase()).toContain("could not be cloned"); + }); + it("preserves public ExecutionToolError messages across the worker bridge", async () => { const executor = makeDynamicWorkerExecutor({ loader }); const invoker = {