From dbb21394654953e5236b6b658ca4a7bdb5ebafeb Mon Sep 17 00:00:00 2001 From: jgreer013 <18727435+jgreer013@users.noreply.github.com> Date: Thu, 18 Jun 2026 10:29:00 -0700 Subject: [PATCH] fix(litellm): store bookkeeping span off-band, not in forwarded metadata With LiteLLMIntegration enabled, any call passing caller `metadata` crashed during request serialization. `_input_callback` stored the live Span in the caller's `metadata` dict, and some providers (e.g. Anthropic's /v1/messages passthrough) forward that dict into the outbound request body, so `json.dumps(request_body)` raised `TypeError: Object of type Span is not JSON serializable` before the request was sent. The span (holding the verbatim prompt under send_default_pii) could also leak to the provider. Stash the span on a top-level key of the per-request kwargs dict (litellm's `model_call_details`) that litellm threads through the input/success/failure callbacks, instead of in the forwarded `metadata` sub-dict. This ties the span's lifetime to the request with no module-level tracking, mirroring how the clickhouse/dramatiq integrations stash a span on their per-request object. The Anthropic request body is built only from recognized request params, not from `model_call_details`, so the span is never serialized onto the wire (verified end-to-end against the passthrough). Fixes #6596 Co-Authored-By: Claude Opus 4.8 (1M context) --- sentry_sdk/integrations/litellm.py | 45 ++++--- tests/integrations/litellm/test_litellm.py | 133 +++++++++++++++++++++ 2 files changed, 163 insertions(+), 15 deletions(-) diff --git a/sentry_sdk/integrations/litellm.py b/sentry_sdk/integrations/litellm.py index 402676defa..d6c54cdf8c 100644 --- a/sentry_sdk/integrations/litellm.py +++ b/sentry_sdk/integrations/litellm.py @@ -31,16 +31,32 @@ raise DidNotEnable("LiteLLM not installed") -def _get_metadata_dict(kwargs: "Dict[str, Any]") -> "Dict[str, Any]": - """Get the metadata dictionary from the kwargs.""" - litellm_params = kwargs.setdefault("litellm_params", {}) +# litellm threads the SAME `kwargs` dict (its per-request ``model_call_details``) +# through the input, success, and failure callbacks, so the bookkeeping span is +# stashed on it directly. This ties the span's lifetime to the request -- it is +# freed when litellm releases the request -- with no module-level tracking. +# +# The span must NOT go in ``kwargs["litellm_params"]["metadata"]``: litellm +# forwards that caller ``metadata`` dict into the outbound request body for some +# providers (e.g. Anthropic's /v1/messages passthrough), which would break +# ``json.dumps(request_body)`` and leak the span (and its prompt data) to the +# provider. The Anthropic request body is built only from the recognized request +# params, not from ``model_call_details``, so a top-level key here is not +# forwarded -- it is also where litellm stores its own per-request internal state +# (e.g. ``agentic_loop_params``). +_SPAN_KEY = "_sentry_span" - # we need this weird little dance, as metadata might be set but may be None initially - metadata = litellm_params.get("metadata") - if metadata is None: - metadata = {} - litellm_params["metadata"] = metadata - return metadata + +def _store_span(kwargs: "Dict[str, Any]", span: "Any") -> None: + kwargs[_SPAN_KEY] = span + + +def _peek_span(kwargs: "Dict[str, Any]") -> "Any": + return kwargs.get(_SPAN_KEY) + + +def _pop_span(kwargs: "Dict[str, Any]") -> "Any": + return kwargs.pop(_SPAN_KEY, None) def _convert_message_parts(messages: "List[Dict[str, Any]]") -> "List[Dict[str, Any]]": @@ -117,8 +133,8 @@ def _input_callback(kwargs: "Dict[str, Any]") -> None: ) span.__enter__() - # Store span for later - _get_metadata_dict(kwargs)["_sentry_span"] = span + # Store span for later, off-band from the kwargs litellm may forward. + _store_span(kwargs, span) # Set basic data set_data_normalized(span, SPANDATA.GEN_AI_SYSTEM, provider) @@ -198,8 +214,7 @@ def _success_callback( ) -> None: """Handle successful completion.""" - metadata = _get_metadata_dict(kwargs) - span = metadata.get("_sentry_span") + span = _peek_span(kwargs) if span is None: return @@ -259,7 +274,7 @@ def _success_callback( or "complete_streaming_response" in kwargs or "async_complete_streaming_response" in kwargs ): - span = metadata.pop("_sentry_span", None) + span = _pop_span(kwargs) if span is not None: span.__exit__(None, None, None) @@ -285,7 +300,7 @@ def _failure_callback( end_time: "datetime", ) -> None: """Handle request failure.""" - span = _get_metadata_dict(kwargs).get("_sentry_span") + span = _pop_span(kwargs) if span is None: return diff --git a/tests/integrations/litellm/test_litellm.py b/tests/integrations/litellm/test_litellm.py index 39e173049b..e7f0dbc960 100644 --- a/tests/integrations/litellm/test_litellm.py +++ b/tests/integrations/litellm/test_litellm.py @@ -2532,6 +2532,139 @@ def test_integration_setup(sentry_init): assert _failure_callback in (litellm.failure_callback or []) +def test_caller_metadata_stays_json_serializable( + sentry_init, + capture_events, +): + """Regression test for GH-6596. + + litellm threads the caller's ``metadata`` dict into ``litellm_params`` and + some providers (e.g. Anthropic's ``/v1/messages`` passthrough) serialize it + into the outbound request body *before the response comes back*. The + integration must therefore never write its live ``Span`` into that dict, or + ``json.dumps(request_body)`` raises ``TypeError: Object of type Span is not + JSON serializable`` before the request is even sent. + """ + sentry_init( + integrations=[LiteLLMIntegration()], + disabled_integrations=[StdlibIntegration], + traces_sample_rate=1.0, + send_default_pii=True, + _experiments={"trace_lifecycle": "static"}, + ) + events = capture_events() + + # Mirror the kwargs litellm hands to its callbacks: the caller's metadata + # lives under litellm_params and is the very dict forwarded onto the wire. + caller_metadata = {"user_id": "my-org"} + kwargs = { + "model": "gpt-3.5-turbo", + "messages": [{"role": "user", "content": "Hello!"}], + "litellm_call_id": "call-6596", + "litellm_params": {"metadata": caller_metadata}, + } + + with start_transaction(name="litellm test"): + _input_callback(kwargs) + + # litellm would serialize the request body here, while the span is live. + # The live span must not be in the forwarded metadata... + assert "_sentry_span" not in caller_metadata + # ...so the request body remains JSON-serializable. + json.dumps(caller_metadata) + + # The span is still recorded off-band, so monitoring keeps working. + _success_callback( + kwargs, MockCompletionResponse(), datetime.now(), datetime.now() + ) + + (event,) = events + chat_spans = [ + span + for span in event["spans"] + if span["op"] == OP.GEN_AI_CHAT and span["origin"] == "auto.ai.litellm" + ] + assert len(chat_spans) == 1 + + +def test_span_stashed_on_shared_kwargs_not_forwarded_metadata(sentry_init): + """The span is stashed on the shared kwargs dict (a top-level key litellm + does not forward), never in the caller's metadata, and each call keeps its + own span. + """ + sentry_init( + integrations=[LiteLLMIntegration()], + disabled_integrations=[StdlibIntegration], + traces_sample_rate=1.0, + _experiments={"trace_lifecycle": "static"}, + ) + + with start_transaction(name="litellm test"): + caller_metadata = {"user_id": "my-org"} + kwargs_a = { + "model": "gpt-3.5-turbo", + "messages": [{"role": "user", "content": "a"}], + "litellm_params": {"metadata": caller_metadata}, + } + kwargs_b = { + "model": "gpt-3.5-turbo", + "messages": [{"role": "user", "content": "b"}], + } + + _input_callback(kwargs_a) + _input_callback(kwargs_b) + + # Stashed on the shared kwargs dict, off the forwarded metadata path... + assert kwargs_a["_sentry_span"] is not None + assert "_sentry_span" not in caller_metadata + json.dumps(caller_metadata) + # ...and each call keeps its own span (no cross-talk). + assert kwargs_a["_sentry_span"] is not kwargs_b["_sentry_span"] + + _success_callback( + kwargs_a, MockCompletionResponse(), datetime.now(), datetime.now() + ) + _success_callback( + kwargs_b, MockCompletionResponse(), datetime.now(), datetime.now() + ) + + +def test_span_cleaned_up_after_terminal_callbacks(sentry_init): + """Both terminal callbacks remove the span from the shared kwargs dict, so a + completed or failed call leaves nothing behind.""" + sentry_init( + integrations=[LiteLLMIntegration()], + disabled_integrations=[StdlibIntegration], + traces_sample_rate=1.0, + _experiments={"trace_lifecycle": "static"}, + ) + + with start_transaction(name="litellm test"): + success_kwargs = { + "model": "gpt-3.5-turbo", + "messages": [{"role": "user", "content": "hi"}], + "litellm_call_id": "success-call", + } + _input_callback(success_kwargs) + assert "_sentry_span" in success_kwargs + _success_callback( + success_kwargs, MockCompletionResponse(), datetime.now(), datetime.now() + ) + assert "_sentry_span" not in success_kwargs + + failure_kwargs = { + "model": "gpt-3.5-turbo", + "messages": [{"role": "user", "content": "hi"}], + "litellm_call_id": "failure-call", + } + _input_callback(failure_kwargs) + assert "_sentry_span" in failure_kwargs + _failure_callback( + failure_kwargs, ValueError("boom"), datetime.now(), datetime.now() + ) + assert "_sentry_span" not in failure_kwargs + + def test_litellm_message_truncation(sentry_init, capture_events): """Test that large messages are truncated properly in LiteLLM integration.""" sentry_init(