From dbb21394654953e5236b6b658ca4a7bdb5ebafeb Mon Sep 17 00:00:00 2001
From: jgreer013 <18727435+jgreer013@users.noreply.github.com>
Date: Thu, 18 Jun 2026 10:29:00 -0700
Subject: [PATCH] fix(litellm): store bookkeeping span off-band, not in
 forwarded metadata

With LiteLLMIntegration enabled, a call passing caller `metadata` could crash
during request serialization. `_input_callback` stored the live Span in the
caller's `metadata` dict, and some providers (e.g. Anthropic's /v1/messages
passthrough) forward that dict into the outbound request body, so
`json.dumps(request_body)` raised `TypeError: Object of type Span is not JSON
serializable` before the request was sent. The span (holding the verbatim
prompt under send_default_pii) could also leak to the provider.

Stash the span on a top-level key of the per-request kwargs dict
(litellm's `model_call_details`) that litellm threads through the
input/success/failure callbacks, instead of in the forwarded `metadata`
sub-dict. This ties the span's lifetime to the request with no module-level
tracking, mirroring how the clickhouse/dramatiq integrations stash a span on
their per-request object. The Anthropic request body is built only from
recognized request params, not from `model_call_details`, so the span is
never serialized onto the wire (verified end-to-end against the passthrough).

Fixes #6596

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 sentry_sdk/integrations/litellm.py         |  45 ++++---
 tests/integrations/litellm/test_litellm.py | 133 +++++++++++++++++++++
 2 files changed, 163 insertions(+), 15 deletions(-)

diff --git a/sentry_sdk/integrations/litellm.py b/sentry_sdk/integrations/litellm.py
index 402676defa..d6c54cdf8c 100644
--- a/sentry_sdk/integrations/litellm.py
+++ b/sentry_sdk/integrations/litellm.py
@@ -31,16 +31,32 @@
     raise DidNotEnable("LiteLLM not installed")
 
 
-def _get_metadata_dict(kwargs: "Dict[str, Any]") -> "Dict[str, Any]":
-    """Get the metadata dictionary from the kwargs."""
-    litellm_params = kwargs.setdefault("litellm_params", {})
+# litellm threads the SAME `kwargs` dict (its per-request ``model_call_details``)
+# through the input, success, and failure callbacks, so the bookkeeping span is
+# stashed on it directly. This ties the span's lifetime to the request -- it is
+# freed when litellm releases the request -- with no module-level tracking.
+#
+# The span must NOT go in ``kwargs["litellm_params"]["metadata"]``: litellm
+# forwards that caller ``metadata`` dict into the outbound request body for some
+# providers (e.g. Anthropic's /v1/messages passthrough), which would break
+# ``json.dumps(request_body)`` and leak the span (and its prompt data) to the
+# provider. The Anthropic request body is built only from the recognized request
+# params, not from ``model_call_details``, so a top-level key here is not
+# forwarded. The top level of this dict is also where litellm keeps its own
+# per-request internal state (e.g. ``agentic_loop_params``).
+_SPAN_KEY = "_sentry_span"
 
-    # we need this weird little dance, as metadata might be set but may be None initially
-    metadata = litellm_params.get("metadata")
-    if metadata is None:
-        metadata = {}
-        litellm_params["metadata"] = metadata
-    return metadata
+
+def _store_span(kwargs: "Dict[str, Any]", span: "Any") -> None:
+    kwargs[_SPAN_KEY] = span
+
+
+def _peek_span(kwargs: "Dict[str, Any]") -> "Any":
+    return kwargs.get(_SPAN_KEY)
+
+
+def _pop_span(kwargs: "Dict[str, Any]") -> "Any":
+    return kwargs.pop(_SPAN_KEY, None)
 
 
 def _convert_message_parts(messages: "List[Dict[str, Any]]") -> "List[Dict[str, Any]]":
@@ -117,8 +133,8 @@ def _input_callback(kwargs: "Dict[str, Any]") -> None:
         )
         span.__enter__()
 
-    # Store span for later
-    _get_metadata_dict(kwargs)["_sentry_span"] = span
+    # Store span for later on top-level kwargs, not in the forwarded metadata.
+    _store_span(kwargs, span)
 
     # Set basic data
     set_data_normalized(span, SPANDATA.GEN_AI_SYSTEM, provider)
@@ -198,8 +214,7 @@ def _success_callback(
 ) -> None:
     """Handle successful completion."""
 
-    metadata = _get_metadata_dict(kwargs)
-    span = metadata.get("_sentry_span")
+    span = _peek_span(kwargs)
     if span is None:
         return
 
@@ -259,7 +274,7 @@ def _success_callback(
             or "complete_streaming_response" in kwargs
             or "async_complete_streaming_response" in kwargs
         ):
-            span = metadata.pop("_sentry_span", None)
+            span = _pop_span(kwargs)
             if span is not None:
                 span.__exit__(None, None, None)
 
@@ -285,7 +300,7 @@ def _failure_callback(
     end_time: "datetime",
 ) -> None:
     """Handle request failure."""
-    span = _get_metadata_dict(kwargs).get("_sentry_span")
+    span = _pop_span(kwargs)
     if span is None:
         return
 
diff --git a/tests/integrations/litellm/test_litellm.py b/tests/integrations/litellm/test_litellm.py
index 39e173049b..e7f0dbc960 100644
--- a/tests/integrations/litellm/test_litellm.py
+++ b/tests/integrations/litellm/test_litellm.py
@@ -2532,6 +2532,139 @@ def test_integration_setup(sentry_init):
     assert _failure_callback in (litellm.failure_callback or [])
 
 
+def test_caller_metadata_stays_json_serializable(
+    sentry_init,
+    capture_events,
+):
+    """Regression test for GH-6596.
+
+    litellm threads the caller's ``metadata`` dict into ``litellm_params`` and
+    some providers (e.g. Anthropic's ``/v1/messages`` passthrough) serialize it
+    into the outbound request body *before the response comes back*. The
+    integration must therefore never write its live ``Span`` into that dict, or
+    ``json.dumps(request_body)`` raises ``TypeError: Object of type Span is not
+    JSON serializable`` before the request is even sent.
+    """
+    sentry_init(
+        integrations=[LiteLLMIntegration()],
+        disabled_integrations=[StdlibIntegration],
+        traces_sample_rate=1.0,
+        send_default_pii=True,
+        _experiments={"trace_lifecycle": "static"},
+    )
+    events = capture_events()
+
+    # Mirror the kwargs litellm hands to its callbacks: the caller's metadata
+    # lives under litellm_params and is the very dict forwarded onto the wire.
+    caller_metadata = {"user_id": "my-org"}
+    kwargs = {
+        "model": "gpt-3.5-turbo",
+        "messages": [{"role": "user", "content": "Hello!"}],
+        "litellm_call_id": "call-6596",
+        "litellm_params": {"metadata": caller_metadata},
+    }
+
+    with start_transaction(name="litellm test"):
+        _input_callback(kwargs)
+
+        # litellm would serialize the request body here, while the span is live.
+        # The live span must not be in the forwarded metadata...
+        assert "_sentry_span" not in caller_metadata
+        # ...so the request body remains JSON-serializable.
+        json.dumps(caller_metadata)
+
+        # The span is still recorded off-band, so monitoring keeps working.
+        _success_callback(
+            kwargs, MockCompletionResponse(), datetime.now(), datetime.now()
+        )
+
+    (event,) = events
+    chat_spans = [
+        span
+        for span in event["spans"]
+        if span["op"] == OP.GEN_AI_CHAT and span["origin"] == "auto.ai.litellm"
+    ]
+    assert len(chat_spans) == 1
+
+
+def test_span_stashed_on_shared_kwargs_not_forwarded_metadata(sentry_init):
+    """The span is stashed on the shared kwargs dict (a top-level key litellm
+    does not forward), never in the caller's metadata, and each call keeps its
+    own span.
+    """
+    sentry_init(
+        integrations=[LiteLLMIntegration()],
+        disabled_integrations=[StdlibIntegration],
+        traces_sample_rate=1.0,
+        _experiments={"trace_lifecycle": "static"},
+    )
+
+    with start_transaction(name="litellm test"):
+        caller_metadata = {"user_id": "my-org"}
+        kwargs_a = {
+            "model": "gpt-3.5-turbo",
+            "messages": [{"role": "user", "content": "a"}],
+            "litellm_params": {"metadata": caller_metadata},
+        }
+        kwargs_b = {
+            "model": "gpt-3.5-turbo",
+            "messages": [{"role": "user", "content": "b"}],
+        }
+
+        _input_callback(kwargs_a)
+        _input_callback(kwargs_b)
+
+        # Stashed on the shared kwargs dict, off the forwarded metadata path...
+        assert kwargs_a["_sentry_span"] is not None
+        assert "_sentry_span" not in caller_metadata
+        json.dumps(caller_metadata)
+        # ...and each call keeps its own span (no cross-talk).
+        assert kwargs_a["_sentry_span"] is not kwargs_b["_sentry_span"]
+
+        _success_callback(
+            kwargs_a, MockCompletionResponse(), datetime.now(), datetime.now()
+        )
+        _success_callback(
+            kwargs_b, MockCompletionResponse(), datetime.now(), datetime.now()
+        )
+
+
+def test_span_cleaned_up_after_terminal_callbacks(sentry_init):
+    """Both terminal callbacks remove the span from the shared kwargs dict, so a
+    completed or failed call leaves nothing behind."""
+    sentry_init(
+        integrations=[LiteLLMIntegration()],
+        disabled_integrations=[StdlibIntegration],
+        traces_sample_rate=1.0,
+        _experiments={"trace_lifecycle": "static"},
+    )
+
+    with start_transaction(name="litellm test"):
+        success_kwargs = {
+            "model": "gpt-3.5-turbo",
+            "messages": [{"role": "user", "content": "hi"}],
+            "litellm_call_id": "success-call",
+        }
+        _input_callback(success_kwargs)
+        assert "_sentry_span" in success_kwargs
+        _success_callback(
+            success_kwargs, MockCompletionResponse(), datetime.now(), datetime.now()
+        )
+        assert "_sentry_span" not in success_kwargs
+
+        failure_kwargs = {
+            "model": "gpt-3.5-turbo",
+            "messages": [{"role": "user", "content": "hi"}],
+            "litellm_call_id": "failure-call",
+        }
+        _input_callback(failure_kwargs)
+        assert "_sentry_span" in failure_kwargs
+        _failure_callback(
+            failure_kwargs, ValueError("boom"), datetime.now(), datetime.now()
+        )
+        assert "_sentry_span" not in failure_kwargs
+
+
 def test_litellm_message_truncation(sentry_init, capture_events):
     """Test that large messages are truncated properly in LiteLLM integration."""
     sentry_init(