feat: unified tracing across dynamo + sglang (#4248)

fc161924 · ishandhanani · GitHub · 8559c2bd · fc161924 · fc161924
Unverified Commit fc161924 authored Dec 09, 2025 by ishandhanani Committed by GitHub Dec 09, 2025
11 changed files
--- a/components/src/dynamo/sglang/main.py
+++ b/components/src/dynamo/sglang/main.py
@@ -103,11 +103,8 @@ async def init(runtime: DistributedRuntime, config: Config):
    server_args, dynamo_args = config.server_args, config.dynamo_args
    # Prevent SGLang from blocking on non-leader nodes
-    # We can switch this to 0 and leverage our own metrics
-    # after https://github.com/sgl-project/sglang/pull/13686
-    # is merged in
    if server_args.node_rank >= 1:
-        os.environ["SGLANG_BLOCK_NONZERO_RANK_CHILDREN"] = "1"
+        os.environ["SGLANG_BLOCK_NONZERO_RANK_CHILDREN"] = "0"
    engine = sgl.Engine(server_args=server_args)
@@ -222,11 +219,8 @@ async def init_prefill(runtime: DistributedRuntime, config: Config):
    server_args, dynamo_args = config.server_args, config.dynamo_args
    # Prevent SGLang from blocking on non-leader nodes
-    # We can switch this to 0 and leverage our own metrics
-    # after https://github.com/sgl-project/sglang/pull/13686
-    # is merged in
    if server_args.node_rank >= 1:
-        os.environ["SGLANG_BLOCK_NONZERO_RANK_CHILDREN"] = "1"
+        os.environ["SGLANG_BLOCK_NONZERO_RANK_CHILDREN"] = "0"
    engine = sgl.Engine(server_args=server_args)

--- a/components/src/dynamo/sglang/request_handlers/handler_base.py
+++ b/components/src/dynamo/sglang/request_handlers/handler_base.py
@@ -2,6 +2,8 @@
 # SPDX-License-Identifier: Apache-2.0
 import asyncio
+import base64
+import json
 import logging
 import random
 import socket
@@ -10,6 +12,7 @@ from contextlib import asynccontextmanager
 from typing import Any, AsyncGenerator, Dict, Optional, Tuple
 import sglang as sgl
+from sglang.srt.tracing import trace as sglang_trace
 from sglang.srt.utils import get_local_ip_auto
 from dynamo._core import Client, Component, Context
@@ -49,6 +52,7 @@ class BaseWorkerHandler(ABC):
        self.prefill_client = prefill_client
        self.serving_mode = config.serving_mode
        self.skip_tokenizer_init = config.server_args.skip_tokenizer_init
+        self.enable_trace = config.server_args.enable_trace
    @abstractmethod
    async def generate(self, request: Dict[str, Any], context: Context):
@@ -117,6 +121,39 @@ class BaseWorkerHandler(ABC):
        return bootstrap_host, bootstrap_port
+    def _propagate_trace_context_to_sglang(
+        self, context: Context, bootstrap_room: int = 0
+    ):
+        """Propagate Dynamo's trace context to SGLang for distributed tracing. SGLang expects a certain
+        format derived by loooking at https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/tracing/trace.py
+        in the to_dict() method.
+        Args:
+            context: Dynamo Context object containing trace information.
+            bootstrap_room: Bootstrap room ID (0 for aggregated, actual room for disaggregated).
+        """
+        trace_id = context.trace_id
+        span_id = context.span_id
+        if not trace_id or not span_id:
+            return
+        # Build trace context for SGLang
+        trace_context = {
+            str(bootstrap_room): {
+                "root_span": {"traceparent": f"00-{trace_id}-{span_id}-01"},
+                "prev_span": {
+                    "span_id": int(span_id, 16),
+                    "trace_id": int(trace_id, 16),
+                },
+            }
+        }
+        # Encode and propagate
+        base64_context = base64.b64encode(
+            json.dumps(trace_context, ensure_ascii=False).encode("utf-8")
+        ).decode("utf-8")
+        sglang_trace.trace_set_remote_propagate_context(base64_context)
    async def _handle_cancellation(
        self, request_id_future: asyncio.Future, context: Context
    ):

--- a/components/src/dynamo/sglang/request_handlers/llm/decode_handler.py
+++ b/components/src/dynamo/sglang/request_handlers/llm/decode_handler.py
@@ -112,6 +112,7 @@ class DecodeWorkerHandler(BaseWorkerHandler):
            RuntimeError: If no bootstrap info received from prefill worker.
        """
        logging.debug(f"New Request ID: {context.id()}")
+        trace_id = context.trace_id
        sampling_params = self._build_sampling_params(request)
        input_param = self._get_input_param(request)
@@ -154,6 +155,11 @@ class DecodeWorkerHandler(BaseWorkerHandler):
            if not bootstrap_info:
                raise RuntimeError("No bootstrap info received from prefill worker")
+            if self.enable_trace:
+                self._propagate_trace_context_to_sglang(
+                    context, bootstrap_info["bootstrap_room"]
+                )
            decode = await self.engine.async_generate(
                **input_param,
                sampling_params=sampling_params,
@@ -161,6 +167,7 @@ class DecodeWorkerHandler(BaseWorkerHandler):
                bootstrap_host=bootstrap_info["bootstrap_host"],
                bootstrap_port=bootstrap_info["bootstrap_port"],
                bootstrap_room=bootstrap_info["bootstrap_room"],
+                rid=trace_id,
            )
            if self.skip_tokenizer_init:
@@ -170,10 +177,14 @@ class DecodeWorkerHandler(BaseWorkerHandler):
                async for out in self._process_text_stream(decode, context):
                    yield out
        else:
+            if self.enable_trace:
+                self._propagate_trace_context_to_sglang(context)
            agg = await self.engine.async_generate(
                **input_param,
                sampling_params=sampling_params,
                stream=True,
+                rid=trace_id,
            )
            if self.skip_tokenizer_init:
                async for out in self._process_token_stream(agg, context):

--- a/components/src/dynamo/sglang/request_handlers/llm/prefill_handler.py
+++ b/components/src/dynamo/sglang/request_handlers/llm/prefill_handler.py
@@ -64,6 +64,7 @@ class PrefillWorkerHandler(BaseWorkerHandler):
            Bootstrap info dict with host, port, and room for decode worker connection.
        """
        logging.debug(f"New Request ID: {context.id()}")
+        trace_id = context.trace_id
        bootstrap_room = self._generate_bootstrap_room()
        bootstrap_info = {
@@ -76,6 +77,10 @@ class PrefillWorkerHandler(BaseWorkerHandler):
        input_param = self._get_input_param(request["request"])
+        # Propagate trace context to SGLang
+        if self.enable_trace:
+            self._propagate_trace_context_to_sglang(context, bootstrap_room)
        results = await self.engine.async_generate(
            **input_param,
            sampling_params=request["sampling_params"],
@@ -83,6 +88,7 @@ class PrefillWorkerHandler(BaseWorkerHandler):
            bootstrap_host=self.bootstrap_host,
            bootstrap_port=self.bootstrap_port,
            bootstrap_room=bootstrap_room,
+            rid=trace_id,
        )
        task = asyncio.create_task(self._consume_results(results, context))

--- a/container/Dockerfile.sglang
+++ b/container/Dockerfile.sglang
@@ -66,7 +66,7 @@ ARG DEEPEP_COMMIT=9af0e0d0e74f3577af1979c9b9e1ac2cad0104ee
 ARG DEEPEP_GB_COMMIT=1b14ad661c7640137fcfe93cccb2694ede1220b0
 ARG CMAKE_BUILD_PARALLEL_LEVEL=2
 ARG SGL_KERNEL_VERSION=0.3.16.post5
-ARG SGLANG_COMMIT=0.5.4.post3
+ARG SGLANG_COMMIT=0.5.6
 ARG GDRCOPY_COMMIT=v2.4.4
 ARG NVSHMEM_VERSION=3.3.9
 ARG GRACE_BLACKWELL=false

--- a/deploy/observability/tempo.yaml
+++ b/deploy/observability/tempo.yaml
@@ -9,7 +9,7 @@ distributor:
    otlp:
      protocols:
        grpc:
-          endpoint: 0.0.0.0:4317
+          endpoint: 0.0.0.0:4317  # Receives from OTEL collector
        http:
          endpoint: 0.0.0.0:4318

--- a/examples/backends/sglang/launch/agg.sh
+++ b/examples/backends/sglang/launch/agg.sh
@@ -46,10 +46,12 @@ while [[ $# -gt 0 ]]; do
 done
 # Enable tracing if requested
+TRACE_ARGS=()
 if [ "$ENABLE_OTEL" = true ]; then
    export DYN_LOGGING_JSONL=true
    export OTEL_EXPORT_ENABLED=1
    export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT:-http://localhost:4317}
+    TRACE_ARGS+=(--enable-trace --otlp-traces-endpoint localhost:4317)
 fi
 # run ingress
@@ -59,7 +61,7 @@ python3 -m dynamo.frontend &
 DYNAMO_PID=$!
 # run worker with metrics enabled
-DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
+OTEL_SERVICE_NAME=dynamo-worker DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT:-8081} \
 python3 -m dynamo.sglang \
  --model-path "$MODEL" \
  --served-model-name "$MODEL" \
@@ -68,4 +70,5 @@ python3 -m dynamo.sglang \
  --trust-remote-code \
  --skip-tokenizer-init \
  --enable-metrics \
+  "${TRACE_ARGS[@]}" \
  "${EXTRA_ARGS[@]}"
--- a/examples/backends/sglang/launch/agg_embed.sh
+++ b/examples/backends/sglang/launch/agg_embed.sh
@@ -37,10 +37,12 @@ while [[ $# -gt 0 ]]; do
 done
 # Enable tracing if requested
+TRACE_ARGS=()
 if [ "$ENABLE_OTEL" = true ]; then
    export DYN_LOGGING_JSONL=true
    export OTEL_EXPORT_ENABLED=1
    export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT:-http://localhost:4317}
+    TRACE_ARGS+=(--enable-trace --otlp-traces-endpoint localhost:4317)
 fi
 # run ingress
@@ -59,4 +61,5 @@ python3 -m dynamo.sglang \
  --tp 1 \
  --trust-remote-code \
  --use-sglang-tokenizer \
-  --enable-metrics
+  --enable-metrics \
+  "${TRACE_ARGS[@]}"
--- a/examples/backends/sglang/launch/agg_router.sh
+++ b/examples/backends/sglang/launch/agg_router.sh
@@ -37,10 +37,12 @@ while [[ $# -gt 0 ]]; do
 done
 # Enable tracing if requested
+TRACE_ARGS=()
 if [ "$ENABLE_OTEL" = true ]; then
    export DYN_LOGGING_JSONL=true
    export OTEL_EXPORT_ENABLED=1
    export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT:-http://localhost:4317}
+    TRACE_ARGS+=(--enable-trace --otlp-traces-endpoint localhost:4317)
 fi
 # run ingress
@@ -58,7 +60,8 @@ python3 -m dynamo.sglang \
  --tp 1 \
  --trust-remote-code \
  --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5557"}' \
-  --enable-metrics &
+  --enable-metrics \
+  "${TRACE_ARGS[@]}" &
 WORKER_PID=$!
 OTEL_SERVICE_NAME=dynamo-worker-2 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT_WORKER2:-8082} \
@@ -69,4 +72,5 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
  --tp 1 \
  --trust-remote-code \
  --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5558"}' \
-  --enable-metrics
+  --enable-metrics \
+  "${TRACE_ARGS[@]}"
--- a/examples/backends/sglang/launch/disagg.sh
+++ b/examples/backends/sglang/launch/disagg.sh
@@ -37,10 +37,12 @@ while [[ $# -gt 0 ]]; do
 done
 # Enable tracing if requested
+TRACE_ARGS=()
 if [ "$ENABLE_OTEL" = true ]; then
    export DYN_LOGGING_JSONL=true
    export OTEL_EXPORT_ENABLED=1
    export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT:-http://localhost:4317}
+    TRACE_ARGS+=(--enable-trace --otlp-traces-endpoint localhost:4317)
 fi
 # run ingress
@@ -65,7 +67,8 @@ python3 -m dynamo.sglang \
  --host 0.0.0.0 \
  --port 40000 \
  --disaggregation-transfer-backend nixl \
-  --enable-metrics --log-level debug &
+  --enable-metrics \
+  "${TRACE_ARGS[@]}" &
 PREFILL_PID=$!
 # run decode worker
@@ -81,4 +84,5 @@ CUDA_VISIBLE_DEVICES=2,3 python3 -m dynamo.sglang \
  --disaggregation-bootstrap-port 12345 \
  --host 0.0.0.0 \
  --disaggregation-transfer-backend nixl \
-  --enable-metrics --log-level debug
+  --enable-metrics \
+  "${TRACE_ARGS[@]}"
--- a/examples/backends/sglang/launch/disagg_router.sh
+++ b/examples/backends/sglang/launch/disagg_router.sh
@@ -38,10 +38,12 @@ while [[ $# -gt 0 ]]; do
 done
 # Enable tracing if requested
+TRACE_ARGS=()
 if [ "$ENABLE_OTEL" = true ]; then
    export DYN_LOGGING_JSONL=true
    export OTEL_EXPORT_ENABLED=1
    export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT:-http://localhost:4317}
+    TRACE_ARGS+=(--enable-trace --otlp-traces-endpoint localhost:4317)
 fi
 # run ingress
@@ -74,7 +76,8 @@ python3 -m dynamo.sglang \
  --host 0.0.0.0 \
  --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5557"}' \
  --disaggregation-transfer-backend nixl \
-  --enable-metrics &
+  --enable-metrics \
+  "${TRACE_ARGS[@]}" &
 PREFILL_PID=$!
 # run prefill worker
@@ -89,7 +92,8 @@ CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
  --host 0.0.0.0 \
  --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5558"}' \
  --disaggregation-transfer-backend nixl \
-  --enable-metrics &
+  --enable-metrics \
+  "${TRACE_ARGS[@]}" &
 PREFILL_PID=$!
 # run decode worker
@@ -104,7 +108,8 @@ CUDA_VISIBLE_DEVICES=3 python3 -m dynamo.sglang \
  --host 0.0.0.0 \
  --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5560"}' \
  --disaggregation-transfer-backend nixl \
-  --enable-metrics &
+  --enable-metrics \
+  "${TRACE_ARGS[@]}" &
 PREFILL_PID=$!
 # run decode worker
@@ -119,4 +124,5 @@ CUDA_VISIBLE_DEVICES=2 python3 -m dynamo.sglang \
  --host 0.0.0.0 \
  --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:5559"}' \
  --disaggregation-transfer-backend nixl \
-  --enable-metrics
+  --enable-metrics \
+  "${TRACE_ARGS[@]}"