feat: Propagate OTEL tracing context for trtllm (#5377)

8c129ed4 · jh-nv · GitHub · 869562da · 8c129ed4 · 8c129ed4
Unverified commit 8c129ed4 · authored Jan 13, 2026 by jh-nv · committed by GitHub Jan 13, 2026
7 changed files
--- a/components/src/dynamo/common/utils/__init__.py
+++ b/components/src/dynamo/common/utils/__init__.py
@@ -9,10 +9,11 @@ Dynamo backends and components.

 Submodules:
    - endpoint_types: Endpoint type parsing utilities
+    - otel_tracing: OpenTelemetry tracing header utilities
    - paths: Workspace directory detection and path utilities
    - prometheus: Prometheus metrics collection and logging utilities
 """

-from dynamo.common.utils import endpoint_types, paths, prometheus
+from dynamo.common.utils import endpoint_types, otel_tracing, paths, prometheus

-__all__ = ["endpoint_types", "paths", "prometheus"]
+__all__ = ["endpoint_types", "otel_tracing", "paths", "prometheus"]
--- a/components/src/dynamo/common/utils/otel_tracing.py
+++ b/components/src/dynamo/common/utils/otel_tracing.py
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+OpenTelemetry tracing header utilities for Dynamo components.
+"""
+
+
+from dynamo._core import Context
+
+
+def build_trace_headers(context: Context) -> dict[str, str] | None:
+    """
+    Build trace headers from context for propagation.
+
+    Reads ``trace_id`` and ``span_id`` from ``context`` and formats them as a
+    W3C Trace Context ``traceparent`` header value.
+
+    Args:
+        context: Request context exposing ``trace_id`` and ``span_id``.
+
+    Returns:
+        A single-entry dict ``{"traceparent": "00-<trace_id>-<span_id>-01"}``,
+        or ``None`` when either id is falsy (missing or empty), in which case
+        there is nothing to propagate.
+
+    Note:
+        The ids are inserted verbatim; no validation of their format is done
+        here.
+    """
+    trace_id = context.trace_id
+    span_id = context.span_id
+    # Both ids are needed to form a traceparent value; otherwise skip tracing.
+    if not trace_id or not span_id:
+        return None
+
+    # W3C Trace Context format: {version}-{trace_id}-{parent_id}-{trace_flags}
+    # version: 00, trace_flags: 01 (sampled)
+    # The flags field is hard-coded, so every propagated header is marked
+    # sampled regardless of the sampling decision on the current span.
+    # TODO: properly propagate the trace-flags from current span.
+    return {"traceparent": f"00-{trace_id}-{span_id}-01"}
--- a/components/src/dynamo/trtllm/request_handlers/handler_base.py
+++ b/components/src/dynamo/trtllm/request_handlers/handler_base.py
@@ -29,6 +29,7 @@ from tensorrt_llm.llmapi import DisaggregatedParams as LlmDisaggregatedParams
 from tensorrt_llm.llmapi.llm import SamplingParams

 from dynamo._core import Context
+from dynamo.common.utils.otel_tracing import build_trace_headers
 from dynamo.logits_processing.examples import HelloWorldLogitsProcessor
 from dynamo.nixl_connect import Connector
 from dynamo.runtime import DistributedRuntime
@@ -366,6 +367,9 @@ class HandlerBase:
            prefill_result.get("prompt_tokens_details") if prefill_result else None
        )

+        # Build trace headers for distributed tracing
+        trace_headers = build_trace_headers(context)
+
        try:
            # NEW: Updated engine call to include multimodal data
            generation_result = self.engine.llm.generate_async(
@@ -373,6 +377,7 @@ class HandlerBase:
                sampling_params=sampling_params,
                disaggregated_params=disaggregated_params,
                streaming=streaming,
+                trace_headers=trace_headers,
            )

            # Use the context manager to handle cancellation monitoring

--- a/components/src/dynamo/vllm/handlers.py
+++ b/components/src/dynamo/vllm/handlers.py
@@ -21,8 +21,8 @@ from vllm.outputs import RequestOutput
 from vllm.sampling_params import SamplingParams, StructuredOutputsParams
 from vllm.v1.engine.exceptions import EngineDeadError

-from dynamo._core import Context
 from dynamo.common.utils.input_params import InputParamManager
+from dynamo.common.utils.otel_tracing import build_trace_headers
 from dynamo.llm import (
    ModelInput,
    ModelType,
@@ -965,20 +965,6 @@ class BaseWorkerHandler(ABC):

        return log_probs if log_probs else None, top_logprobs if top_logprobs else None

-    def _build_trace_headers(self, context: Context) -> dict[str, str] | None:
-        """
-        Build trace headers from context for propagation to vLLM engine.
-        """
-        trace_id = context.trace_id
-        span_id = context.span_id
-        if not trace_id or not span_id:
-            return None
-
-        # W3C Trace Context format: {version}-{trace_id}-{parent_id}-{trace_flags}
-        # version: 00, trace_flags: 01 (sampled)
-        # TODO: properly propagate the trace-flags from current span.
-        return {"traceparent": f"00-{trace_id}-{span_id}-01"}
-
    @staticmethod
    def _log_with_lora_context(
        message: str,
@@ -1203,7 +1189,7 @@ class DecodeWorkerHandler(BaseWorkerHandler):

        dp_rank = request.get("dp_rank", None)

-        trace_headers = self._build_trace_headers(context)
+        trace_headers = build_trace_headers(context)

        async with self._abort_monitor(context, request_id):
            try:
@@ -1249,7 +1235,7 @@ class DecodeWorkerHandler(BaseWorkerHandler):
        openai_request_id = request.get("id") or request.get("request_id", request_id)
        previous_text = ""

-        trace_headers = self._build_trace_headers(context)
+        trace_headers = build_trace_headers(context)

        async with self._abort_monitor(context, request_id):
            try:
@@ -1411,7 +1397,7 @@ class PrefillWorkerHandler(BaseWorkerHandler):

        dp_rank = request.get("dp_rank", None)

-        trace_headers = self._build_trace_headers(context)
+        trace_headers = build_trace_headers(context)

        async with self._abort_monitor(context, request_id, is_prefill=True):
            try:

--- a/examples/backends/trtllm/launch/agg.sh
+++ b/examples/backends/trtllm/launch/agg.sh
@@ -20,17 +20,51 @@ cleanup() {
 }
 trap cleanup EXIT INT TERM

+# Parse launcher options; stop at the first argument that is not one of ours.
+ENABLE_OTEL=false
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --enable-otel)
+            ENABLE_OTEL=true
+            shift
+            ;;
+        -h|--help)
+            echo "Usage: $0 [OPTIONS] [WORKER_ARGS...]"
+            echo "Options:"
+            echo "  --enable-otel        Enable OpenTelemetry tracing"
+            echo "  -h, --help           Show this help message"
+            echo ""
+            exit 0
+            ;;
+        *)
+            # Leave the rest in "$@" so it is forwarded to the worker below.
+            break
+            ;;
+    esac
+done
+
+TRACE_ARGS=()
+if [ "$ENABLE_OTEL" = true ]; then
+    export DYN_LOGGING_JSONL=true
+    export OTEL_EXPORT_ENABLED=1
+    export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT:-http://localhost:4317}
+    TRACE_ARGS+=(--override-engine-args "{\"return_perf_metrics\": true, \"otlp_traces_endpoint\": \"${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT}\" }")
+fi
+

 # run frontend
 # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
+OTEL_SERVICE_NAME=dynamo-frontend \
 python3 -m dynamo.frontend &
 DYNAMO_PID=$!

 # run worker
 # Additional command line args can be passed
+OTEL_SERVICE_NAME=dynamo-worker \
 python3 -m dynamo.trtllm \
  --model-path "$MODEL_PATH" \
  --served-model-name "$SERVED_MODEL_NAME" \
  --modality "$MODALITY" \
  --extra-engine-args "$AGG_ENGINE_ARGS" \
+  "${TRACE_ARGS[@]}" \
  "$@"
--- a/examples/backends/trtllm/launch/disagg.sh
+++ b/examples/backends/trtllm/launch/disagg.sh
@@ -23,25 +23,59 @@ cleanup() {
 }
 trap cleanup EXIT INT TERM

+ENABLE_OTEL=false
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --enable-otel)
+            ENABLE_OTEL=true
+            shift
+            ;;
+        -h|--help)
+            echo "Usage: $0 [OPTIONS]"
+            echo "Options:"
+            echo "  --enable-otel        Enable OpenTelemetry tracing"
+            echo "  -h, --help           Show this help message"
+            echo ""
+            exit 0
+            ;;
+        *)
+            echo "Unknown option: $1"
+            echo "Use --help for usage information"
+            exit 1
+            ;;
+    esac
+done
+
+# Enable tracing if requested
+TRACE_ARGS=()
+if [ "$ENABLE_OTEL" = true ]; then
+    export DYN_LOGGING_JSONL=true
+    export OTEL_EXPORT_ENABLED=1
+    export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT:-http://localhost:4317}
+    TRACE_ARGS+=(--override-engine-args "{\"return_perf_metrics\": true, \"otlp_traces_endpoint\": \"${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT}\" }")
+fi

 # run frontend
 # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
+OTEL_SERVICE_NAME=dynamo-frontend \
 python3 -m dynamo.frontend &
 DYNAMO_PID=$!

 # run prefill worker
-CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
+OTEL_SERVICE_NAME=dynamo-worker-prefill CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
  --model-path "$MODEL_PATH" \
  --served-model-name "$SERVED_MODEL_NAME" \
  --extra-engine-args  "$PREFILL_ENGINE_ARGS" \
  --modality "$MODALITY" \
-  --disaggregation-mode prefill &
+  --disaggregation-mode prefill \
+  "${TRACE_ARGS[@]}" &
 PREFILL_PID=$!

 # run decode worker
-CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
+OTEL_SERVICE_NAME=dynamo-worker-decode CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
  --model-path "$MODEL_PATH" \
  --served-model-name "$SERVED_MODEL_NAME" \
  --extra-engine-args  "$DECODE_ENGINE_ARGS" \
  --modality "$MODALITY" \
-  --disaggregation-mode decode
+  --disaggregation-mode decode \
+  "${TRACE_ARGS[@]}"
--- a/examples/backends/trtllm/launch/disagg_same_gpu.sh
+++ b/examples/backends/trtllm/launch/disagg_same_gpu.sh
@@ -46,13 +46,45 @@ cleanup() {
 }
 trap cleanup EXIT INT TERM

+ENABLE_OTEL=false
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --enable-otel)
+            ENABLE_OTEL=true
+            shift
+            ;;
+        -h|--help)
+            echo "Usage: $0 [OPTIONS]"
+            echo "Options:"
+            echo "  --enable-otel        Enable OpenTelemetry tracing"
+            echo "  -h, --help           Show this help message"
+            echo ""
+            exit 0
+            ;;
+        *)
+            echo "Unknown option: $1"
+            echo "Use --help for usage information"
+            exit 1
+            ;;
+    esac
+done

+# Enable tracing if requested
+TRACE_ARGS=()
+if [ "$ENABLE_OTEL" = true ]; then
+    export DYN_LOGGING_JSONL=true
+    export OTEL_EXPORT_ENABLED=1
+    export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT:-http://localhost:4317}
+    TRACE_ARGS+=(--override-engine-args "{\"return_perf_metrics\": true, \"otlp_traces_endpoint\": \"${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT}\" }")
+fi
 # run frontend
 # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
+OTEL_SERVICE_NAME=dynamo-frontend \
 python3 -m dynamo.frontend &
 DYNAMO_PID=$!

 # run prefill worker (shares GPU with decode)
+OTEL_SERVICE_NAME=dynamo-worker-prefill \
 CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES \
 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
 python3 -m dynamo.trtllm \
@@ -61,10 +93,12 @@ python3 -m dynamo.trtllm \
  --extra-engine-args  "$PREFILL_ENGINE_ARGS" \
  --modality "$MODALITY" \
  --publish-events-and-metrics \
-  --disaggregation-mode prefill &
+  --disaggregation-mode prefill \
+  "${TRACE_ARGS[@]}" &
 PREFILL_PID=$!

 # run decode worker (shares GPU with prefill)
+OTEL_SERVICE_NAME=dynamo-worker-decode \
 CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES \
 DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \
 python3 -m dynamo.trtllm \
@@ -73,5 +107,6 @@ python3 -m dynamo.trtllm \
  --extra-engine-args  "$DECODE_ENGINE_ARGS" \
  --modality "$MODALITY" \
  --publish-events-and-metrics \
-  --disaggregation-mode decode
+  --disaggregation-mode decode \
+  "${TRACE_ARGS[@]}"