Unverified Commit 8c129ed4 authored by jh-nv's avatar jh-nv Committed by GitHub
Browse files

feat: Propagate OTEL tracing context for trtllm (#5377)

parent 869562da
......@@ -9,10 +9,11 @@ Dynamo backends and components.
Submodules:
- endpoint_types: Endpoint type parsing utilities
- otel_tracing: OpenTelemetry tracing header utilities
- paths: Workspace directory detection and path utilities
- prometheus: Prometheus metrics collection and logging utilities
"""
from dynamo.common.utils import endpoint_types, paths, prometheus
from dynamo.common.utils import endpoint_types, otel_tracing, paths, prometheus
__all__ = ["endpoint_types", "paths", "prometheus"]
__all__ = ["endpoint_types", "otel_tracing", "paths", "prometheus"]
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""
OpenTelemetry tracing header utilities for Dynamo components.
"""
from dynamo._core import Context
def build_trace_headers(context: Context) -> dict[str, str] | None:
    """
    Build W3C Trace Context headers from a request context for propagation.

    Args:
        context: Request context. Only its ``trace_id`` and ``span_id``
            attributes are read.

    Returns:
        ``{"traceparent": "00-<trace_id>-<span_id>-01"}``, or ``None`` when
        either id is missing or is not a well-formed W3C identifier
        (32 hex digits for the trace id, 16 for the span id, not all zeros).
    """
    trace_id = context.trace_id
    span_id = context.span_id
    if not trace_id or not span_id:
        return None

    # The traceparent grammar only allows lowercase hex, so normalise first.
    trace_id = str(trace_id).lower()
    span_id = str(span_id).lower()

    hex_digits = frozenset("0123456789abcdef")

    def _is_valid_id(value: str, length: int) -> bool:
        # Exact length, hex only, and not the all-zero id the spec forbids.
        return (
            len(value) == length
            and hex_digits.issuperset(value)
            and value.strip("0") != ""
        )

    # A malformed traceparent is discarded by receivers anyway; skip emitting
    # it so callers get an unambiguous "no trace context" signal instead.
    if not _is_valid_id(trace_id, 32) or not _is_valid_id(span_id, 16):
        return None

    # W3C Trace Context format: {version}-{trace_id}-{parent_id}-{trace_flags}
    # version: 00, trace_flags: 01 (sampled)
    # TODO: properly propagate the trace-flags from current span.
    return {"traceparent": f"00-{trace_id}-{span_id}-01"}
......@@ -29,6 +29,7 @@ from tensorrt_llm.llmapi import DisaggregatedParams as LlmDisaggregatedParams
from tensorrt_llm.llmapi.llm import SamplingParams
from dynamo._core import Context
from dynamo.common.utils.otel_tracing import build_trace_headers
from dynamo.logits_processing.examples import HelloWorldLogitsProcessor
from dynamo.nixl_connect import Connector
from dynamo.runtime import DistributedRuntime
......@@ -366,6 +367,9 @@ class HandlerBase:
prefill_result.get("prompt_tokens_details") if prefill_result else None
)
# Build trace headers for distributed tracing
trace_headers = build_trace_headers(context)
try:
# NEW: Updated engine call to include multimodal data
generation_result = self.engine.llm.generate_async(
......@@ -373,6 +377,7 @@ class HandlerBase:
sampling_params=sampling_params,
disaggregated_params=disaggregated_params,
streaming=streaming,
trace_headers=trace_headers,
)
# Use the context manager to handle cancellation monitoring
......
......@@ -21,8 +21,8 @@ from vllm.outputs import RequestOutput
from vllm.sampling_params import SamplingParams, StructuredOutputsParams
from vllm.v1.engine.exceptions import EngineDeadError
from dynamo._core import Context
from dynamo.common.utils.input_params import InputParamManager
from dynamo.common.utils.otel_tracing import build_trace_headers
from dynamo.llm import (
ModelInput,
ModelType,
......@@ -965,20 +965,6 @@ class BaseWorkerHandler(ABC):
return log_probs if log_probs else None, top_logprobs if top_logprobs else None
def _build_trace_headers(self, context: Context) -> dict[str, str] | None:
    """
    Build trace headers from context for propagation to vLLM engine.

    Thin wrapper around the shared ``build_trace_headers`` helper imported
    from ``dynamo.common.utils.otel_tracing``. It is kept so that existing
    ``self._build_trace_headers(...)`` call sites continue to work while the
    header construction lives in one place.

    Args:
        context: Request context forwarded unchanged to the shared helper.

    Returns:
        Whatever ``build_trace_headers(context)`` returns.
    """
    return build_trace_headers(context)
@staticmethod
def _log_with_lora_context(
message: str,
......@@ -1203,7 +1189,7 @@ class DecodeWorkerHandler(BaseWorkerHandler):
dp_rank = request.get("dp_rank", None)
trace_headers = self._build_trace_headers(context)
trace_headers = build_trace_headers(context)
async with self._abort_monitor(context, request_id):
try:
......@@ -1249,7 +1235,7 @@ class DecodeWorkerHandler(BaseWorkerHandler):
openai_request_id = request.get("id") or request.get("request_id", request_id)
previous_text = ""
trace_headers = self._build_trace_headers(context)
trace_headers = build_trace_headers(context)
async with self._abort_monitor(context, request_id):
try:
......@@ -1411,7 +1397,7 @@ class PrefillWorkerHandler(BaseWorkerHandler):
dp_rank = request.get("dp_rank", None)
trace_headers = self._build_trace_headers(context)
trace_headers = build_trace_headers(context)
async with self._abort_monitor(context, request_id, is_prefill=True):
try:
......
......@@ -20,17 +20,51 @@ cleanup() {
}
trap cleanup EXIT INT TERM
# Parse the flags this script owns. Anything it does not recognise is
# collected and forwarded to the worker through the trailing "$@" on the
# dynamo.trtllm command line, instead of being rejected.
ENABLE_OTEL=false
EXTRA_ARGS=()
while [[ $# -gt 0 ]]; do
    case "$1" in
        --enable-otel)
            ENABLE_OTEL=true
            shift
            ;;
        -h|--help)
            echo "Usage: $0 [OPTIONS] [WORKER_ARGS...]"
            echo "Options:"
            echo " --enable-otel Enable OpenTelemetry tracing"
            echo " -h, --help Show this help message"
            echo " WORKER_ARGS Any other arguments are passed to the worker"
            echo ""
            exit 0
            ;;
        --)
            # Explicit end of script options: forward the rest verbatim.
            shift
            EXTRA_ARGS+=("$@")
            break
            ;;
        *)
            EXTRA_ARGS+=("$1")
            shift
            ;;
    esac
done
# Restore the unrecognised arguments as the positional parameters so that
# "$@" below still carries them. The ${arr[@]+...} form keeps an empty array
# from tripping `set -u` on older bash releases.
set -- ${EXTRA_ARGS[@]+"${EXTRA_ARGS[@]}"}

# Enable tracing if requested: export the OTEL settings for both processes
# and hand the OTLP endpoint to the engine via --override-engine-args.
TRACE_ARGS=()
if [ "$ENABLE_OTEL" = true ]; then
    export DYN_LOGGING_JSONL=true
    export OTEL_EXPORT_ENABLED=1
    export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT:-http://localhost:4317}
    TRACE_ARGS+=(--override-engine-args "{\"return_perf_metrics\": true, \"otlp_traces_endpoint\": \"${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT}\" }")
fi
# run frontend
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
OTEL_SERVICE_NAME=dynamo-frontend \
python3 -m dynamo.frontend &
DYNAMO_PID=$!
# run worker
# Additional command line args can be passed
OTEL_SERVICE_NAME=dynamo-worker \
python3 -m dynamo.trtllm \
--model-path "$MODEL_PATH" \
--served-model-name "$SERVED_MODEL_NAME" \
--modality "$MODALITY" \
--extra-engine-args "$AGG_ENGINE_ARGS" \
"${TRACE_ARGS[@]}" \
"$@"
......@@ -23,25 +23,59 @@ cleanup() {
}
trap cleanup EXIT INT TERM
# Command-line handling: --enable-otel switches tracing on, -h/--help prints
# usage and exits 0, and any other argument aborts with exit status 1.
ENABLE_OTEL=false
while (( $# > 0 )); do
    if [[ "$1" == "--enable-otel" ]]; then
        ENABLE_OTEL=true
        shift
    elif [[ "$1" == "-h" || "$1" == "--help" ]]; then
        printf '%s\n' \
            "Usage: $0 [OPTIONS]" \
            "Options:" \
            " --enable-otel Enable OpenTelemetry tracing" \
            " -h, --help Show this help message" \
            ""
        exit 0
    else
        printf '%s\n' \
            "Unknown option: $1" \
            "Use --help for usage information"
        exit 1
    fi
done

# Tracing setup. TRACE_ARGS stays empty unless tracing was requested; when it
# was, the OTEL variables are exported and the engine override carrying the
# OTLP endpoint is queued for the worker command lines.
TRACE_ARGS=()
if [[ "$ENABLE_OTEL" == true ]]; then
    export DYN_LOGGING_JSONL=true OTEL_EXPORT_ENABLED=1
    export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT="${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT:-http://localhost:4317}"
    TRACE_ARGS+=(
        --override-engine-args
        "{\"return_perf_metrics\": true, \"otlp_traces_endpoint\": \"${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT}\" }"
    )
fi
# run frontend
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
OTEL_SERVICE_NAME=dynamo-frontend \
python3 -m dynamo.frontend &
DYNAMO_PID=$!
# run prefill worker
CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
OTEL_SERVICE_NAME=dynamo-worker-prefill CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
--model-path "$MODEL_PATH" \
--served-model-name "$SERVED_MODEL_NAME" \
--extra-engine-args "$PREFILL_ENGINE_ARGS" \
--modality "$MODALITY" \
--disaggregation-mode prefill &
--disaggregation-mode prefill \
"${TRACE_ARGS[@]}" &
PREFILL_PID=$!
# run decode worker
CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
OTEL_SERVICE_NAME=dynamo-worker-decode CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
--model-path "$MODEL_PATH" \
--served-model-name "$SERVED_MODEL_NAME" \
--extra-engine-args "$DECODE_ENGINE_ARGS" \
--modality "$MODALITY" \
--disaggregation-mode decode
--disaggregation-mode decode \
"${TRACE_ARGS[@]}"
......@@ -46,13 +46,45 @@ cleanup() {
}
trap cleanup EXIT INT TERM
# Flag parsing. Recognised: --enable-otel (turn tracing on) and -h/--help
# (print usage, exit 0). Every other argument is an error (exit 1).
ENABLE_OTEL=false
until [[ $# -eq 0 ]]; do
    case "$1" in
        --help|-h)
            printf '%s\n' "Usage: $0 [OPTIONS]" "Options:"
            printf '%s\n' " --enable-otel Enable OpenTelemetry tracing"
            printf '%s\n' " -h, --help Show this help message"
            printf '\n'
            exit 0
            ;;
        --enable-otel)
            ENABLE_OTEL=true
            shift
            ;;
        *)
            printf '%s\n' "Unknown option: $1"
            printf '%s\n' "Use --help for usage information"
            exit 1
            ;;
    esac
done

# With tracing requested, export the OTEL environment for the child processes
# and queue the engine override that carries the OTLP endpoint; otherwise
# TRACE_ARGS is left as an empty array.
TRACE_ARGS=()
if [[ "$ENABLE_OTEL" == true ]]; then
    export DYN_LOGGING_JSONL=true
    export OTEL_EXPORT_ENABLED=1
    export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT="${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT:-http://localhost:4317}"
    TRACE_ARGS+=(--override-engine-args)
    TRACE_ARGS+=("{\"return_perf_metrics\": true, \"otlp_traces_endpoint\": \"${OTEL_EXPORTER_OTLP_TRACES_ENDPOINT}\" }")
fi
# run frontend
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
OTEL_SERVICE_NAME=dynamo-frontend \
python3 -m dynamo.frontend &
DYNAMO_PID=$!
# run prefill worker (shares GPU with decode)
OTEL_SERVICE_NAME=dynamo-worker-prefill \
CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES \
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
python3 -m dynamo.trtllm \
......@@ -61,10 +93,12 @@ python3 -m dynamo.trtllm \
--extra-engine-args "$PREFILL_ENGINE_ARGS" \
--modality "$MODALITY" \
--publish-events-and-metrics \
--disaggregation-mode prefill &
--disaggregation-mode prefill \
"${TRACE_ARGS[@]}" &
PREFILL_PID=$!
# run decode worker (shares GPU with prefill)
OTEL_SERVICE_NAME=dynamo-worker-decode \
CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES \
DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \
python3 -m dynamo.trtllm \
......@@ -73,5 +107,6 @@ python3 -m dynamo.trtllm \
--extra-engine-args "$DECODE_ENGINE_ARGS" \
--modality "$MODALITY" \
--publish-events-and-metrics \
--disaggregation-mode decode
--disaggregation-mode decode \
"${TRACE_ARGS[@]}"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment