Unverified Commit 1f92dd54 authored by mohammedabdulwahhab's avatar mohammedabdulwahhab Committed by GitHub
Browse files

feat: OTEL Exporter and Tempo Visualization (#3307)


Signed-off-by: default avatarmohammedabdulwahhab <furkhan324@berkeley.edu>
parent 037cc35d
...@@ -1252,7 +1252,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" ...@@ -1252,7 +1252,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8030735ecb0d128428b64cd379809817e620a40e5001c54465b99ec5feec2857" checksum = "8030735ecb0d128428b64cd379809817e620a40e5001c54465b99ec5feec2857"
dependencies = [ dependencies = [
"futures-core", "futures-core",
"prost", "prost 0.13.5",
"prost-types", "prost-types",
"tonic 0.12.3", "tonic 0.12.3",
"tracing-core", "tracing-core",
...@@ -1271,7 +1271,7 @@ dependencies = [ ...@@ -1271,7 +1271,7 @@ dependencies = [
"hdrhistogram", "hdrhistogram",
"humantime", "humantime",
"hyper-util", "hyper-util",
"prost", "prost 0.13.5",
"prost-types", "prost-types",
"serde", "serde",
"serde_json", "serde_json",
...@@ -2176,7 +2176,7 @@ dependencies = [ ...@@ -2176,7 +2176,7 @@ dependencies = [
"parking_lot", "parking_lot",
"prometheus", "prometheus",
"proptest", "proptest",
"prost", "prost 0.13.5",
"rand 0.9.2", "rand 0.9.2",
"rayon", "rayon",
"regex", "regex",
...@@ -2295,6 +2295,9 @@ dependencies = [ ...@@ -2295,6 +2295,9 @@ dependencies = [
"nix 0.29.0", "nix 0.29.0",
"nuid", "nuid",
"once_cell", "once_cell",
"opentelemetry",
"opentelemetry-otlp",
"opentelemetry_sdk",
"prometheus", "prometheus",
"rand 0.9.2", "rand 0.9.2",
"rayon", "rayon",
...@@ -2314,6 +2317,7 @@ dependencies = [ ...@@ -2314,6 +2317,7 @@ dependencies = [
"tokio-util", "tokio-util",
"tower-http", "tower-http",
"tracing", "tracing",
"tracing-opentelemetry",
"tracing-subscriber", "tracing-subscriber",
"url", "url",
"uuid 1.18.1", "uuid 1.18.1",
...@@ -2519,7 +2523,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" ...@@ -2519,7 +2523,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "88365f1a5671eb2f7fc240adb216786bc6494b38ce15f1d26ad6eaa303d5e822" checksum = "88365f1a5671eb2f7fc240adb216786bc6494b38ce15f1d26ad6eaa303d5e822"
dependencies = [ dependencies = [
"http 1.3.1", "http 1.3.1",
"prost", "prost 0.13.5",
"tokio", "tokio",
"tokio-stream", "tokio-stream",
"tonic 0.13.1", "tonic 0.13.1",
...@@ -4881,7 +4885,7 @@ dependencies = [ ...@@ -4881,7 +4885,7 @@ dependencies = [
"colored", "colored",
"futures", "futures",
"modelexpress-common", "modelexpress-common",
"prost", "prost 0.13.5",
"serde", "serde",
"serde_json", "serde_json",
"thiserror 2.0.16", "thiserror 2.0.16",
...@@ -4904,7 +4908,7 @@ dependencies = [ ...@@ -4904,7 +4908,7 @@ dependencies = [
"config", "config",
"hf-hub", "hf-hub",
"jiff", "jiff",
"prost", "prost 0.13.5",
"serde", "serde",
"serde_json", "serde_json",
"serde_yaml", "serde_yaml",
...@@ -5485,6 +5489,82 @@ dependencies = [ ...@@ -5485,6 +5489,82 @@ dependencies = [
"vcpkg", "vcpkg",
] ]
[[package]]
name = "opentelemetry"
version = "0.31.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b84bcd6ae87133e903af7ef497404dda70c60d0ea14895fc8a5e6722754fc2a0"
dependencies = [
"futures-core",
"futures-sink",
"js-sys",
"pin-project-lite",
"thiserror 2.0.16",
"tracing",
]
[[package]]
name = "opentelemetry-http"
version = "0.31.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d7a6d09a73194e6b66df7c8f1b680f156d916a1a942abf2de06823dd02b7855d"
dependencies = [
"async-trait",
"bytes",
"http 1.3.1",
"opentelemetry",
"reqwest 0.12.23",
]
[[package]]
name = "opentelemetry-otlp"
version = "0.31.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a2366db2dca4d2ad033cad11e6ee42844fd727007af5ad04a1730f4cb8163bf"
dependencies = [
"http 1.3.1",
"opentelemetry",
"opentelemetry-http",
"opentelemetry-proto",
"opentelemetry_sdk",
"prost 0.14.1",
"reqwest 0.12.23",
"thiserror 2.0.16",
"tokio",
"tonic 0.14.2",
"tracing",
]
[[package]]
name = "opentelemetry-proto"
version = "0.31.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a7175df06de5eaee9909d4805a3d07e28bb752c34cab57fa9cff549da596b30f"
dependencies = [
"opentelemetry",
"opentelemetry_sdk",
"prost 0.14.1",
"tonic 0.14.2",
"tonic-prost",
]
[[package]]
name = "opentelemetry_sdk"
version = "0.31.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e14ae4f5991976fd48df6d843de219ca6d31b01daaab2dad5af2badeded372bd"
dependencies = [
"futures-channel",
"futures-executor",
"futures-util",
"opentelemetry",
"percent-encoding",
"rand 0.9.2",
"thiserror 2.0.16",
"tokio",
"tokio-stream",
]
[[package]] [[package]]
name = "option-ext" name = "option-ext"
version = "0.2.0" version = "0.2.0"
...@@ -5991,7 +6071,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" ...@@ -5991,7 +6071,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2796faa41db3ec313a31f7624d9286acf277b52de526150b7e69f3debf891ee5" checksum = "2796faa41db3ec313a31f7624d9286acf277b52de526150b7e69f3debf891ee5"
dependencies = [ dependencies = [
"bytes", "bytes",
"prost-derive", "prost-derive 0.13.5",
]
[[package]]
name = "prost"
version = "0.14.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7231bd9b3d3d33c86b58adbac74b5ec0ad9f496b19d22801d773636feaa95f3d"
dependencies = [
"bytes",
"prost-derive 0.14.1",
] ]
[[package]] [[package]]
...@@ -6007,7 +6097,7 @@ dependencies = [ ...@@ -6007,7 +6097,7 @@ dependencies = [
"once_cell", "once_cell",
"petgraph", "petgraph",
"prettyplease", "prettyplease",
"prost", "prost 0.13.5",
"prost-types", "prost-types",
"regex", "regex",
"syn 2.0.106", "syn 2.0.106",
...@@ -6027,13 +6117,26 @@ dependencies = [ ...@@ -6027,13 +6117,26 @@ dependencies = [
"syn 2.0.106", "syn 2.0.106",
] ]
[[package]]
name = "prost-derive"
version = "0.14.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9120690fafc389a67ba3803df527d0ec9cbbc9cc45e4cc20b332996dfb672425"
dependencies = [
"anyhow",
"itertools 0.14.0",
"proc-macro2",
"quote",
"syn 2.0.106",
]
[[package]] [[package]]
name = "prost-types" name = "prost-types"
version = "0.13.5" version = "0.13.5"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "52c2c1bf36ddb1a1c396b3601a3cec27c2462e45f07c386894ec3ccf5332bd16" checksum = "52c2c1bf36ddb1a1c396b3601a3cec27c2462e45f07c386894ec3ccf5332bd16"
dependencies = [ dependencies = [
"prost", "prost 0.13.5",
] ]
[[package]] [[package]]
...@@ -8622,7 +8725,7 @@ dependencies = [ ...@@ -8622,7 +8725,7 @@ dependencies = [
"hyper-util", "hyper-util",
"percent-encoding", "percent-encoding",
"pin-project", "pin-project",
"prost", "prost 0.13.5",
"socket2 0.5.10", "socket2 0.5.10",
"tokio", "tokio",
"tokio-stream", "tokio-stream",
...@@ -8651,7 +8754,7 @@ dependencies = [ ...@@ -8651,7 +8754,7 @@ dependencies = [
"hyper-util", "hyper-util",
"percent-encoding", "percent-encoding",
"pin-project", "pin-project",
"prost", "prost 0.13.5",
"socket2 0.5.10", "socket2 0.5.10",
"tokio", "tokio",
"tokio-rustls", "tokio-rustls",
...@@ -8662,6 +8765,32 @@ dependencies = [ ...@@ -8662,6 +8765,32 @@ dependencies = [
"tracing", "tracing",
] ]
[[package]]
name = "tonic"
version = "0.14.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eb7613188ce9f7df5bfe185db26c5814347d110db17920415cf2fbcad85e7203"
dependencies = [
"async-trait",
"base64 0.22.1",
"bytes",
"http 1.3.1",
"http-body 1.0.1",
"http-body-util",
"hyper 1.7.0",
"hyper-timeout",
"hyper-util",
"percent-encoding",
"pin-project",
"sync_wrapper 1.0.2",
"tokio",
"tokio-stream",
"tower 0.5.2",
"tower-layer",
"tower-service",
"tracing",
]
[[package]] [[package]]
name = "tonic-build" name = "tonic-build"
version = "0.13.1" version = "0.13.1"
...@@ -8676,6 +8805,17 @@ dependencies = [ ...@@ -8676,6 +8805,17 @@ dependencies = [
"syn 2.0.106", "syn 2.0.106",
] ]
[[package]]
name = "tonic-prost"
version = "0.14.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "66bd50ad6ce1252d87ef024b3d64fe4c3cf54a86fb9ef4c631fdd0ded7aeaa67"
dependencies = [
"bytes",
"prost 0.14.1",
"tonic 0.14.2",
]
[[package]] [[package]]
name = "tower" name = "tower"
version = "0.4.13" version = "0.4.13"
...@@ -8801,6 +8941,25 @@ dependencies = [ ...@@ -8801,6 +8941,25 @@ dependencies = [
"tracing-core", "tracing-core",
] ]
[[package]]
name = "tracing-opentelemetry"
version = "0.32.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e6e5658463dd88089aba75c7791e1d3120633b1bfde22478b28f625a9bb1b8e"
dependencies = [
"js-sys",
"opentelemetry",
"opentelemetry_sdk",
"rustversion",
"smallvec",
"thiserror 2.0.16",
"tracing",
"tracing-core",
"tracing-log",
"tracing-subscriber",
"web-time",
]
[[package]] [[package]]
name = "tracing-serde" name = "tracing-serde"
version = "0.2.0" version = "0.2.0"
......
...@@ -105,6 +105,10 @@ tracing-subscriber = { version = "0.3", features = [ ...@@ -105,6 +105,10 @@ tracing-subscriber = { version = "0.3", features = [
"local-time", "local-time",
"json", "json",
] } ] }
tracing-opentelemetry = { version = "0.32.0" }
opentelemetry = { version = "0.31.0", features = ["trace"] }
opentelemetry_sdk = { version = "0.31.0", features = ["trace", "rt-tokio"] }
opentelemetry-otlp = { version = "0.31.0", features = ["trace", "grpc-tonic"] }
validator = { version = "0.20.0", features = ["derive"] } validator = { version = "0.20.0", features = ["derive"] }
uuid = { version = "1.17", features = ["v4", "serde"] } uuid = { version = "1.17", features = ["v4", "serde"] }
url = { version = "2.5", features = ["serde"] } url = { version = "2.5", features = ["serde"] }
......
<!--
SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
SPDX-License-Identifier: Apache-2.0
-->
# Distributed Tracing with Tempo
This guide explains how to set up and view distributed traces in Grafana Tempo for Dynamo workloads.
## Overview
Dynamo supports OpenTelemetry-based distributed tracing, allowing you to visualize request flows across Frontend and Worker components. Traces are exported to Tempo via OTLP (OpenTelemetry Protocol) and visualized in Grafana.
## Prerequisites
- Docker and Docker Compose (for local deployment)
- Kubernetes cluster with kubectl access (for Kubernetes deployment)
- Dynamo runtime with tracing support
## Environment Variables
Dynamo's tracing is configured via environment variables. For complete logging documentation, see [docs/guides/logging.md](../../docs/guides/logging.md).
### Required Environment Variables
| Variable | Description | Example Value |
|----------|-------------|---------------|
| `DYN_LOGGING_JSONL` | Enable JSONL logging format (required for tracing) | `true` |
| `OTEL_EXPORT_ENABLED` | Enable OTLP trace export | `1` |
| `OTEL_EXPORT_ENDPOINT` | OTLP gRPC endpoint for Tempo | `http://localhost:4317` (local) or `http://tempo:4317` (docker) |
| `OTEL_SERVICE_NAME` | Service name for identifying components | `dynamo-frontend`, `dynamo-worker-prefill`, `dynamo-worker-decode` |
### Example Configuration
```bash
# Enable JSONL logging and tracing
export DYN_LOGGING_JSONL=true
# Enable trace export to Tempo
export OTEL_EXPORT_ENABLED=1
# Set the Tempo endpoint (docker-compose network)
export OTEL_EXPORT_ENDPOINT=http://tempo:4317
# Set service name to identify this component
export OTEL_SERVICE_NAME=dynamo-frontend
```
---
## Local Deployment with Docker Compose
### 1. Start Tempo and Grafana
From the repository root, change into the `deploy/tracing` directory and start the observability stack:
```bash
cd deploy/tracing
docker-compose up -d
```
This will start:
- **Tempo** on `http://localhost:3200` (HTTP API) and `localhost:4317` (OTLP gRPC)
- **Grafana** on `http://localhost:3000` (username: `admin`, password: `admin`)
Verify services are running:
```bash
docker-compose ps
```
### 2. Set Environment Variables
Configure Dynamo components to export traces:
```bash
# Enable JSONL logging and tracing
export DYN_LOGGING_JSONL=true
export OTEL_EXPORT_ENABLED=1
export OTEL_EXPORT_ENDPOINT=http://localhost:4317
# Set the service name for this component (use a different value for each component)
export OTEL_SERVICE_NAME=dynamo-frontend
```
### 3. Run vLLM Disaggregated Deployment
Run the vLLM disaggregated script with tracing enabled:
```bash
# Navigate to vLLM launch directory
cd components/backends/vllm/launch
# Run disaggregated deployment (modify the script to export env vars first)
./disagg.sh
```
**Note:** You may need to modify `disagg.sh` to export the tracing environment variables before starting each component:
```bash
#!/bin/bash
set -e
trap 'echo Cleaning up...; kill 0' EXIT
# Enable tracing
export DYN_LOGGING_JSONL=true
export OTEL_EXPORT_ENABLED=1
export OTEL_EXPORT_ENDPOINT=http://localhost:4317
# Run frontend
export OTEL_SERVICE_NAME=dynamo-frontend
python -m dynamo.frontend --router-mode kv --http-port=8000 &
# Run decode worker
export OTEL_SERVICE_NAME=dynamo-worker-decode
CUDA_VISIBLE_DEVICES=0 python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager &
# Run prefill worker
export OTEL_SERVICE_NAME=dynamo-worker-prefill
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.vllm \
--model Qwen/Qwen3-0.6B \
--enforce-eager \
--is-prefill-worker &
wait
```
### 4. Generate Traces
Send requests to the frontend to generate traces:
```bash
curl -d '{
"model": "Qwen/Qwen3-0.6B",
"max_completion_tokens": 100,
"messages": [
{"role": "user", "content": "What is the capital of France?"}
]
}' \
-H 'Content-Type: application/json' \
-H 'x-request-id: test-trace-001' \
http://localhost:8000/v1/chat/completions
```
### 5. View Traces in Grafana Tempo
1. Open Grafana at `http://localhost:3000`
2. Login with username `admin` and password `admin`
3. Navigate to **Explore** (compass icon in the left sidebar)
4. Select **Tempo** as the data source (should be selected by default)
5. Use the **Search** tab to find traces:
- Search by **Service Name** (e.g., `dynamo-frontend`)
- Search by **Span Name** (e.g., `http-request`, `handle_payload`)
- Search by **Tags** (e.g., `x_request_id=test-trace-001`)
6. Click on a trace to view its detailed span timeline (waterfall view)
#### Example Trace View
Below is an example of what a trace looks like in Grafana Tempo:
![Trace Example](./trace.png)
### 6. Stop Services
When done, stop the Tempo and Grafana stack:
```bash
cd deploy/tracing
docker-compose down
```
---
## Kubernetes Deployment
For Kubernetes deployments, ensure you have a Tempo instance deployed and accessible (e.g., `http://tempo.observability.svc.cluster.local:4317`).
### Modify DynamoGraphDeployment for Tracing
In your `DynamoGraphDeployment` (e.g., `components/backends/vllm/deploy/disagg.yaml`), add the common tracing environment variables at the top level and a service-specific `OTEL_SERVICE_NAME` under each component:
```yaml
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
name: vllm-disagg
spec:
# Common environment variables for all services
env:
- name: DYN_LOGGING_JSONL
value: "true"
- name: OTEL_EXPORT_ENABLED
value: "1"
- name: OTEL_EXPORT_ENDPOINT
value: "http://tempo.observability.svc.cluster.local:4317"
services:
Frontend:
# ... existing configuration ...
extraPodSpec:
mainContainer:
# ... existing configuration ...
env:
- name: OTEL_SERVICE_NAME
value: "dynamo-frontend"
VllmDecodeWorker:
# ... existing configuration ...
extraPodSpec:
mainContainer:
# ... existing configuration ...
env:
- name: OTEL_SERVICE_NAME
value: "dynamo-worker-decode"
VllmPrefillWorker:
# ... existing configuration ...
extraPodSpec:
mainContainer:
# ... existing configuration ...
env:
- name: OTEL_SERVICE_NAME
value: "dynamo-worker-prefill"
```
Apply the updated DynamoGraphDeployment:
```bash
kubectl apply -f components/backends/vllm/deploy/disagg.yaml
```
Traces will now be exported to Tempo and can be viewed in Grafana.
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
version: '3.8'
services:
# Tempo - Distributed tracing backend
tempo:
image: grafana/tempo:2.8.2
command: [ "-config.file=/etc/tempo.yaml" ]
volumes:
- ./tempo.yaml:/etc/tempo.yaml
- tempo-data:/tmp/tempo
ports:
- "3200:3200" # Tempo HTTP
- "4317:4317" # OTLP gRPC receiver (accessible from host)
- "4318:4318" # OTLP HTTP receiver (accessible from host)
# Grafana - Visualization and dashboards
grafana:
image: grafana/grafana:12.2.0
ports:
- "3000:3000"
environment:
- GF_SECURITY_ADMIN_PASSWORD=admin
- GF_FEATURE_TOGGLES_ENABLE=traceqlEditor
volumes:
- grafana-data:/var/lib/grafana
- ./grafana/provisioning:/etc/grafana/provisioning
depends_on:
- tempo
volumes:
tempo-data:
grafana-data:
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
apiVersion: 1
datasources:
- name: Tempo
type: tempo
access: proxy
url: http://tempo:3200
uid: tempo
isDefault: true
editable: true
jsonData:
httpMethod: GET
serviceMap:
datasourceUid: tempo
search:
hide: false
nodeGraph:
enabled: true
traceQuery:
timeShiftEnabled: true
spanStartTimeShift: 1h
spanEndTimeShift: 1h
spanBar:
type: Tag
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
server:
http_listen_port: 3200
distributor:
receivers:
otlp:
protocols:
grpc:
endpoint: 0.0.0.0:4317
http:
endpoint: 0.0.0.0:4318
ingester:
trace_idle_period: 10s
max_block_bytes: 1_000_000
max_block_duration: 5m
storage:
trace:
backend: local
block:
bloom_filter_false_positive: .05
wal:
path: /tmp/tempo/wal
local:
path: /tmp/tempo/blocks
compactor:
compaction:
compaction_window: 1h
max_compaction_objects: 1000000
block_retention: 1h
compacted_block_retention: 10m
File suppressed by a .gitattributes entry or the file's encoding is unsupported.
...@@ -32,6 +32,9 @@ distributed tracing. ...@@ -32,6 +32,9 @@ distributed tracing.
| `DYN_LOG_USE_LOCAL_TZ` | Use local timezone for logging timestamps (default: UTC) | `DYN_LOG_USE_LOCAL_TZ=1` | | `DYN_LOG_USE_LOCAL_TZ` | Use local timezone for logging timestamps (default: UTC) | `DYN_LOG_USE_LOCAL_TZ=1` |
| `DYN_LOG` | Log levels per target `<default_level>,<module_path>=<level>,<module_path>=<level>` | `DYN_LOG=info,dynamo_runtime::system_status_server=trace` | | `DYN_LOG` | Log levels per target `<default_level>,<module_path>=<level>,<module_path>=<level>` | `DYN_LOG=info,dynamo_runtime::system_status_server=trace` |
| `DYN_LOGGING_CONFIG_PATH` | Path to custom TOML logging configuration file | `DYN_LOGGING_CONFIG_PATH=/path/to/config.toml`| | `DYN_LOGGING_CONFIG_PATH` | Path to custom TOML logging configuration file | `DYN_LOGGING_CONFIG_PATH=/path/to/config.toml`|
| `OTEL_SERVICE_NAME` | Service name for OpenTelemetry traces (default: `dynamo`) | `OTEL_SERVICE_NAME=dynamo-frontend` |
| `OTEL_EXPORT_ENABLED` | Enable OTLP trace exporting (set to `1` to enable) | `OTEL_EXPORT_ENABLED=1` |
| `OTEL_EXPORT_ENDPOINT` | OTLP exporter endpoint (default: `http://localhost:4317`) | `OTEL_EXPORT_ENDPOINT=http://tempo:4317` |
## Available Logging Levels ## Available Logging Levels
...@@ -82,10 +85,49 @@ Resulting Log format: ...@@ -82,10 +85,49 @@ Resulting Log format:
{"time":"2025-09-02T15:53:31.943747Z","level":"INFO","target":"log","message":"Scheduler config values: {'max_num_seqs': 256, 'max_num_batched_tokens': 2048}","log.file":"/opt/dynamo/venv/lib/python3.12/site-packages/dynamo/vllm/main.py","log.line":268,"log.target":"main.get_engine_cache_info"} {"time":"2025-09-02T15:53:31.943747Z","level":"INFO","target":"log","message":"Scheduler config values: {'max_num_seqs': 256, 'max_num_batched_tokens': 2048}","log.file":"/opt/dynamo/venv/lib/python3.12/site-packages/dynamo/vllm/main.py","log.line":268,"log.target":"main.get_engine_cache_info"}
``` ```
## OpenTelemetry Distributed Tracing
When `DYN_LOGGING_JSONL` is enabled, Dynamo uses OpenTelemetry for distributed tracing. All logs include `trace_id` and `span_id` fields, and spans are automatically created for requests. By default, traces are **not exported**. To export traces to an observability backend (like Tempo, Jaeger, or Zipkin), set `OTEL_EXPORT_ENABLED=1`.
### Behavior
- **With `DYN_LOGGING_JSONL=true` only**: The OpenTelemetry layer is active and generates trace context and span IDs for all requests. Trace and span IDs appear in the logs, but no traces are exported.
- **With `OTEL_EXPORT_ENABLED=1` and `DYN_LOGGING_JSONL=true`**: Same as above, plus traces are exported to an OTLP collector for visualization.
### Configuration
To enable OTLP trace exporting:
1. Set `OTEL_EXPORT_ENABLED=1` to enable trace export
2. Optionally configure the endpoint using `OTEL_EXPORT_ENDPOINT` (default: `http://localhost:4317`)
3. Optionally set `OTEL_SERVICE_NAME` to identify the service (useful in Kubernetes, default: `dynamo`)
**Export Settings:**
- **Protocol**: gRPC (Tonic)
- **Service Name**: Value of `OTEL_SERVICE_NAME` env var, or `dynamo` if not set
- **Endpoint**: Value of `OTEL_EXPORT_ENDPOINT` env var, or `http://localhost:4317` if not set
### Example: JSONL Logging Only (No Export)
```bash
export DYN_LOGGING_JSONL=true
# OpenTelemetry is active, traces appear in logs, but nothing is exported
```
### Example: JSONL Logging + Trace Export to Tempo
```bash
export DYN_LOGGING_JSONL=true
export OTEL_EXPORT_ENABLED=1
export OTEL_EXPORT_ENDPOINT=http://tempo:4317
export OTEL_SERVICE_NAME=dynamo-frontend
# OpenTelemetry is active, traces appear in logs AND are exported to Tempo
```
## Trace and Span information ## Trace and Span information
When `DYN_LOGGING_JSONL` is enabled with `DYN_LOG` set to greater than or equal to When `DYN_LOGGING_JSONL` is enabled with `DYN_LOG` set to greater than or equal to
`info` level trace information is added to all logged spans along with `info` level, trace information is added to all logged spans along with
`SPAN_CREATED` and `SPAN_CLOSED` events. `SPAN_CREATED` and `SPAN_CLOSED` events.
### Example Request ### Example Request
......
...@@ -1502,7 +1502,7 @@ dependencies = [ ...@@ -1502,7 +1502,7 @@ dependencies = [
"oneshot", "oneshot",
"parking_lot", "parking_lot",
"prometheus", "prometheus",
"prost", "prost 0.13.5",
"rand 0.9.2", "rand 0.9.2",
"rayon", "rayon",
"regex", "regex",
...@@ -1520,7 +1520,7 @@ dependencies = [ ...@@ -1520,7 +1520,7 @@ dependencies = [
"tokio-util", "tokio-util",
"toktrie", "toktrie",
"toktrie_hf_tokenizers", "toktrie_hf_tokenizers",
"tonic", "tonic 0.13.1",
"tonic-build", "tonic-build",
"tower-http", "tower-http",
"tracing", "tracing",
...@@ -1619,6 +1619,9 @@ dependencies = [ ...@@ -1619,6 +1619,9 @@ dependencies = [
"nix 0.29.0", "nix 0.29.0",
"nuid", "nuid",
"once_cell", "once_cell",
"opentelemetry",
"opentelemetry-otlp",
"opentelemetry_sdk",
"prometheus", "prometheus",
"rand 0.9.2", "rand 0.9.2",
"rayon", "rayon",
...@@ -1633,6 +1636,7 @@ dependencies = [ ...@@ -1633,6 +1636,7 @@ dependencies = [
"tokio-util", "tokio-util",
"tower-http", "tower-http",
"tracing", "tracing",
"tracing-opentelemetry",
"tracing-subscriber", "tracing-subscriber",
"url", "url",
"uuid", "uuid",
...@@ -1792,10 +1796,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" ...@@ -1792,10 +1796,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "88365f1a5671eb2f7fc240adb216786bc6494b38ce15f1d26ad6eaa303d5e822" checksum = "88365f1a5671eb2f7fc240adb216786bc6494b38ce15f1d26ad6eaa303d5e822"
dependencies = [ dependencies = [
"http", "http",
"prost", "prost 0.13.5",
"tokio", "tokio",
"tokio-stream", "tokio-stream",
"tonic", "tonic 0.13.1",
"tonic-build", "tonic-build",
"tower", "tower",
"tower-service", "tower-service",
...@@ -3482,12 +3486,12 @@ dependencies = [ ...@@ -3482,12 +3486,12 @@ dependencies = [
"colored", "colored",
"futures", "futures",
"modelexpress-common", "modelexpress-common",
"prost", "prost 0.13.5",
"serde", "serde",
"serde_json", "serde_json",
"thiserror 2.0.16", "thiserror 2.0.16",
"tokio", "tokio",
"tonic", "tonic 0.13.1",
"tracing", "tracing",
"tracing-subscriber", "tracing-subscriber",
"uuid", "uuid",
...@@ -3505,13 +3509,13 @@ dependencies = [ ...@@ -3505,13 +3509,13 @@ dependencies = [
"config", "config",
"hf-hub", "hf-hub",
"jiff", "jiff",
"prost", "prost 0.13.5",
"serde", "serde",
"serde_json", "serde_json",
"serde_yaml", "serde_yaml",
"thiserror 2.0.16", "thiserror 2.0.16",
"tokio", "tokio",
"tonic", "tonic 0.13.1",
"tonic-build", "tonic-build",
"tracing", "tracing",
] ]
...@@ -3936,6 +3940,82 @@ version = "0.1.6" ...@@ -3936,6 +3940,82 @@ version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e" checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e"
[[package]]
name = "opentelemetry"
version = "0.31.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b84bcd6ae87133e903af7ef497404dda70c60d0ea14895fc8a5e6722754fc2a0"
dependencies = [
"futures-core",
"futures-sink",
"js-sys",
"pin-project-lite",
"thiserror 2.0.16",
"tracing",
]
[[package]]
name = "opentelemetry-http"
version = "0.31.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d7a6d09a73194e6b66df7c8f1b680f156d916a1a942abf2de06823dd02b7855d"
dependencies = [
"async-trait",
"bytes",
"http",
"opentelemetry",
"reqwest",
]
[[package]]
name = "opentelemetry-otlp"
version = "0.31.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a2366db2dca4d2ad033cad11e6ee42844fd727007af5ad04a1730f4cb8163bf"
dependencies = [
"http",
"opentelemetry",
"opentelemetry-http",
"opentelemetry-proto",
"opentelemetry_sdk",
"prost 0.14.1",
"reqwest",
"thiserror 2.0.16",
"tokio",
"tonic 0.14.2",
"tracing",
]
[[package]]
name = "opentelemetry-proto"
version = "0.31.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a7175df06de5eaee9909d4805a3d07e28bb752c34cab57fa9cff549da596b30f"
dependencies = [
"opentelemetry",
"opentelemetry_sdk",
"prost 0.14.1",
"tonic 0.14.2",
"tonic-prost",
]
[[package]]
name = "opentelemetry_sdk"
version = "0.31.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e14ae4f5991976fd48df6d843de219ca6d31b01daaab2dad5af2badeded372bd"
dependencies = [
"futures-channel",
"futures-executor",
"futures-util",
"opentelemetry",
"percent-encoding",
"rand 0.9.2",
"thiserror 2.0.16",
"tokio",
"tokio-stream",
]
[[package]] [[package]]
name = "option-ext" name = "option-ext"
version = "0.2.0" version = "0.2.0"
...@@ -4358,7 +4438,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" ...@@ -4358,7 +4438,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2796faa41db3ec313a31f7624d9286acf277b52de526150b7e69f3debf891ee5" checksum = "2796faa41db3ec313a31f7624d9286acf277b52de526150b7e69f3debf891ee5"
dependencies = [ dependencies = [
"bytes", "bytes",
"prost-derive", "prost-derive 0.13.5",
]
[[package]]
name = "prost"
version = "0.14.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7231bd9b3d3d33c86b58adbac74b5ec0ad9f496b19d22801d773636feaa95f3d"
dependencies = [
"bytes",
"prost-derive 0.14.1",
] ]
[[package]] [[package]]
...@@ -4374,7 +4464,7 @@ dependencies = [ ...@@ -4374,7 +4464,7 @@ dependencies = [
"once_cell", "once_cell",
"petgraph", "petgraph",
"prettyplease", "prettyplease",
"prost", "prost 0.13.5",
"prost-types", "prost-types",
"regex", "regex",
"syn 2.0.106", "syn 2.0.106",
...@@ -4394,13 +4484,26 @@ dependencies = [ ...@@ -4394,13 +4484,26 @@ dependencies = [
"syn 2.0.106", "syn 2.0.106",
] ]
[[package]]
name = "prost-derive"
version = "0.14.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9120690fafc389a67ba3803df527d0ec9cbbc9cc45e4cc20b332996dfb672425"
dependencies = [
"anyhow",
"itertools 0.14.0",
"proc-macro2",
"quote",
"syn 2.0.106",
]
[[package]] [[package]]
name = "prost-types" name = "prost-types"
version = "0.13.5" version = "0.13.5"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "52c2c1bf36ddb1a1c396b3601a3cec27c2462e45f07c386894ec3ccf5332bd16" checksum = "52c2c1bf36ddb1a1c396b3601a3cec27c2462e45f07c386894ec3ccf5332bd16"
dependencies = [ dependencies = [
"prost", "prost 0.13.5",
] ]
[[package]] [[package]]
...@@ -6299,7 +6402,7 @@ dependencies = [ ...@@ -6299,7 +6402,7 @@ dependencies = [
"hyper-util", "hyper-util",
"percent-encoding", "percent-encoding",
"pin-project", "pin-project",
"prost", "prost 0.13.5",
"socket2 0.5.10", "socket2 0.5.10",
"tokio", "tokio",
"tokio-rustls", "tokio-rustls",
...@@ -6310,6 +6413,32 @@ dependencies = [ ...@@ -6310,6 +6413,32 @@ dependencies = [
"tracing", "tracing",
] ]
[[package]]
name = "tonic"
version = "0.14.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eb7613188ce9f7df5bfe185db26c5814347d110db17920415cf2fbcad85e7203"
dependencies = [
"async-trait",
"base64 0.22.1",
"bytes",
"http",
"http-body",
"http-body-util",
"hyper",
"hyper-timeout",
"hyper-util",
"percent-encoding",
"pin-project",
"sync_wrapper",
"tokio",
"tokio-stream",
"tower",
"tower-layer",
"tower-service",
"tracing",
]
[[package]] [[package]]
name = "tonic-build" name = "tonic-build"
version = "0.13.1" version = "0.13.1"
...@@ -6324,6 +6453,17 @@ dependencies = [ ...@@ -6324,6 +6453,17 @@ dependencies = [
"syn 2.0.106", "syn 2.0.106",
] ]
[[package]]
name = "tonic-prost"
version = "0.14.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "66bd50ad6ce1252d87ef024b3d64fe4c3cf54a86fb9ef4c631fdd0ded7aeaa67"
dependencies = [
"bytes",
"prost 0.14.1",
"tonic 0.14.2",
]
[[package]] [[package]]
name = "tower" name = "tower"
version = "0.5.2" version = "0.5.2"
...@@ -6418,6 +6558,25 @@ dependencies = [ ...@@ -6418,6 +6558,25 @@ dependencies = [
"tracing-core", "tracing-core",
] ]
[[package]]
name = "tracing-opentelemetry"
version = "0.32.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e6e5658463dd88089aba75c7791e1d3120633b1bfde22478b28f625a9bb1b8e"
dependencies = [
"js-sys",
"opentelemetry",
"opentelemetry_sdk",
"rustversion",
"smallvec",
"thiserror 2.0.16",
"tracing",
"tracing-core",
"tracing-log",
"tracing-subscriber",
"web-time",
]
[[package]] [[package]]
name = "tracing-serde" name = "tracing-serde"
version = "0.2.0" version = "0.2.0"
......
...@@ -14,7 +14,7 @@ use std::path::PathBuf; ...@@ -14,7 +14,7 @@ use std::path::PathBuf;
use std::time::Duration; use std::time::Duration;
use std::{fmt::Display, sync::Arc}; use std::{fmt::Display, sync::Arc};
use tokio::sync::Mutex; use tokio::sync::Mutex;
use tracing::{Instrument, info_span}; use tracing::Instrument;
use dynamo_runtime::{ use dynamo_runtime::{
self as rs, logging, self as rs, logging,
...@@ -64,35 +64,14 @@ static INIT: OnceCell<()> = OnceCell::new(); ...@@ -64,35 +64,14 @@ static INIT: OnceCell<()> = OnceCell::new();
const DEFAULT_ANNOTATED_SETTING: Option<bool> = Some(true); const DEFAULT_ANNOTATED_SETTING: Option<bool> = Some(true);
// Builds the `client_request` span used to instrument a client call.
//
// A span is always produced. When a distributed trace context is supplied,
// the span also records the identifiers carried by that context; optional
// values that are absent are recorded as empty strings. Without a context,
// only `operation` and `request_id` are recorded.
fn create_client_span(
    operation: &str,
    request_id: &str,
    trace_context: Option<&dynamo_runtime::logging::DistributedTraceContext>,
) -> tracing::Span {
    match trace_context {
        // No trace context available: emit the minimal span.
        None => info_span!(
            "client_request",
            operation = operation,
            request_id = request_id,
        ),
        // Trace context available: attach its identifiers to the span.
        Some(ctx) => info_span!(
            "client_request",
            operation = operation,
            request_id = request_id,
            trace_id = ctx.trace_id.as_str(),
            parent_id = ctx.span_id.as_str(),
            x_request_id = ctx.x_request_id.as_deref().unwrap_or(""),
            x_dynamo_request_id = ctx.x_dynamo_request_id.as_deref().unwrap_or(""),
            tracestate = ctx.tracestate.as_deref().unwrap_or("")
        ),
    }
}
// Helper to get appropriate span for instrumentation - always emit spans // Helper to get appropriate span for instrumentation - always emit spans
fn get_span_for_context(context: &context::Context, operation: &str) -> tracing::Span { fn get_span_for_context(context: &context::Context, operation: &str) -> tracing::Span {
create_client_span(operation, context.inner().id(), context.trace_context()) logging::make_client_request_span(
operation,
context.inner().id(),
context.trace_context(),
None,
)
} }
// Helper to create span for direct method with instance_id // Helper to create span for direct method with instance_id
...@@ -101,26 +80,12 @@ fn get_span_for_direct_context( ...@@ -101,26 +80,12 @@ fn get_span_for_direct_context(
operation: &str, operation: &str,
instance_id: &str, instance_id: &str,
) -> tracing::Span { ) -> tracing::Span {
if let Some(trace_ctx) = context.trace_context() { logging::make_client_request_span(
info_span!( operation,
"client_request", context.inner().id(),
operation = operation, context.trace_context(),
request_id = context.inner().id(), Some(instance_id),
instance_id = instance_id, )
trace_id = trace_ctx.trace_id.as_str(),
parent_id = trace_ctx.span_id.as_str(),
x_request_id = trace_ctx.x_request_id.as_deref().unwrap_or(""),
x_dynamo_request_id = trace_ctx.x_dynamo_request_id.as_deref().unwrap_or(""),
tracestate = trace_ctx.tracestate.as_deref().unwrap_or("")
)
} else {
info_span!(
"client_request",
operation = operation,
request_id = context.inner().id(),
instance_id = instance_id,
)
}
} }
/// A Python module implemented in Rust. The name of this function must match /// A Python module implemented in Rust. The name of this function must match
...@@ -128,7 +93,6 @@ fn get_span_for_direct_context( ...@@ -128,7 +93,6 @@ fn get_span_for_direct_context(
/// import the module. /// import the module.
#[pymodule] #[pymodule]
fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> { fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> {
logging::init();
m.add_function(wrap_pyfunction!(llm::kv::compute_block_hash_for_seq_py, m)?)?; m.add_function(wrap_pyfunction!(llm::kv::compute_block_hash_for_seq_py, m)?)?;
m.add_function(wrap_pyfunction!(log_message, m)?)?; m.add_function(wrap_pyfunction!(log_message, m)?)?;
m.add_function(wrap_pyfunction!(register_llm, m)?)?; m.add_function(wrap_pyfunction!(register_llm, m)?)?;
...@@ -392,6 +356,12 @@ impl DistributedRuntime { ...@@ -392,6 +356,12 @@ impl DistributedRuntime {
let runtime = worker.runtime().clone(); let runtime = worker.runtime().clone();
// Initialize logging from inside a context where the Tokio runtime is available:
// the OTEL exporter requires a runtime context.
runtime.secondary().block_on(async {
rs::logging::init();
});
let inner = let inner =
if is_static { if is_static {
runtime.secondary().block_on( runtime.secondary().block_on(
......
...@@ -46,6 +46,10 @@ tokio-util = { workspace = true } ...@@ -46,6 +46,10 @@ tokio-util = { workspace = true }
tower-http = { workspace = true } tower-http = { workspace = true }
tracing = { workspace = true } tracing = { workspace = true }
tracing-subscriber = { workspace = true } tracing-subscriber = { workspace = true }
tracing-opentelemetry = { workspace = true }
opentelemetry = { workspace = true }
opentelemetry_sdk = { workspace = true }
opentelemetry-otlp = { workspace = true }
thiserror = { workspace = true } thiserror = { workspace = true }
uuid = { workspace = true } uuid = { workspace = true }
url = { workspace = true } url = { workspace = true }
......
This diff is collapsed.
...@@ -8,6 +8,7 @@ use tracing as log; ...@@ -8,6 +8,7 @@ use tracing as log;
use super::*; use super::*;
use crate::logging::DistributedTraceContext; use crate::logging::DistributedTraceContext;
use crate::logging::get_distributed_tracing_context; use crate::logging::get_distributed_tracing_context;
use crate::logging::inject_otel_context_into_nats_headers;
use crate::{Result, protocols::maybe_error::MaybeError}; use crate::{Result, protocols::maybe_error::MaybeError};
use tokio_stream::{StreamExt, StreamNotifyClose, wrappers::ReceiverStream}; use tokio_stream::{StreamExt, StreamNotifyClose, wrappers::ReceiverStream};
use tracing::Instrument; use tracing::Instrument;
...@@ -145,12 +146,13 @@ where ...@@ -145,12 +146,13 @@ where
// Enables span to be created in push_endpoint before // Enables span to be created in push_endpoint before
// payload is parsed // payload is parsed
// Prepare trace headers using the OpenTelemetry injector pattern
// This handles traceparent and tracestate headers according to W3C Trace Context standard
let mut headers = HeaderMap::new(); let mut headers = HeaderMap::new();
inject_otel_context_into_nats_headers(&mut headers, None);
// Add additional custom headers that aren't handled by the OpenTelemetry propagator
if let Some(trace_context) = get_distributed_tracing_context() { if let Some(trace_context) = get_distributed_tracing_context() {
headers.insert("traceparent", trace_context.create_traceparent());
if let Some(tracestate) = trace_context.tracestate {
headers.insert("tracestate", tracestate);
}
if let Some(x_request_id) = trace_context.x_request_id { if let Some(x_request_id) = trace_context.x_request_id {
headers.insert("x-request-id", x_request_id); headers.insert("x-request-id", x_request_id);
} }
......
...@@ -6,7 +6,7 @@ use std::sync::atomic::{AtomicU64, Ordering}; ...@@ -6,7 +6,7 @@ use std::sync::atomic::{AtomicU64, Ordering};
use super::*; use super::*;
use crate::SystemHealth; use crate::SystemHealth;
use crate::config::HealthStatus; use crate::config::HealthStatus;
use crate::logging::TraceParent; use crate::logging::make_handle_payload_span;
use crate::protocols::LeaseId; use crate::protocols::LeaseId;
use anyhow::Result; use anyhow::Result;
use async_nats::service::endpoint::Endpoint; use async_nats::service::endpoint::Endpoint;
...@@ -94,33 +94,23 @@ impl PushEndpoint { ...@@ -94,33 +94,23 @@ impl PushEndpoint {
let notify_clone = notify.clone(); let notify_clone = notify.clone();
// Handle headers here for tracing // Handle headers here for tracing
let span = if let Some(headers) = req.message.headers.as_ref() {
let mut traceparent = TraceParent::default(); make_handle_payload_span(
headers,
if let Some(headers) = req.message.headers.as_ref() { component_name.as_ref(),
traceparent = TraceParent::from_headers(headers); endpoint_name.as_ref(),
} namespace.as_ref(),
instance_id,
)
} else {
tracing::info_span!("handle_payload")
};
tokio::spawn(async move { tokio::spawn(async move {
tracing::trace!(instance_id, "handling new request"); tracing::trace!(instance_id, "handling new request");
let result = ingress let result = ingress
.handle_payload(req.message.payload) .handle_payload(req.message.payload)
.instrument( .instrument(span)
// Create span with trace ids as set
// in headers.
tracing::info_span!(
"handle_payload",
component = component_name.as_ref(),
endpoint = endpoint_name.as_ref(),
namespace = namespace.as_ref(),
instance_id = instance_id,
trace_id = traceparent.trace_id,
parent_id = traceparent.parent_id,
x_request_id = traceparent.x_request_id,
x_dynamo_request_id = traceparent.x_dynamo_request_id,
tracestate = traceparent.tracestate
),
)
.await; .await;
match result { match result {
Ok(_) => { Ok(_) => {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment