Unverified Commit 734c0b8b authored by jh-nv's avatar jh-nv Committed by GitHub
Browse files

feat: Add Loki log aggregation, and enable unified OTLP ingestion for both...

feat: Add Loki log aggregation, and enable unified OTLP ingestion for both traces and logs through a single endpoint (#6974)
parent 96f3bdcc
......@@ -2172,6 +2172,7 @@ dependencies = [
"nuid",
"once_cell",
"opentelemetry",
"opentelemetry-appender-tracing",
"opentelemetry-otlp",
"opentelemetry_sdk",
"parking_lot",
......@@ -3251,7 +3252,7 @@ dependencies = [
"libc",
"percent-encoding",
"pin-project-lite",
"socket2 0.6.2",
"socket2 0.6.3",
"system-configuration 0.7.0",
"tokio",
"tower-service",
......@@ -4067,9 +4068,9 @@ checksum = "7a79a3332a6609480d7d0c9eab957bca6b455b91bb84e66d19f5ff66294b85b8"
[[package]]
name = "libc"
version = "0.2.182"
version = "0.2.183"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6800badb6cb2082ffd7b6a67e6125bb39f18782f793520caee8cb8846be06112"
checksum = "b5b646652bf6661599e1da8901b3b9522896f01e736bad5f723fe7a3a27f899d"
[[package]]
name = "libdynamo_llm"
......@@ -5254,6 +5255,18 @@ dependencies = [
"tracing",
]
[[package]]
name = "opentelemetry-appender-tracing"
version = "0.31.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ef6a1ac5ca3accf562b8c306fa8483c85f4390f768185ab775f242f7fe8fdcc2"
dependencies = [
"opentelemetry",
"tracing",
"tracing-core",
"tracing-subscriber",
]
[[package]]
name = "opentelemetry-http"
version = "0.31.0"
......@@ -5706,11 +5719,11 @@ dependencies = [
[[package]]
name = "proc-macro-crate"
version = "3.4.0"
version = "3.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "219cb19e96be00ab2e37d6e299658a0cfa83e52429179969b0f0121b4ac46983"
checksum = "e67ba7e9b2b56446f1d419b1d807906278ffa1a658a8a5d8a39dcb1f5a78614f"
dependencies = [
"toml_edit 0.23.10+spec-1.0.0",
"toml_edit 0.25.4+spec-1.1.0",
]
[[package]]
......@@ -6018,7 +6031,7 @@ dependencies = [
"quinn-udp",
"rustc-hash 2.1.1",
"rustls",
"socket2 0.6.2",
"socket2 0.6.3",
"thiserror 2.0.18",
"tokio",
"tracing",
......@@ -6027,9 +6040,9 @@ dependencies = [
[[package]]
name = "quinn-proto"
version = "0.11.13"
version = "0.11.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f1906b49b0c3bc04b5fe5d86a77925ae6524a19b816ae38ce1e426255f1d8a31"
checksum = "434b42fec591c96ef50e21e886936e66d3cc3f737104fdb9b737c40ffb94c098"
dependencies = [
"bytes",
"getrandom 0.3.4",
......@@ -6055,7 +6068,7 @@ dependencies = [
"cfg_aliases",
"libc",
"once_cell",
"socket2 0.6.2",
"socket2 0.6.3",
"tracing",
"windows-sys 0.60.2",
]
......@@ -7363,12 +7376,12 @@ dependencies = [
[[package]]
name = "socket2"
version = "0.6.2"
version = "0.6.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "86f4aa3ad99f2088c990dfa82d367e19cb29268ed67c574d10d0a4bfe71f07e0"
checksum = "3a766e1110788c36f4fa1c2b71b387a7815aa65f88ce0229841826633d93723e"
dependencies = [
"libc",
"windows-sys 0.60.2",
"windows-sys 0.61.2",
]
[[package]]
......@@ -7832,7 +7845,7 @@ dependencies = [
"parking_lot",
"pin-project-lite",
"signal-hook-registry",
"socket2 0.6.2",
"socket2 0.6.3",
"tokio-macros",
"tracing",
"windows-sys 0.61.2",
......@@ -7997,6 +8010,15 @@ dependencies = [
"serde_core",
]
[[package]]
name = "toml_datetime"
version = "1.0.0+spec-1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32c2555c699578a4f59f0cc68e5116c8d7cabbd45e1409b989d4be085b53f13e"
dependencies = [
"serde_core",
]
[[package]]
name = "toml_edit"
version = "0.22.27"
......@@ -8013,12 +8035,12 @@ dependencies = [
[[package]]
name = "toml_edit"
version = "0.23.10+spec-1.0.0"
version = "0.25.4+spec-1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "84c8b9f757e028cee9fa244aea147aab2a9ec09d5325a9b01e0a49730c2b5269"
checksum = "7193cbd0ce53dc966037f54351dbbcf0d5a642c7f0038c382ef9e677ce8c13f2"
dependencies = [
"indexmap 2.13.0",
"toml_datetime 0.7.5+spec-1.1.0",
"toml_datetime 1.0.0+spec-1.1.0",
"toml_parser",
"winnow",
]
......@@ -8116,7 +8138,7 @@ dependencies = [
"hyper-util",
"percent-encoding",
"pin-project",
"socket2 0.6.2",
"socket2 0.6.3",
"sync_wrapper 1.0.2",
"tokio",
"tokio-rustls",
......@@ -8672,9 +8694,9 @@ dependencies = [
[[package]]
name = "uuid"
version = "1.21.0"
version = "1.22.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b672338555252d43fd2240c714dc444b8c6fb0a5c5335e65a07bba7742735ddb"
checksum = "a68d3c8f01c0cfa54a75291d83601161799e4a89a39e0929f4b0354d88757a37"
dependencies = [
"getrandom 0.4.2",
"js-sys",
......@@ -8789,7 +8811,7 @@ dependencies = [
"reqwest 0.12.28",
"rmp-serde",
"serde_json",
"socket2 0.6.2",
"socket2 0.6.3",
"thiserror 2.0.18",
"tokio",
"tokio-stream",
......@@ -9397,9 +9419,9 @@ checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650"
[[package]]
name = "winnow"
version = "0.7.14"
version = "0.7.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a5364e9d77fcdeeaa6062ced926ee3381faa2ee02d3eb83a5c27a8825540829"
checksum = "df79d97927682d2fd8adb29682d1140b343be4ac0f08fd68b7765d9c059d3945"
dependencies = [
"memchr",
]
......@@ -9572,18 +9594,18 @@ dependencies = [
[[package]]
name = "zerocopy"
version = "0.8.40"
version = "0.8.42"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a789c6e490b576db9f7e6b6d661bcc9799f7c0ac8352f56ea20193b2681532e5"
checksum = "f2578b716f8a7a858b7f02d5bd870c14bf4ddbbcf3a4c05414ba6503640505e3"
dependencies = [
"zerocopy-derive",
]
[[package]]
name = "zerocopy-derive"
version = "0.8.40"
version = "0.8.42"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f65c489a7071a749c849713807783f70672b28094011623e200cb86dcb835953"
checksum = "7e6cc098ea4d3bd6246687de65af3f920c430e236bee1e3bf2e441463f08a02f"
dependencies = [
"proc-macro2",
"quote",
......
......@@ -127,9 +127,10 @@ tracing-subscriber = { version = "0.3", features = [
"json",
] }
tracing-opentelemetry = { version = "0.32.0" }
opentelemetry = { version = "0.31.0", features = ["trace"] }
opentelemetry_sdk = { version = "0.31.0", features = ["trace", "rt-tokio"] }
opentelemetry-otlp = { version = "0.31.0", features = ["trace", "grpc-tonic"] }
opentelemetry = { version = "0.31.0", features = ["trace", "logs"] }
opentelemetry_sdk = { version = "0.31.0", features = ["trace", "logs", "rt-tokio"] }
opentelemetry-otlp = { version = "0.31.0", features = ["trace", "logs", "grpc-tonic"] }
opentelemetry-appender-tracing = { version = "0.31.0" }
validator = { version = "0.20.0", features = ["derive"] }
uuid = { version = "1.18.1", features = ["v4", "serde"] }
url = { version = "2.5", features = ["serde"] }
......
......@@ -17,6 +17,7 @@ networks:
volumes:
grafana-data:
tempo-data:
loki-data:
services:
# DCGM stands for Data Center GPU Manager: https://developer.nvidia.com/dcgm
......@@ -83,7 +84,21 @@ services:
- dcgm-exporter
- nats-prometheus-exporter
# Loki - Log aggregation backend
loki:
image: grafana/loki:3.5.0
command: [ "-config.file=/etc/loki.yaml" ]
user: root
volumes:
- ./observability/loki.yaml:/etc/loki.yaml
- loki-data:/loki
ports:
- "3100:3100" # Loki HTTP API (push/query)
networks:
- server
# Tempo - Distributed tracing backend
# Note: OTLP ports (4317/4318) are internal-only; the OTel Collector handles host-facing ingestion.
tempo:
image: grafana/tempo:2.8.2
command: [ "-config.file=/etc/tempo.yaml" ]
......@@ -92,11 +107,26 @@ services:
- ./observability/tempo.yaml:/etc/tempo.yaml
- tempo-data:/tmp/tempo
ports:
- "3200:3200" # Tempo HTTP
- "3200:3200" # Tempo HTTP (query API)
networks:
- server
# OpenTelemetry Collector - Unified ingestion point for traces and logs
# Dynamo services send OTLP to localhost:4317 (gRPC) or localhost:4318 (HTTP).
# The collector routes traces to Tempo and logs to Loki.
otel-collector:
image: otel/opentelemetry-collector:0.120.0
command: [ "--config=/etc/otel-collector.yaml" ]
volumes:
- ./observability/otel-collector.yaml:/etc/otel-collector.yaml
ports:
- "4317:4317" # OTLP gRPC receiver (accessible from host)
- "4318:4318" # OTLP HTTP receiver (accessible from host)
networks:
- server
depends_on:
- tempo
- loki
# Grafana - Visualization and dashboards
# Supports Prometheus (metrics), Tempo (tracing), and Loki (logs) datasources
......@@ -111,6 +141,7 @@ services:
- ./observability/grafana_dashboards:/etc/grafana/provisioning/dashboards
- ./observability/grafana-datasources.yml:/etc/grafana/provisioning/datasources/prometheus.yml
- ./observability/tempo-datasource.yml:/etc/grafana/provisioning/datasources/tempo.yml
- ./observability/loki-datasource.yml:/etc/grafana/provisioning/datasources/loki.yml
environment:
- GF_SERVER_HTTP_PORT=3000
# do not make it admin/admin, because you will be prompted to change the password every time
......@@ -134,4 +165,5 @@ services:
depends_on:
- prometheus
- tempo
- loki
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
apiVersion: 1
datasources:
- name: Loki
type: loki
access: proxy
url: http://loki:3100
uid: loki
isDefault: false
editable: true
jsonData:
maxLines: 1000
derivedFields:
- datasourceUid: tempo
matcherRegex: "trace_id"
matcherType: label
name: TraceID
url: "$${__value.raw}"
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
auth_enabled: false
server:
http_listen_port: 3100
common:
path_prefix: /loki
storage:
filesystem:
chunks_directory: /loki/chunks
rules_directory: /loki/rules
replication_factor: 1
ring:
kvstore:
store: inmemory
schema_config:
configs:
- from: "2024-01-01"
store: tsdb
object_store: filesystem
schema: v13
index:
prefix: index_
period: 24h
limits_config:
reject_old_samples: true
reject_old_samples_max_age: 168h
allow_structured_metadata: true
compactor:
working_directory: /loki/compactor
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# OpenTelemetry Collector configuration
# Receives OTLP signals and routes traces to Tempo, logs to Loki.
receivers:
otlp:
protocols:
grpc:
endpoint: 0.0.0.0:4317
http:
endpoint: 0.0.0.0:4318
exporters:
otlp/tempo:
endpoint: tempo:4317
tls:
insecure: true
otlphttp/loki:
endpoint: http://loki:3100/otlp
processors:
batch: {}
service:
pipelines:
traces:
receivers: [otlp]
processors: [batch]
exporters: [otlp/tempo]
logs:
receivers: [otlp]
processors: [batch]
exporters: [otlphttp/loki]
......@@ -15,6 +15,12 @@ datasources:
httpMethod: GET
serviceMap:
datasourceUid: tempo
tracesToLogsV2:
datasourceUid: loki
filterByTraceID: false
filterBySpanID: false
customQuery: true
query: '{service_name=~".+"} | trace_id = "$${__span.traceId}"'
search:
hide: false
nodeGraph:
......
......@@ -18,7 +18,7 @@ Install these on your machine:
### Starting the Observability Stack
Dynamo provides a Docker Compose-based observability stack that includes Prometheus, Grafana, Tempo, and various exporters for metrics, tracing, and visualization.
Dynamo provides a Docker Compose-based observability stack that includes Prometheus, Grafana, Tempo, Loki, an OpenTelemetry Collector, and various exporters for metrics, tracing, logging, and visualization.
From the Dynamo root directory:
......@@ -40,7 +40,7 @@ For detailed setup instructions and configuration, see [Prometheus + Grafana Set
| [Operator Metrics (Kubernetes)](../kubernetes/observability/operator-metrics.md) | Operator controller and webhook metrics for Kubernetes | N/A (configured via Helm) |
| [Health Checks](health-checks.md) | Component health monitoring and readiness probes | `DYN_SYSTEM_PORT`†, `DYN_SYSTEM_STARTING_HEALTH_STATUS`, `DYN_SYSTEM_HEALTH_PATH`, `DYN_SYSTEM_LIVE_PATH`, `DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS` |
| [Tracing](tracing.md) | Distributed tracing with OpenTelemetry and Tempo | `DYN_LOGGING_JSONL`†, `OTEL_EXPORT_ENABLED`†, `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT`†, `OTEL_SERVICE_NAME`† |
| [Logging](logging.md) | Structured logging configuration | `DYN_LOGGING_JSONL`†, `DYN_LOG`, `DYN_LOG_USE_LOCAL_TZ`, `DYN_LOGGING_CONFIG_PATH`, `OTEL_SERVICE_NAME`†, `OTEL_EXPORT_ENABLED`†, `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT`† |
| [Logging](logging.md) | Structured logging and OTLP log export to Loki | `DYN_LOGGING_JSONL`†, `DYN_LOG`, `DYN_LOG_USE_LOCAL_TZ`, `DYN_LOGGING_CONFIG_PATH`, `OTEL_SERVICE_NAME`†, `OTEL_EXPORT_ENABLED`†, `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT`†, `OTEL_EXPORTER_OTLP_LOGS_ENDPOINT` |
**Variables marked with † are shared across multiple observability systems.**
......@@ -64,6 +64,8 @@ This provides:
- **Prometheus** on `http://localhost:9090` - metrics collection and querying
- **Grafana** on `http://localhost:3000` - visualization dashboards (username: `dynamo`, password: `dynamo`)
- **Tempo** on `http://localhost:3200` - distributed tracing backend
- **Loki** on `http://localhost:3100` - log aggregation backend
- **OpenTelemetry Collector** on `http://localhost:4317` (gRPC) / `http://localhost:4318` (HTTP) - receives OTLP signals and routes traces to Tempo and logs to Loki
- **DCGM Exporter** on `http://localhost:9401/metrics` - GPU metrics
- **NATS Exporter** on `http://localhost:7777/metrics` - NATS messaging metrics
......@@ -79,7 +81,13 @@ graph TD
PROMETHEUS -->|:8000/metrics| DYNAMOFE[Dynamo HTTP FE :8000]
PROMETHEUS -->|:8081/metrics| DYNAMOBACKEND[Dynamo backend :8081]
DYNAMOFE --> DYNAMOBACKEND
DYNAMOFE -->|OTLP :4317| OTEL_COLLECTOR[OTel Collector :4317/:4318]
DYNAMOBACKEND -->|OTLP :4317| OTEL_COLLECTOR
OTEL_COLLECTOR -->|traces| TEMPO[Tempo :3200]
OTEL_COLLECTOR -->|logs| LOKI[Loki :3100]
GRAFANA -->|:9090/query API| PROMETHEUS
GRAFANA -->|:3200/query API| TEMPO
GRAFANA -->|:3100/query API| LOKI
end
```
......@@ -92,6 +100,9 @@ The following configuration files are located in the `deploy/observability/` dir
- [docker-observability.yml](https://github.com/ai-dynamo/dynamo/tree/main/deploy/docker-observability.yml): Defines Prometheus, Grafana, Tempo, Loki, the OpenTelemetry Collector, and exporters
- [prometheus.yml](https://github.com/ai-dynamo/dynamo/tree/main/deploy/observability/prometheus.yml): Contains Prometheus scraping configuration
- [grafana-datasources.yml](https://github.com/ai-dynamo/dynamo/tree/main/deploy/observability/grafana-datasources.yml): Contains Grafana datasource configuration
- [otel-collector.yaml](https://github.com/ai-dynamo/dynamo/tree/main/deploy/observability/otel-collector.yaml): OpenTelemetry Collector configuration (routes traces to Tempo, logs to Loki)
- [loki.yaml](https://github.com/ai-dynamo/dynamo/tree/main/deploy/observability/loki.yaml): Loki log aggregation configuration
- [loki-datasource.yml](https://github.com/ai-dynamo/dynamo/tree/main/deploy/observability/loki-datasource.yml): Grafana Loki datasource with trace ID linking to Tempo
- [grafana_dashboards/dashboard-providers.yml](https://github.com/ai-dynamo/dynamo/tree/main/deploy/observability/grafana_dashboards/dashboard-providers.yml): Contains Grafana dashboard provider configuration
- [grafana_dashboards/dynamo.json](https://github.com/ai-dynamo/dynamo/tree/main/deploy/observability/grafana_dashboards/dynamo.json): A general Dynamo Dashboard for both SW and HW metrics
- [grafana_dashboards/dcgm-metrics.json](https://github.com/ai-dynamo/dynamo/tree/main/deploy/observability/grafana_dashboards/dcgm-metrics.json): Contains Grafana dashboard configuration for DCGM GPU metrics
......
......@@ -24,14 +24,30 @@ enabled via the `DYN_LOGGING_SPAN_EVENTS` environment variable.
| `TLLM_LOG_LEVEL` | TensorRT-LLM backend log level (independent of `DYN_LOG`) | `INFO` | `DEBUG` |
| `DYN_SKIP_SGLANG_LOG_FORMATTING` | Disable Dynamo's SGLang log configuration | `false` | `true` |
| `OTEL_SERVICE_NAME` | Service name for trace and span information | `dynamo` | `dynamo-frontend` |
| `OTEL_EXPORT_ENABLED` | Enable OTLP trace exporting | `false` | `true` |
| `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` | OTLP exporter endpoint | `http://localhost:4317` | `http://tempo:4317` |
| `OTEL_EXPORT_ENABLED` | Enable OTLP export of both traces and logs | `false` | `true` |
| `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` | OTLP gRPC endpoint for traces | `http://localhost:4317` | `http://tempo:4317` |
| `OTEL_EXPORTER_OTLP_LOGS_ENDPOINT` | OTLP gRPC endpoint for logs (defaults to `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` if not set) | same as traces endpoint | `http://localhost:4317` |
## OTLP Log Export
When `OTEL_EXPORT_ENABLED=true`, Dynamo exports both **traces and logs** via OTLP. Logs are sent to an OpenTelemetry Collector which routes them to Grafana Loki for aggregation and querying.
By default, logs are exported to the same endpoint as traces (`OTEL_EXPORTER_OTLP_TRACES_ENDPOINT`). To send logs to a different endpoint, set `OTEL_EXPORTER_OTLP_LOGS_ENDPOINT`:
```bash
export OTEL_EXPORT_ENABLED=true
export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://localhost:4317
# Optional: send logs to a different endpoint
# export OTEL_EXPORTER_OTLP_LOGS_ENDPOINT=http://localhost:4317
```
The local observability stack (see [Getting Started](README.md#getting-started-quickly)) includes an OpenTelemetry Collector that receives OTLP on `localhost:4317` and routes traces to Tempo and logs to Loki. In Grafana, the Loki datasource is pre-configured with a derived field that links `trace_id` labels to Tempo, so you can jump directly from a log line to its corresponding trace.
## Getting Started Quickly
### Start Observability Stack
For collecting and visualizing logs with Grafana Loki (Kubernetes), or viewing trace context in logs alongside Grafana Tempo, start the observability stack. See [Observability Getting Started](README.md#getting-started-quickly) for instructions.
For collecting and visualizing logs with Grafana Loki, or viewing trace context in logs alongside Grafana Tempo, start the observability stack. See [Observability Getting Started](README.md#getting-started-quickly) for instructions. The stack includes Loki, an OpenTelemetry Collector, and Tempo — all pre-wired together.
### Enable Structured Logging
......@@ -313,7 +329,9 @@ command to set the SGLang engine's log level directly (e.g.
## Related Documentation
- [Distributed Tracing with Tempo](tracing.md)
- [Log Aggregation in Kubernetes](../kubernetes/observability/logging.md)
- [Observability Getting Started](README.md)
- [Distributed Runtime Architecture](../design-docs/distributed-runtime.md)
- [Dynamo Architecture Overview](../design-docs/architecture.md)
- [Backend Guide](../development/backend-guide.md)
- [Log Aggregation in Kubernetes](../kubernetes/observability/logging.md)
......@@ -10,6 +10,8 @@ Dynamo supports OpenTelemetry-based distributed tracing for visualizing request
**Requirements:** Set `DYN_LOGGING_JSONL=true` and `OTEL_EXPORT_ENABLED=true` to export traces to Tempo.
**Note:** When OTLP export is enabled, Dynamo exports both **traces and logs**. Traces are sent to Tempo and logs are sent to Loki (via the OpenTelemetry Collector). To send logs to a separate endpoint, set `OTEL_EXPORTER_OTLP_LOGS_ENDPOINT`; otherwise it defaults to the traces endpoint. See [Logging](logging.md#otlp-log-export) for details.
This guide covers single GPU demo setup using Docker Compose. For Kubernetes deployments, see [Kubernetes Deployment](#kubernetes-deployment).
**Note:** This section has overlap with [Logging of OpenTelemetry Tracing](logging.md) since OpenTelemetry has aspects of both logging and tracing. The tracing approach documented here is for persistent trace visualization and analysis. For short debugging sessions examining trace context directly in logs, see the [Logging](logging.md) guide.
......@@ -20,7 +22,8 @@ This guide covers single GPU demo setup using Docker Compose. For Kubernetes dep
|----------|-------------|---------|---------|
| `DYN_LOGGING_JSONL` | Enable JSONL logging format (required for tracing) | `false` | `true` |
| `OTEL_EXPORT_ENABLED` | Enable OTLP export of both traces and logs | `false` | `true` |
| `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` | OTLP gRPC endpoint for Tempo | `http://localhost:4317` | `http://tempo:4317` |
| `OTEL_EXPORTER_OTLP_TRACES_ENDPOINT` | OTLP gRPC endpoint for traces | `http://localhost:4317` | `http://tempo:4317` |
| `OTEL_EXPORTER_OTLP_LOGS_ENDPOINT` | OTLP gRPC endpoint for logs (defaults to traces endpoint) | same as traces | `http://localhost:4317` |
| `OTEL_SERVICE_NAME` | Service name for identifying components | `dynamo` | `dynamo-frontend` |
## Getting Started Quickly
......
......@@ -1720,6 +1720,7 @@ dependencies = [
"nuid",
"once_cell",
"opentelemetry",
"opentelemetry-appender-tracing",
"opentelemetry-otlp",
"opentelemetry_sdk",
"parking_lot",
......@@ -4355,6 +4356,18 @@ dependencies = [
"tracing",
]
[[package]]
name = "opentelemetry-appender-tracing"
version = "0.31.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ef6a1ac5ca3accf562b8c306fa8483c85f4390f768185ab775f242f7fe8fdcc2"
dependencies = [
"opentelemetry",
"tracing",
"tracing-core",
"tracing-subscriber",
]
[[package]]
name = "opentelemetry-http"
version = "0.31.0"
......
......@@ -1756,6 +1756,7 @@ dependencies = [
"nuid",
"once_cell",
"opentelemetry",
"opentelemetry-appender-tracing",
"opentelemetry-otlp",
"opentelemetry_sdk",
"parking_lot",
......@@ -4404,6 +4405,18 @@ dependencies = [
"tracing",
]
[[package]]
name = "opentelemetry-appender-tracing"
version = "0.31.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ef6a1ac5ca3accf562b8c306fa8483c85f4390f768185ab775f242f7fe8fdcc2"
dependencies = [
"opentelemetry",
"tracing",
"tracing-core",
"tracing-subscriber",
]
[[package]]
name = "opentelemetry-http"
version = "0.31.0"
......
......@@ -59,6 +59,7 @@ tracing-opentelemetry = { workspace = true }
opentelemetry = { workspace = true }
opentelemetry_sdk = { workspace = true }
opentelemetry-otlp = { workspace = true }
opentelemetry-appender-tracing = { workspace = true }
thiserror = { workspace = true }
uuid = { workspace = true }
url = { workspace = true }
......
......@@ -841,6 +841,7 @@ dependencies = [
"nuid",
"once_cell",
"opentelemetry",
"opentelemetry-appender-tracing",
"opentelemetry-otlp",
"opentelemetry_sdk",
"parking_lot",
......@@ -2218,6 +2219,18 @@ dependencies = [
"tracing",
]
[[package]]
name = "opentelemetry-appender-tracing"
version = "0.31.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ef6a1ac5ca3accf562b8c306fa8483c85f4390f768185ab775f242f7fe8fdcc2"
dependencies = [
"opentelemetry",
"tracing",
"tracing-core",
"tracing-subscriber",
]
[[package]]
name = "opentelemetry-http"
version = "0.31.0"
......
......@@ -42,16 +42,19 @@ pub mod logging {
/// Enable span event logging (create/close events)
pub const DYN_LOGGING_SPAN_EVENTS: &str = "DYN_LOGGING_SPAN_EVENTS";
/// OTLP (OpenTelemetry Protocol) tracing and logging configuration
///
/// Each constant holds the name of an environment variable; the constant's
/// value is the variable name itself.
pub mod otlp {
/// Enable OTLP export for traces and logs (set to "1" to enable)
// NOTE(review): the user-facing docs show `true` as the example value —
// confirm which values the parser accepts.
pub const OTEL_EXPORT_ENABLED: &str = "OTEL_EXPORT_ENABLED";
/// OTLP exporter endpoint URL for traces
/// Spec: https://opentelemetry.io/docs/specs/otel/protocol/exporter/
pub const OTEL_EXPORTER_OTLP_TRACES_ENDPOINT: &str = "OTEL_EXPORTER_OTLP_TRACES_ENDPOINT";
/// OTLP exporter endpoint URL for logs (defaults to traces endpoint if unset)
pub const OTEL_EXPORTER_OTLP_LOGS_ENDPOINT: &str = "OTEL_EXPORTER_OTLP_LOGS_ENDPOINT";
/// Service name for OTLP traces and logs
pub const OTEL_SERVICE_NAME: &str = "OTEL_SERVICE_NAME";
}
}
......@@ -408,6 +411,7 @@ mod tests {
logging::otlp::OTEL_EXPORT_ENABLED,
logging::otlp::OTEL_EXPORTER_OTLP_TRACES_ENDPOINT,
logging::otlp::OTEL_SERVICE_NAME,
logging::otlp::OTEL_EXPORTER_OTLP_LOGS_ENDPOINT,
// Runtime
runtime::DYN_RUNTIME_NUM_WORKER_THREADS,
runtime::DYN_RUNTIME_MAX_BLOCKING_THREADS,
......
......@@ -72,11 +72,13 @@ use uuid::Uuid;
use opentelemetry::propagation::{Extractor, Injector, TextMapPropagator};
use opentelemetry::trace::TraceContextExt;
use opentelemetry::{global, trace::Tracer};
use opentelemetry_appender_tracing::layer::OpenTelemetryTracingBridge;
use opentelemetry_otlp::WithExportConfig;
use opentelemetry::trace::TracerProvider as _;
use opentelemetry::{Key, KeyValue};
use opentelemetry_sdk::Resource;
use opentelemetry_sdk::logs::SdkLoggerProvider;
use opentelemetry_sdk::trace::SdkTracerProvider;
use tracing::error;
use tracing_subscriber::layer::SubscriberExt;
......@@ -902,6 +904,7 @@ fn setup_logging() -> Result<(), Box<dyn std::error::Error>> {
let fmt_filter_layer = filters(load_config());
let trace_filter_layer = filters(load_config());
let otel_filter_layer = filters(load_config());
let otel_logs_filter_layer = filters(load_config());
if jsonl_logging_enabled() {
let span_events = if span_events_enabled() {
......@@ -919,29 +922,46 @@ fn setup_logging() -> Result<(), Box<dyn std::error::Error>> {
// Create OpenTelemetry tracer - conditionally export to OTLP based on env var
let service_name = get_service_name();
// Build tracer provider - with or without OTLP export
let (tracer_provider, endpoint_opt) = if otlp_exporter_enabled() {
// Export enabled: create OTLP exporter with batch processor
let endpoint = std::env::var(env_logging::otlp::OTEL_EXPORTER_OTLP_TRACES_ENDPOINT)
// Build tracer and logger providers - with or without OTLP export
let (tracer_provider, logger_provider_opt, endpoint_opt) = if otlp_exporter_enabled() {
// Export enabled: create OTLP exporters with batch processors
let traces_endpoint =
std::env::var(env_logging::otlp::OTEL_EXPORTER_OTLP_TRACES_ENDPOINT)
.unwrap_or_else(|_| DEFAULT_OTLP_ENDPOINT.to_string());
let logs_endpoint = std::env::var(env_logging::otlp::OTEL_EXPORTER_OTLP_LOGS_ENDPOINT)
.unwrap_or_else(|_| traces_endpoint.clone());
// Initialize OTLP exporter using gRPC (Tonic)
let otlp_exporter = opentelemetry_otlp::SpanExporter::builder()
let resource = opentelemetry_sdk::Resource::builder_empty()
.with_service_name(service_name.clone())
.build();
// Initialize OTLP span exporter using gRPC (Tonic)
let span_exporter = opentelemetry_otlp::SpanExporter::builder()
.with_tonic()
.with_endpoint(&endpoint)
.with_endpoint(&traces_endpoint)
.build()?;
// Create tracer provider with batch exporter and service name
let provider = opentelemetry_sdk::trace::SdkTracerProvider::builder()
.with_batch_exporter(otlp_exporter)
.with_resource(
opentelemetry_sdk::Resource::builder_empty()
.with_service_name(service_name.clone())
.build(),
)
let tracer_provider = opentelemetry_sdk::trace::SdkTracerProvider::builder()
.with_batch_exporter(span_exporter)
.with_resource(resource.clone())
.build();
// Initialize OTLP log exporter using gRPC (Tonic)
let log_exporter = opentelemetry_otlp::LogExporter::builder()
.with_tonic()
.with_endpoint(&logs_endpoint)
.build()?;
let logger_provider = SdkLoggerProvider::builder()
.with_batch_exporter(log_exporter)
.with_resource(resource)
.build();
(provider, Some(endpoint))
(
tracer_provider,
Some(logger_provider),
Some(traces_endpoint),
)
} else {
// No export - traces generated locally only (for logging/trace IDs)
let provider = opentelemetry_sdk::trace::SdkTracerProvider::builder()
......@@ -952,18 +972,24 @@ fn setup_logging() -> Result<(), Box<dyn std::error::Error>> {
)
.build();
(provider, None)
(provider, None, None)
};
// Get a tracer from the provider
let tracer = tracer_provider.tracer(service_name.clone());
// Build the OTLP logs bridge layer (only when export is enabled)
let otel_logs_layer = logger_provider_opt
.as_ref()
.map(|lp| OpenTelemetryTracingBridge::new(lp).with_filter(otel_logs_filter_layer));
tracing_subscriber::registry()
.with(
tracing_opentelemetry::layer()
.with_tracer(tracer)
.with_filter(otel_filter_layer),
)
.with(otel_logs_layer)
.with(DistributedTraceIdLayer.with_filter(trace_filter_layer))
.with(l)
.init();
......@@ -973,7 +999,7 @@ fn setup_logging() -> Result<(), Box<dyn std::error::Error>> {
tracing::info!(
endpoint = %endpoint,
service = %service_name,
"OpenTelemetry OTLP export enabled"
"OpenTelemetry OTLP export enabled (traces and logs)"
);
} else {
tracing::info!(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment