fix: add prefill metrics support for TensorRT-LLM disaggregated mode (#3983)

Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com> Co-authored-by: Keiven Chang <keivenchang@users.noreply.github.com>

fix: add prefill metrics support for TensorRT-LLM disaggregated mode (#3983)
Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com> Co-authored-by: Keiven Chang <keivenchang@users.noreply.github.com>
ba51c683 · Keiven C · GitHub · 49ecfe60 · ba51c683 · ba51c683
Unverified Commit ba51c683 authored Nov 04, 2025 by Keiven C Committed by GitHub Nov 04, 2025
7 changed files
--- a/components/src/dynamo/trtllm/main.py
+++ b/components/src/dynamo/trtllm/main.py
@@ -241,18 +241,19 @@ async def init(runtime: DistributedRuntime, config: Config):
    if config.publish_events_and_metrics:
        # 'event_buffer_max_size' is required to enable TRTLLM to publish kv cache events.
-        # Convert KvCacheConfig object to dict and add the parameter
+        # Add it to kv_cache_config while preserving cache_transceiver_config from YAML
        current_kv_config = arg_map["kv_cache_config"]
        if isinstance(current_kv_config, KvCacheConfig):
+            # Replace the KvCacheConfig object with a dict; only free_gpu_memory_fraction (taken from config) is carried over (no cache_transceiver_config to preserve)
            arg_map["kv_cache_config"] = {
                "free_gpu_memory_fraction": config.free_gpu_memory_fraction,
                "event_buffer_max_size": DEFAULT_KV_EVENT_BUFFER_MAX_SIZE,
            }
        elif isinstance(current_kv_config, dict):
-            if "event_buffer_max_size" not in current_kv_config:
+            # Always set event_buffer_max_size (overriding any value already present) while preserving cache_transceiver_config and other YAML settings
-                current_kv_config[
+            current_kv_config[
-                    "event_buffer_max_size"
+                "event_buffer_max_size"
-                ] = DEFAULT_KV_EVENT_BUFFER_MAX_SIZE
+            ] = DEFAULT_KV_EVENT_BUFFER_MAX_SIZE
        # Only pytorch backend is supported for now to publish events and metrics.
        if "backend" not in arg_map:

--- a/examples/backends/trtllm/launch/disagg.sh
+++ b/examples/backends/trtllm/launch/disagg.sh
@@ -46,4 +46,4 @@ CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
  --extra-engine-args  "$DECODE_ENGINE_ARGS" \
  --disaggregation-strategy "$DISAGGREGATION_STRATEGY" \
  --modality "$MODALITY" \
  --disaggregation-mode decode
\ No newline at end of file
--- a/examples/backends/trtllm/launch/disagg_same_gpu.sh
+++ b/examples/backends/trtllm/launch/disagg_same_gpu.sh
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+# Disaggregated mode on single GPU - for testing only
+# Both prefill and decode workers share the same GPU with reduced memory
+# Check GPU memory availability
+FREE_GPU_GB=$(python3 -c "import torch; print(torch.cuda.mem_get_info()[0]/1024**3)" 2>/dev/null)
+if [ $? -ne 0 ]; then
+    echo "Error: Failed to check GPU memory. Is PyTorch with CUDA available?"
+    exit 1
+fi
+REQUIRED_GB=16
+# Truncate the float to an integer with Python so the comparison below can use bash arithmetic (avoids depending on bc)
+FREE_GPU_INT=$(python3 -c "print(int(float('$FREE_GPU_GB')))" 2>/dev/null)
+if [ $? -ne 0 ]; then
+    echo "Error: Failed to parse GPU memory value."
+    exit 1
+fi
+if (( FREE_GPU_INT < REQUIRED_GB )); then
+    echo "Error: Insufficient GPU memory. Required: ${REQUIRED_GB}GB, Available: ${FREE_GPU_GB}GB"
+    echo "Please free up GPU memory before running disaggregated mode on single GPU."
+    exit 1
+fi
+echo "GPU memory check passed: ${FREE_GPU_GB}GB available (required: ${REQUIRED_GB}GB)"
+# Environment variables with defaults
+export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
+export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"}
+export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"}
+export DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"decode_first"}
+export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"$DYNAMO_HOME/tests/serve/configs/trtllm/prefill.yaml"}
+export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"$DYNAMO_HOME/tests/serve/configs/trtllm/decode.yaml"}
+export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-"0"}
+export MODALITY=${MODALITY:-"text"}
+# Setup cleanup trap
+cleanup() {
+    echo "Cleaning up background processes..."
+    kill $DYNAMO_PID $PREFILL_PID 2>/dev/null || true
+    wait $DYNAMO_PID $PREFILL_PID 2>/dev/null || true
+    echo "Cleanup complete."
+}
+trap cleanup EXIT INT TERM
+# run frontend
+python3 -m dynamo.frontend --http-port 8000 &
+DYNAMO_PID=$!
+# run prefill worker (shares GPU with decode)
+CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES \
+DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 \
+python3 -m dynamo.trtllm \
+  --model-path "$MODEL_PATH" \
+  --served-model-name "$SERVED_MODEL_NAME" \
+  --extra-engine-args  "$PREFILL_ENGINE_ARGS" \
+  --disaggregation-strategy "$DISAGGREGATION_STRATEGY" \
+  --modality "$MODALITY" \
+  --publish-events-and-metrics \
+  --disaggregation-mode prefill &
+PREFILL_PID=$!
+# run decode worker (shares GPU with prefill)
+CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES \
+DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8082 \
+python3 -m dynamo.trtllm \
+  --model-path "$MODEL_PATH" \
+  --served-model-name "$SERVED_MODEL_NAME" \
+  --extra-engine-args  "$DECODE_ENGINE_ARGS" \
+  --disaggregation-strategy "$DISAGGREGATION_STRATEGY" \
+  --modality "$MODALITY" \
+  --publish-events-and-metrics \
+  --disaggregation-mode decode
--- a/tests/serve/configs/trtllm/agg.yaml
+++ b/tests/serve/configs/trtllm/agg.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+# Memory-optimized config for single GPU deployment (FP16)
+# This is for testing. Do not use this for production.
+# How many models can fit?
+# - RTX 4090 (24GB):      10x 0.6B, 6x 1B, 2x 3B
+# - RTX 6000 Ada (48GB):  20x 0.6B, 8x 1.5B, 4x 3.5B, 2x 7B
+# - A100 (40GB):          16x 0.6B, 4x 2.5B, 2x 6B
+# - A100 (80GB):          32x 0.6B, 8x 3B, 4x 6B, 2x 12B
+# - H100 (80GB):          32x 0.6B, 8x 3B, 4x 6B, 2x 12B
+#
+# For production (85% memory): RTX 6000 can handle 70x 0.6B, 4x 6B, 2x 12B, 1x 25B
+tensor_parallel_size: 1
+moe_expert_parallel_size: 1
+enable_attention_dp: false
+max_num_tokens: 1024
+max_batch_size: 4
+trust_remote_code: true
+backend: pytorch
+enable_chunked_prefill: true
+kv_cache_config:
+  free_gpu_memory_fraction: 0.24
+# NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
+# NOTE: overlap_scheduler has been enabled by default since the commit below, which also
+# changed the config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
+# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
+cuda_graph_config:
+  max_batch_size: 4
--- a/tests/serve/configs/trtllm/decode.yaml
+++ b/tests/serve/configs/trtllm/decode.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+# Decode worker config for disaggregated mode (shares GPU with prefill worker)
+# This is for testing. Do not use this for production.
+tensor_parallel_size: 1
+moe_expert_parallel_size: 1
+enable_attention_dp: false
+max_num_tokens: 1024
+trust_remote_code: true
+backend: pytorch
+# Enable chunked prefill to process large contexts in smaller chunks
+enable_chunked_prefill: true
+# Overlap scheduler enabled - decode workers can overlap multiple decode operations
+disable_overlap_scheduler: false
+cuda_graph_config:
+  max_batch_size: 4
+kv_cache_config:
+  free_gpu_memory_fraction: 0.24
+# Cache transceiver receives KV cache from prefill worker
+# Required for disaggregated mode - decode worker needs KV cache from prefill
+cache_transceiver_config:
+  backend: DEFAULT
--- a/tests/serve/configs/trtllm/prefill.yaml
+++ b/tests/serve/configs/trtllm/prefill.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+# Prefill worker config for disaggregated mode (shares GPU with decode worker)
+# This is for testing. Do not use this for production.
+tensor_parallel_size: 1
+moe_expert_parallel_size: 1
+enable_attention_dp: false
+max_num_tokens: 1024
+trust_remote_code: true
+backend: pytorch
+# Enable chunked prefill to process large contexts in smaller chunks
+enable_chunked_prefill: true
+# Disable overlap scheduler - prefill workers only handle context-only requests
+# PyTorch backend does not support overlap for context-only requests
+disable_overlap_scheduler: true
+cuda_graph_config:
+  max_batch_size: 4
+kv_cache_config:
+  free_gpu_memory_fraction: 0.24
+# Cache transceiver enables KV cache transfer from prefill to decode worker
+# Required for disaggregated mode - decode worker needs KV cache from prefill
+cache_transceiver_config:
+  backend: DEFAULT
--- a/tests/serve/test_trtllm.py
+++ b/tests/serve/test_trtllm.py
@@ -60,6 +60,20 @@ trtllm_configs = {
            completion_payload_default(),
        ],
    ),
+    "disaggregated_same_gpu": TRTLLMConfig(
+        name="disaggregated_same_gpu",
+        directory=trtllm_dir,
+        script_name="disagg_same_gpu.sh",
+        marks=[pytest.mark.gpu_1, pytest.mark.trtllm_marker],
+        model="Qwen/Qwen3-0.6B",
+        models_port=8000,
+        request_payloads=[
+            chat_payload_default(),
+            completion_payload_default(),
+            metric_payload_default(port=8081, min_num_requests=6, backend="trtllm"),
+            metric_payload_default(port=8082, min_num_requests=6, backend="trtllm"),
+        ],
+    ),
    "aggregated_router": TRTLLMConfig(
        name="aggregated_router",
        directory=trtllm_dir,