Unverified commit ba51c683, authored by Keiven C and committed by GitHub
Browse files

fix: add prefill metrics support for TensorRT-LLM disaggregated mode (#3983)


Signed-off-by: Keiven Chang <keivenchang@users.noreply.github.com>
Co-authored-by: Keiven Chang <keivenchang@users.noreply.github.com>
parent 49ecfe60
...@@ -241,18 +241,19 @@ async def init(runtime: DistributedRuntime, config: Config): ...@@ -241,18 +241,19 @@ async def init(runtime: DistributedRuntime, config: Config):
if config.publish_events_and_metrics: if config.publish_events_and_metrics:
# 'event_buffer_max_size' is required to enable TRTLLM to publish kv cache events. # 'event_buffer_max_size' is required to enable TRTLLM to publish kv cache events.
# Convert KvCacheConfig object to dict and add the parameter # Add it to kv_cache_config while preserving cache_transceiver_config from YAML
current_kv_config = arg_map["kv_cache_config"] current_kv_config = arg_map["kv_cache_config"]
if isinstance(current_kv_config, KvCacheConfig): if isinstance(current_kv_config, KvCacheConfig):
# Convert KvCacheConfig object to dict (no cache_transceiver_config to preserve)
arg_map["kv_cache_config"] = { arg_map["kv_cache_config"] = {
"free_gpu_memory_fraction": config.free_gpu_memory_fraction, "free_gpu_memory_fraction": config.free_gpu_memory_fraction,
"event_buffer_max_size": DEFAULT_KV_EVENT_BUFFER_MAX_SIZE, "event_buffer_max_size": DEFAULT_KV_EVENT_BUFFER_MAX_SIZE,
} }
elif isinstance(current_kv_config, dict): elif isinstance(current_kv_config, dict):
if "event_buffer_max_size" not in current_kv_config: # Add event_buffer_max_size while preserving cache_transceiver_config and other YAML settings
current_kv_config[ current_kv_config[
"event_buffer_max_size" "event_buffer_max_size"
] = DEFAULT_KV_EVENT_BUFFER_MAX_SIZE ] = DEFAULT_KV_EVENT_BUFFER_MAX_SIZE
# Only pytorch backend is supported for now to publish events and metrics. # Only pytorch backend is supported for now to publish events and metrics.
if "backend" not in arg_map: if "backend" not in arg_map:
......
...@@ -46,4 +46,4 @@ CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \ ...@@ -46,4 +46,4 @@ CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
--extra-engine-args "$DECODE_ENGINE_ARGS" \ --extra-engine-args "$DECODE_ENGINE_ARGS" \
--disaggregation-strategy "$DISAGGREGATION_STRATEGY" \ --disaggregation-strategy "$DISAGGREGATION_STRATEGY" \
--modality "$MODALITY" \ --modality "$MODALITY" \
--disaggregation-mode decode --disaggregation-mode decode
\ No newline at end of file
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Disaggregated mode on single GPU - for testing only
# Both prefill and decode workers share the same GPU with reduced memory
# Check GPU memory availability.
# The script aborts unless at least REQUIRED_GB of GPU memory is free.
# REQUIRED_GB may be overridden from the environment; it defaults to 16.
REQUIRED_GB=${REQUIRED_GB:-16}

# Query free GPU memory (first element of torch.cuda.mem_get_info(), in bytes)
# and convert it to GiB. The result is a float and is kept for the messages below.
if ! FREE_GPU_GB=$(python3 -c "import torch; print(torch.cuda.mem_get_info()[0]/1024**3)" 2>/dev/null); then
    echo "Error: Failed to check GPU memory. Is PyTorch with CUDA available?" >&2
    exit 1
fi

# Truncate the float to an integer, because bash (( )) arithmetic only handles
# integers. The value is handed to Python as an argument rather than spliced
# into the Python source, so unexpected characters cannot alter the code run.
if ! FREE_GPU_INT=$(python3 -c "import sys; print(int(float(sys.argv[1])))" "$FREE_GPU_GB" 2>/dev/null); then
    echo "Error: Failed to parse GPU memory value." >&2
    exit 1
fi

if (( FREE_GPU_INT < REQUIRED_GB )); then
    echo "Error: Insufficient GPU memory. Required: ${REQUIRED_GB}GB, Available: ${FREE_GPU_GB}GB" >&2
    echo "Please free up GPU memory before running disaggregated mode on single GPU." >&2
    exit 1
fi

echo "GPU memory check passed: ${FREE_GPU_GB}GB available (required: ${REQUIRED_GB}GB)"
# Environment variables with defaults.
# Each variable keeps a value already present in the environment; the default
# is assigned only when the variable is unset or empty.
: "${DYNAMO_HOME:=/workspace}"
: "${MODEL_PATH:=Qwen/Qwen3-0.6B}"
: "${SERVED_MODEL_NAME:=Qwen/Qwen3-0.6B}"
: "${DISAGGREGATION_STRATEGY:=decode_first}"
# The engine-config defaults are derived from DYNAMO_HOME, so they must come
# after DYNAMO_HOME has been resolved.
: "${PREFILL_ENGINE_ARGS:=$DYNAMO_HOME/tests/serve/configs/trtllm/prefill.yaml}"
: "${DECODE_ENGINE_ARGS:=$DYNAMO_HOME/tests/serve/configs/trtllm/decode.yaml}"
: "${CUDA_VISIBLE_DEVICES:=0}"
: "${MODALITY:=text}"

# Export everything so the child processes started later inherit the values.
export DYNAMO_HOME MODEL_PATH SERVED_MODEL_NAME DISAGGREGATION_STRATEGY
export PREFILL_ENGINE_ARGS DECODE_ENGINE_ARGS CUDA_VISIBLE_DEVICES MODALITY
# Setup cleanup trap.
# cleanup stops the background processes whose PIDs are stored in DYNAMO_PID
# and PREFILL_PID, then waits for them so they do not outlive the script.
cleanup() {
    # Disarm the traps first: the handler is registered for EXIT as well as
    # INT/TERM, so after a signal handler returns the script would otherwise
    # reach EXIT and run this function a second time.
    trap - EXIT INT TERM
    echo "Cleaning up background processes..."
    # The PID variables are intentionally unquoted so that a PID that has not
    # been assigned yet simply disappears from the argument list.
    kill $DYNAMO_PID $PREFILL_PID 2>/dev/null || true
    wait $DYNAMO_PID $PREFILL_PID 2>/dev/null || true
    echo "Cleanup complete."
}
trap cleanup EXIT INT TERM
# Launch the frontend and both workers.
# The frontend and the prefill worker run in the background and their PIDs are
# recorded in DYNAMO_PID / PREFILL_PID; the decode worker runs in the foreground.

# run frontend
python3 -m dynamo.frontend --http-port 8000 &
DYNAMO_PID=$!

# Flags shared by both workers. They are split into a head and a tail around
# --extra-engine-args so each command line keeps the same argument order.
worker_args_head=(
    --model-path "$MODEL_PATH"
    --served-model-name "$SERVED_MODEL_NAME"
)
worker_args_tail=(
    --disaggregation-strategy "$DISAGGREGATION_STRATEGY"
    --modality "$MODALITY"
    --publish-events-and-metrics
)

# run prefill worker (shares GPU with decode), DYN_SYSTEM_PORT=8081
CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES \
DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8081 \
python3 -m dynamo.trtllm \
    "${worker_args_head[@]}" \
    --extra-engine-args "$PREFILL_ENGINE_ARGS" \
    "${worker_args_tail[@]}" \
    --disaggregation-mode prefill &
PREFILL_PID=$!

# run decode worker (shares GPU with prefill), DYN_SYSTEM_PORT=8082
CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES \
DYN_SYSTEM_ENABLED=true DYN_SYSTEM_PORT=8082 \
python3 -m dynamo.trtllm \
    "${worker_args_head[@]}" \
    --extra-engine-args "$DECODE_ENGINE_ARGS" \
    "${worker_args_tail[@]}" \
    --disaggregation-mode decode
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Memory-optimized config for single GPU deployment (FP16)
# This is for testing. Do not use this for production.
# How many models can fit?
# - RTX 4090 (24GB): 10x 0.6B, 6x 1B, 2x 3B
# - RTX 6000 Ada (48GB): 20x 0.6B, 8x 1.5B, 4x 3.5B, 2x 7B
# - A100 (40GB): 16x 0.6B, 4x 2.5B, 2x 6B
# - A100 (80GB): 32x 0.6B, 8x 3B, 4x 6B, 2x 12B
# - H100 (80GB): 32x 0.6B, 8x 3B, 4x 6B, 2x 12B
#
# For production (85% memory): RTX 6000 can handle 70x 0.6B, 4x 6B, 2x 12B, 1x 25B
# Top-level engine settings.
tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: false
max_num_tokens: 1024
max_batch_size: 4
trust_remote_code: true
backend: pytorch
enable_chunked_prefill: true

# free_gpu_memory_fraction belongs to kv_cache_config, so it is nested here
# rather than left as a top-level key.
kv_cache_config:
  free_gpu_memory_fraction: 0.24

# NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
# NOTE: overlap_scheduler enabled by default since this commit and changed
# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428

# This max_batch_size is nested under cuda_graph_config so it does not
# duplicate the top-level max_batch_size key.
cuda_graph_config:
  max_batch_size: 4
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Decode worker config for disaggregated mode (shares GPU with prefill worker)
# This is for testing. Do not use this for production.
# Top-level engine settings for the decode worker.
tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: false
max_num_tokens: 1024
trust_remote_code: true
backend: pytorch

# Enable chunked prefill to process large contexts in smaller chunks
enable_chunked_prefill: true

# Overlap scheduler enabled - decode workers can overlap multiple decode operations
disable_overlap_scheduler: false

cuda_graph_config:
  max_batch_size: 4

kv_cache_config:
  free_gpu_memory_fraction: 0.24

# Cache transceiver receives KV cache from prefill worker
# Required for disaggregated mode - decode worker needs KV cache from prefill
# 'backend' below is nested under cache_transceiver_config so it does not
# collide with the top-level 'backend: pytorch' key.
cache_transceiver_config:
  backend: DEFAULT
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Prefill worker config for disaggregated mode (shares GPU with decode worker)
# This is for testing. Do not use this for production.
# Top-level engine settings for the prefill worker.
tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: false
max_num_tokens: 1024
trust_remote_code: true
backend: pytorch

# Enable chunked prefill to process large contexts in smaller chunks
enable_chunked_prefill: true

# Disable overlap scheduler - prefill workers only handle context-only requests
# PyTorch backend does not support overlap for context-only requests
disable_overlap_scheduler: true

cuda_graph_config:
  max_batch_size: 4

kv_cache_config:
  free_gpu_memory_fraction: 0.24

# Cache transceiver enables KV cache transfer from prefill to decode worker
# Required for disaggregated mode - decode worker needs KV cache from prefill
# 'backend' below is nested under cache_transceiver_config so it does not
# collide with the top-level 'backend: pytorch' key.
cache_transceiver_config:
  backend: DEFAULT
...@@ -60,6 +60,20 @@ trtllm_configs = { ...@@ -60,6 +60,20 @@ trtllm_configs = {
completion_payload_default(), completion_payload_default(),
], ],
), ),
"disaggregated_same_gpu": TRTLLMConfig(
name="disaggregated_same_gpu",
directory=trtllm_dir,
script_name="disagg_same_gpu.sh",
marks=[pytest.mark.gpu_1, pytest.mark.trtllm_marker],
model="Qwen/Qwen3-0.6B",
models_port=8000,
request_payloads=[
chat_payload_default(),
completion_payload_default(),
metric_payload_default(port=8081, min_num_requests=6, backend="trtllm"),
metric_payload_default(port=8082, min_num_requests=6, backend="trtllm"),
],
),
"aggregated_router": TRTLLMConfig( "aggregated_router": TRTLLMConfig(
name="aggregated_router", name="aggregated_router",
directory=trtllm_dir, directory=trtllm_dir,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment