"...git@developer.sourcefind.cn:2222/OpenDAS/vllm_cscc.git" did not exist on "8c6de96ea1e6e51e49a170c28ad3efc16db9413e"
Unverified Commit 86a4a58e authored by Tanmay Verma's avatar Tanmay Verma Committed by GitHub
Browse files

feat: Use health check and improve instructions for perf sweeps (#2423)

parent 6f7f6b12
......@@ -25,17 +25,18 @@ This directory contains scripts for benchmarking TensorRT-LLM performance with D
Please note that:
- These scripts have not undergone formal quality assurance testing
- They were executed on GB200 systems
- These scripts were tested on GB200 systems. To run all configurations, you will need at least 16 nodes, with each node equipped with 4 GPUs.
- They are intended for demonstration and educational purposes
- Use at your own risk in production environments
- Always review and test scripts thoroughly before running in your specific environment
- In disaggregated mode, launching worker processes with the `--exclusive` flag can impact runtime performance. Hence, these scripts specify the node list explicitly in the `srun` call.
- We are actively working on refining the configuration sweeps.
## Scripts Overview
### Core Scripts
1. `submit.sh` - Main entry point for submitting benchmark jobs for disaggregated configurations. This includes WideEP optimization for DEP>=16.
1. `submit_disagg.sh` - Main entry point for submitting benchmark jobs for disaggregated configurations. This includes WideEP optimization for DEP>=16.
2. `submit_agg.sh` - Main entry point for submitting benchmark jobs for aggregated configurations.
3. `post_process.py` - Scans the genai-perf results to produce a JSON file with one entry for each configuration point.
4. `plot_performance_comparison.py` - Takes the JSON result files for the disaggregated and/or aggregated configuration sweeps and plots a Pareto line for better visualization.
......@@ -104,7 +105,7 @@ export SERVED_MODEL_NAME="nvidia/DeepSeek-R1-FP4"
```bash
# Queues the SLURM jobs for disaggregated configurations for DeepSeek R1 without MTP
./submit.sh mtp=off all
./submit_disagg.sh mtp=off all
```
### Disaggregated (Includes WideEP) - MTP on
......
......@@ -117,7 +117,4 @@ srun -l --container-name=${CONTAINER_NAME} \
-w ${nodes[0]} \
bash ${SCRIPTS_DIR}/scripts/bench.sh ${served_model_name} ${MULTI_ROUND} 1 "${concurrency_list}" ${STREAMING} ${full_logdir} ${tp_size} ${artifacts_dir} ${model_path} ${isl} ${osl} ${kind} > ${full_logdir}/bench.log 2>&1
# Wait for all background processes to complete
wait
# Cleanup will be handled by the EXIT trap
\ No newline at end of file
......@@ -171,7 +171,7 @@ for ((i=1; i<=PREFILL_COUNT; i++)); do
--overlap \
--ntasks 4 \
--nodes 1 \
bash ${SCRIPTS_DIR}/scripts/start_worker.sh ${full_logdir}/prefill_config.yaml "${enable_pdl}" ${ctx_gpus} ${nsys_on} ${served_model_name} ${model_path} 'prefill' &> ${full_logdir}/output_workers.log &
bash ${SCRIPTS_DIR}/scripts/start_disagg_worker.sh ${full_logdir}/prefill_config.yaml "${enable_pdl}" ${ctx_gpus} ${nsys_on} ${served_model_name} ${model_path} 'prefill' &> ${full_logdir}/output_workers.log &
prefill_pids+=($!)
echo "$!" >> "$PID_FILE"
done
......@@ -202,7 +202,7 @@ for ((i=1; i<=DECODE_COUNT; i++)); do
--ntasks $gen_tp_size \
--oversubscribe \
--overlap \
bash ${SCRIPTS_DIR}/scripts/start_worker.sh ${full_logdir}/decode_config.yaml "${enable_pdl}" ${ctx_gpus} ${nsys_on} ${served_model_name} ${model_path} 'decode' &> ${full_logdir}/output_workers.log &
bash ${SCRIPTS_DIR}/scripts/start_disagg_worker.sh ${full_logdir}/decode_config.yaml "${enable_pdl}" ${ctx_gpus} ${nsys_on} ${served_model_name} ${model_path} 'decode' &> ${full_logdir}/output_workers.log &
echo "$!" >> "$PID_FILE"
done
......@@ -215,7 +215,5 @@ srun -l --container-name=${CONTAINER_NAME} \
-w ${nodes[0]} \
bash ${SCRIPTS_DIR}/scripts/bench.sh ${served_model_name} ${MULTI_ROUND} ${num_gen_servers} "${concurrency_list}" ${STREAMING} ${full_logdir} ${total_gpus} ${artifacts_dir} ${model_path} ${isl} ${osl} ${kind} > ${full_logdir}/bench.log 2>&1
# Wait for all background processes to complete
wait
# Cleanup will be handled by the EXIT trap
......@@ -7,6 +7,8 @@ set -e
set -u
trap 'echo "Error occurred at line $LINENO"; exit 1' ERR
WAIT_TIME=300
model=$1
multi_round=$2
num_gen_servers=$3
......@@ -96,48 +98,37 @@ if [ -f "${artifacts_dir}/deployment_config.json" ]; then
fi
echo "${deployment_config}" > "${artifacts_dir}/deployment_config.json"
# TODO: This is a temporary fix to check if the server is up.
# We should use a more robust health check mechanism.
# Loop up to 50 times
# Wait for server to become healthy (up to 50 attempts)
failed=true
for ((i=1; i<=50; i++)); do
# Run curl and capture response and HTTP code
response=$(curl -s -w "\n%{http_code}" "${hostname}:${port}/v1/chat/completions" \
-H "Content-Type: application/json" \
-d "{
\"model\": \"${model}\",
\"messages\": [
{
\"role\": \"user\",
\"content\": \"Tell me a story as if we were playing dungeons and dragons.\"
}
],
\"stream\": true,
\"max_tokens\": 30
}")
# Extract HTTP code
sleep $((i == 1 ? WAIT_TIME : 20))
response=$(curl -s -w "\n%{http_code}" "${hostname}:${port}/health")
http_code=$(echo "$response" | tail -n1)
if [ "$http_code" = "200" ]; then
echo "Success on attempt $i"
# Optional: Print the response body (excluding HTTP code)
echo "$response" | sed '$d'
break
else
echo "Attempt $i failed (HTTP $http_code)."
# Wait: 300 seconds after first failure, 10 seconds after subsequent
if [ "$i" -eq 1 ]; then
sleep 300
body=$(echo "$response" | sed '$d')
if [[ "$http_code" == "200" ]] && echo "$body" | grep -q '"status":"healthy"' && echo "$body" | grep -q '"endpoints":\[[^]]*"dyn://dynamo.tensorrt_llm.generate"'; then
if [[ "$kind" == *disagg* ]]; then
if echo "$body" | grep -q '"tensorrt_llm_next"'; then
echo "Health check succeeded on attempt $i"
echo "$body"
failed=false
break
else
echo "Attempt $i: tensorrt_llm_next key not found in etcd."
fi
else
sleep 10
echo "Health check succeeded on attempt $i"
echo "$body"
failed=false
break
fi
else
echo "Attempt $i failed: /health not ready (HTTP $http_code)."
fi
done
if [ "$http_code" != "200" ]; then
echo "Server did not respond correctly after 50 attempts."
if [[ "$failed" == "true" ]]; then
echo "Server did not respond with healthy status after 50 attempts."
exit 1
fi
......
......@@ -100,9 +100,6 @@ if [ ${mtp} -gt 0 ]; then
cat << EOF > ${extra_llm_api_file}
tensor_parallel_size: ${tp_size}
moe_expert_parallel_size: ${ep_size}
max_batch_size: ${max_batch}
max_num_tokens: ${max_num_tokens}
max_seq_len: ${max_seq_len}
trust_remote_code: true
cuda_graph_config:
enable_padding: true
......@@ -119,15 +116,11 @@ speculative_config:
num_nextn_predict_layers: ${mtp}
moe_config:
backend: ${moe_backend}
max_num_tokens: 37376
EOF
else
cat << EOF > ${extra_llm_api_file}
tensor_parallel_size: ${tp_size}
moe_expert_parallel_size: ${ep_size}
max_batch_size: ${max_batch}
max_num_tokens: ${max_num_tokens}
max_seq_len: ${max_seq_len}
trust_remote_code: true
cuda_graph_config:
enable_padding: true
......@@ -141,7 +134,6 @@ kv_cache_config:
stream_interval: 10
moe_config:
backend: ${moe_backend}
max_num_tokens: 37376
EOF
fi
......@@ -154,5 +146,12 @@ echo "TRT_LLM_VERSION: $TRT_LLM_VERSION"
echo "TRT_LLM_GIT_COMMIT: $TRT_LLM_GIT_COMMIT"
# start the server
trtllm-llmapi-launch python3 -m dynamo.trtllm --model-path $model_path --served-model-name $model_name --extra-engine-args ${extra_llm_api_file}
trtllm-llmapi-launch python3 -m dynamo.trtllm \
--model-path $model_path \
--served-model-name $model_name \
--max-num-tokens ${max_num_tokens} \
--max-batch-size ${max_batch} \
--max-seq-len ${max_seq_len} \
--extra-engine-args ${extra_llm_api_file}
#! /bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Launches one dynamo.trtllm worker in the requested disaggregation mode.
#
# Positional arguments:
#   $1  config_file          YAML file passed on as --extra-engine-args; it must
#                            also define max_num_tokens, max_batch_size and
#                            max_seq_len as top-level keys
#   $2  enable_pdl           "true" exports TRTLLM_ENABLE_PDL=1
#   $3  ctx_gpus             only logged by this script
#   $4  model_name           forwarded as --served-model-name
#   $5  model_path           forwarded as --model-path
#   $6  disaggregation_mode  forwarded as --disaggregation-mode
#
# Exits with status 1 when the config file is missing or any of the three
# required limits cannot be read from it.
#
# NOTE(review): the launcher invocation of this script appears to pass an
# unquoted ${nsys_on} between ctx_gpus and the model name. The positions above
# only line up when that expands to nothing -- confirm against the caller.

config_file=$1
enable_pdl=$2
ctx_gpus=$3
model_name=$4
model_path=$5
disaggregation_mode=$6

# Drop any inherited UCX_TLS setting before the worker starts.
unset UCX_TLS
echo "config_file: ${config_file}, enable_pdl: ${enable_pdl}, ctx_gpus: ${ctx_gpus}, disaggregation_mode: ${disaggregation_mode}"

# Prints the value of top-level key $1 from YAML file $2 (empty if absent).
# Only the first whitespace-delimited token of the first matching line is
# kept, so trailing blanks, carriage returns, inline "# ..." comments and
# duplicate keys cannot leak extra words into the launch command.
read_top_level_scalar() {
  local key=$1
  local file=$2
  sed -n "s/^${key}:[[:space:]]*\([^#[:space:]]*\).*/\1/p" "${file}" | head -n 1
}

# Read configuration values from the YAML config file
if [[ ! -f "${config_file}" ]]; then
  echo "Error: Config file ${config_file} not found" >&2
  exit 1
fi

# Note: TensorRT-LLM may not respect max_num_tokens, max_batch_size and
# max_seq_len when they are provided only through the YAML config file, so
# they are also passed on the command line to make sure they take effect.
max_num_tokens=$(read_top_level_scalar max_num_tokens "${config_file}")
max_batch_size=$(read_top_level_scalar max_batch_size "${config_file}")
max_seq_len=$(read_top_level_scalar max_seq_len "${config_file}")

# Validate that we got the values
if [[ -z "${max_num_tokens}" || -z "${max_batch_size}" || -z "${max_seq_len}" ]]; then
  {
    echo "Error: Failed to read required configuration values from ${config_file}"
    echo "max_num_tokens: ${max_num_tokens}"
    echo "max_batch_size: ${max_batch_size}"
    echo "max_seq_len: ${max_seq_len}"
  } >&2
  exit 1
fi

echo "Configuration loaded from ${config_file}:"
echo " max_num_tokens: ${max_num_tokens}"
echo " max_batch_size: ${max_batch_size}"
echo " max_seq_len: ${max_seq_len}"

export TLLM_LOG_LEVEL=INFO
export TRTLLM_MOE_ENABLE_ALLTOALL_WITHOUT_ALLGATHER=1
if [[ "${enable_pdl}" == "true" ]]; then
  export TRTLLM_ENABLE_PDL=1
fi

# Every expansion is quoted so that paths or names containing spaces or glob
# characters reach the worker as single, unmodified arguments.
trtllm-llmapi-launch python3 -m dynamo.trtllm \
  --model-path "${model_path}" \
  --served-model-name "${model_name}" \
  --max-num-tokens "${max_num_tokens}" \
  --max-batch-size "${max_batch_size}" \
  --max-seq-len "${max_seq_len}" \
  --disaggregation-mode "${disaggregation_mode}" \
  --extra-engine-args "${config_file}"
......@@ -17,7 +17,7 @@ nats-server -js &
etcd --listen-client-urls http://0.0.0.0:2379 --advertise-client-urls http://0.0.0.0:2379 --data-dir /tmp/etcd &
# Wait for NATS/etcd to startup
sleep 3
sleep 2
# Start OpenAI Frontend which will dynamically discover workers when they startup
# NOTE: This is a blocking call.
......
#! /bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Launches one dynamo.trtllm worker in the requested disaggregation mode.
#
# Positional arguments:
#   $1  config_file          file forwarded as --extra-engine-args
#   $2  enable_pdl           "true" exports TRTLLM_ENABLE_PDL=1
#   $3  ctx_gpus             only logged by this script
#   $4  model_name           forwarded as --served-model-name
#   $5  model_path           forwarded as --model-path
#   $6  disaggregation_mode  forwarded as --disaggregation-mode

config_file=$1
enable_pdl=$2
ctx_gpus=$3
model_name=$4
model_path=$5
disaggregation_mode=$6

# Drop any inherited UCX_TLS setting before the worker starts.
unset UCX_TLS
echo "config_file: ${config_file}, enable_pdl: ${enable_pdl}, ctx_gpus: ${ctx_gpus}, disaggregation_mode: ${disaggregation_mode}"

export TLLM_LOG_LEVEL=INFO
export TRTLLM_MOE_ENABLE_ALLTOALL_WITHOUT_ALLGATHER=1
if [[ "${enable_pdl}" == "true" ]]; then
  export TRTLLM_ENABLE_PDL=1
fi

# Every expansion is quoted so that paths or names containing spaces or glob
# characters reach the worker as single, unmodified arguments.
trtllm-llmapi-launch python3 -m dynamo.trtllm \
  --model-path "${model_path}" \
  --served-model-name "${model_name}" \
  --disaggregation-mode "${disaggregation_mode}" \
  --extra-engine-args "${config_file}"
......@@ -94,7 +94,7 @@ run_single() {
total_nodes=$((ctx_num + gen_nodes))
total_tasks=$((total_nodes * 4))
set -x
sbatch --nodes=${total_nodes} --ntasks=${total_tasks} --ntasks-per-node=${NTASKS_PER_NODE} --segment=${total_nodes} ${slurm_args} benchmark.slurm ${ctx_num} 4 1 8448 true ${gen_num} ${gen_tp_size} ${gen_batch_size} ${gen_max_num_tokens} ${gen_enable_attention_dp} ${gen_gpu_memory_fraction} ${gen_eplb_num_slots} ${gen_mtp_size} "${gen_concurrency_list}" ${gen_nodes} ${kind} ${MODEL_PATH} ${SERVED_MODEL_NAME} ${IMAGE} ${ISL} ${OSL}
sbatch --nodes=${total_nodes} --ntasks=${total_tasks} --ntasks-per-node=${NTASKS_PER_NODE} --segment=${total_nodes} ${slurm_args} benchmark_disagg.slurm ${ctx_num} 4 1 8448 true ${gen_num} ${gen_tp_size} ${gen_batch_size} ${gen_max_num_tokens} ${gen_enable_attention_dp} ${gen_gpu_memory_fraction} ${gen_eplb_num_slots} ${gen_mtp_size} "${gen_concurrency_list}" ${gen_nodes} ${kind} ${MODEL_PATH} ${SERVED_MODEL_NAME} ${IMAGE} ${ISL} ${OSL}
set +x
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment