feat: Use health check and improve instructions for perf sweeps (#2423)

86a4a58e · Tanmay Verma · GitHub · 6f7f6b12 · 86a4a58e · 86a4a58e
Unverified Commit 86a4a58e authored Aug 19, 2025 by Tanmay Verma Committed by GitHub Aug 19, 2025
9 changed files
--- a/components/backends/trtllm/performance_sweeps/README.md
+++ b/components/backends/trtllm/performance_sweeps/README.md
@@ -25,17 +25,18 @@ This directory contains scripts for benchmarking TensorRT-LLM performance with D
 Please note that:

 - These scripts have not undergone formal quality assurance testing
- They were executed on GB200 systems
+- These scripts were tested on GB200 systems. To run all configurations, you will need at least 16 nodes, with each node equipped with 4 GPUs.
 - They are intended for demonstration and educational purposes
 - Use at your own risk in production environments
 - Always review and test scripts thoroughly before running in your specific environment
+- In disaggregated mode, using the `--exclusive` flag to launch worker processes can impact runtime performance. Hence, these scripts specify the nodelist explicitly in each `srun` call.
 - We are actively working on refining the configuration sweeps.

 ## Scripts Overview

 ### Core Scripts

-1. `submit.sh` - Main entry point for submitting benchmark jobs for disaggregated configurations. This includes WideEP optimization for DEP>=16.
+1. `submit_disagg.sh` - Main entry point for submitting benchmark jobs for disaggregated configurations. This includes WideEP optimization for DEP>=16.
 2. `submit_agg.sh` - Main entry point for submitting benchmark jobs for aggregated configurations.
 3. `post_process.py` - Scan the genai-perf results to produce a json with entries to each config point.
 4. `plot_performance_comparison.py` - Takes the json result file for disaggregated and/or aggregated configuration sweeps and plots a pareto line for better visualization.
@@ -104,7 +105,7 @@ export SERVED_MODEL_NAME="nvidia/DeepSeek-R1-FP4"

 ```bash
 # Queues the SLURM jobs for disaggregated configurations for DeepSeek R1 without MTP
-./submit.sh mtp=off all
+./submit_disagg.sh mtp=off all
 ```

 ### Disaggregated (Includes WideEP) - MTP on

--- a/components/backends/trtllm/performance_sweeps/benchmark_agg.slurm
+++ b/components/backends/trtllm/performance_sweeps/benchmark_agg.slurm
@@ -117,7 +117,4 @@ srun -l --container-name=${CONTAINER_NAME} \
 	-w ${nodes[0]} \
        bash ${SCRIPTS_DIR}/scripts/bench.sh ${served_model_name} ${MULTI_ROUND} 1 "${concurrency_list}" ${STREAMING} ${full_logdir} ${tp_size} ${artifacts_dir} ${model_path} ${isl} ${osl} ${kind} > ${full_logdir}/bench.log 2>&1

-# Wait for all background processes to complete
-wait
-
 # Cleanup will be handled by the EXIT trap
\ No newline at end of file
--- a/components/backends/trtllm/performance_sweeps/benchmark.slurm
+++ b/components/backends/trtllm/performance_sweeps/benchmark.slurm
@@ -171,7 +171,7 @@ for ((i=1; i<=PREFILL_COUNT; i++)); do
      --overlap \
      --ntasks 4 \
      --nodes 1 \
-      bash ${SCRIPTS_DIR}/scripts/start_worker.sh ${full_logdir}/prefill_config.yaml "${enable_pdl}" ${ctx_gpus} ${nsys_on} ${served_model_name} ${model_path} 'prefill' &> ${full_logdir}/output_workers.log &
+      bash ${SCRIPTS_DIR}/scripts/start_disagg_worker.sh ${full_logdir}/prefill_config.yaml "${enable_pdl}" ${ctx_gpus} ${nsys_on} ${served_model_name} ${model_path} 'prefill' &> ${full_logdir}/output_workers.log &
  prefill_pids+=($!)
  echo "$!" >> "$PID_FILE"
 done
@@ -202,7 +202,7 @@ for ((i=1; i<=DECODE_COUNT; i++)); do
      --ntasks $gen_tp_size \
      --oversubscribe \
      --overlap \
-      bash ${SCRIPTS_DIR}/scripts/start_worker.sh ${full_logdir}/decode_config.yaml "${enable_pdl}" ${ctx_gpus} ${nsys_on} ${served_model_name} ${model_path} 'decode' &> ${full_logdir}/output_workers.log &
+      bash ${SCRIPTS_DIR}/scripts/start_disagg_worker.sh ${full_logdir}/decode_config.yaml "${enable_pdl}" ${ctx_gpus} ${nsys_on} ${served_model_name} ${model_path} 'decode' &> ${full_logdir}/output_workers.log &
  echo "$!" >> "$PID_FILE"
 done

@@ -215,7 +215,5 @@ srun -l --container-name=${CONTAINER_NAME} \
 	-w ${nodes[0]} \
        bash ${SCRIPTS_DIR}/scripts/bench.sh ${served_model_name} ${MULTI_ROUND} ${num_gen_servers} "${concurrency_list}" ${STREAMING} ${full_logdir} ${total_gpus} ${artifacts_dir} ${model_path} ${isl} ${osl} ${kind} > ${full_logdir}/bench.log 2>&1

-# Wait for all background processes to complete
-wait

 # Cleanup will be handled by the EXIT trap
--- a/components/backends/trtllm/performance_sweeps/scripts/bench.sh
+++ b/components/backends/trtllm/performance_sweeps/scripts/bench.sh
@@ -7,6 +7,8 @@ set -e
 set -u
 trap 'echo "Error occurred at line $LINENO"; exit 1' ERR

+WAIT_TIME=300
+
 model=$1
 multi_round=$2
 num_gen_servers=$3
@@ -96,48 +98,37 @@ if [ -f "${artifacts_dir}/deployment_config.json" ]; then
 fi
 echo "${deployment_config}" > "${artifacts_dir}/deployment_config.json"

-# TODO: This is a temporary fix to check if the server is up.
-# We should use a more robust health check mechanism.
-
-# Loop up to 50 times
+# Wait for the server to become healthy: up to 50 attempts, sleeping WAIT_TIME seconds before the first attempt and 20 seconds before each later one
+failed=true
 for ((i=1; i<=50; i++)); do
-    # Run curl and capture response and HTTP code
-    response=$(curl -s -w "\n%{http_code}" "${hostname}:${port}/v1/chat/completions" \
-      -H "Content-Type: application/json" \
-      -d "{
-        \"model\": \"${model}\",
-        \"messages\": [
-           {
-            \"role\": \"user\",
-            \"content\": \"Tell me a story as if we were playing dungeons and dragons.\"
-           }
-        ],
-        \"stream\": true,
-        \"max_tokens\": 30
-      }")
-
-    # Extract HTTP code
+    sleep $((i == 1 ? WAIT_TIME : 20))
+    response=$(curl -s -w "\n%{http_code}" "${hostname}:${port}/health")
    http_code=$(echo "$response" | tail -n1)
-
-    if [ "$http_code" = "200" ]; then
-        echo "Success on attempt $i"
-        # Optional: Print the response body (excluding HTTP code)
-        echo "$response" | sed '$d'
-        break
-    else
-        echo "Attempt $i failed (HTTP $http_code)."
-
-        # Wait: 100 seconds after first failure, 10 seconds after subsequent
-        if [ "$i" -eq 1 ]; then
-            sleep 300
+    body=$(echo "$response" | sed '$d')
+
+    if [[ "$http_code" == "200" ]] && echo "$body" | grep -q '"status":"healthy"' && echo "$body" | grep -q '"endpoints":\[[^]]*"dyn://dynamo.tensorrt_llm.generate"'; then
+        if [[ "$kind" == *disagg* ]]; then
+            if echo "$body" | grep -q '"tensorrt_llm_next"'; then
+                echo "Health check succeeded on attempt $i"
+                echo "$body"
+                failed=false
+                break
+            else
+                echo "Attempt $i: tensorrt_llm_next key not found in etcd."
+            fi
        else
-            sleep 10
+            echo "Health check succeeded on attempt $i"
+            echo "$body"
+            failed=false
+            break
        fi
+    else
+        echo "Attempt $i failed: /health not ready (HTTP $http_code)."
    fi
 done

-if [ "$http_code" != "200" ]; then
-    echo "Server did not respond correctly after 50 attempts."
+if [[ "$failed" == "true" ]]; then
+    echo "Server did not respond with healthy status after 50 attempts."
    exit 1
 fi


--- a/components/backends/trtllm/performance_sweeps/scripts/start_agg_worker.sh
+++ b/components/backends/trtllm/performance_sweeps/scripts/start_agg_worker.sh
@@ -100,9 +100,6 @@ if [ ${mtp} -gt 0 ]; then
 cat << EOF > ${extra_llm_api_file}
 tensor_parallel_size: ${tp_size}
 moe_expert_parallel_size: ${ep_size}
-max_batch_size: ${max_batch}
-max_num_tokens: ${max_num_tokens}
-max_seq_len: ${max_seq_len}
 trust_remote_code: true
 cuda_graph_config:
    enable_padding: true
@@ -119,15 +116,11 @@ speculative_config:
  num_nextn_predict_layers: ${mtp}
 moe_config:
    backend: ${moe_backend}
-    max_num_tokens: 37376
 EOF
 else
 cat << EOF > ${extra_llm_api_file}
 tensor_parallel_size: ${tp_size}
 moe_expert_parallel_size: ${ep_size}
-max_batch_size: ${max_batch}
-max_num_tokens: ${max_num_tokens}
-max_seq_len: ${max_seq_len}
 trust_remote_code: true
 cuda_graph_config:
    enable_padding: true
@@ -141,7 +134,6 @@ kv_cache_config:
 stream_interval: 10
 moe_config:
    backend: ${moe_backend}
-    max_num_tokens: 37376
 EOF
 fi

@@ -154,5 +146,12 @@ echo "TRT_LLM_VERSION: $TRT_LLM_VERSION"
 echo "TRT_LLM_GIT_COMMIT: $TRT_LLM_GIT_COMMIT"

 # start the server
-trtllm-llmapi-launch python3 -m dynamo.trtllm --model-path $model_path --served-model-name $model_name --extra-engine-args ${extra_llm_api_file}
+trtllm-llmapi-launch python3 -m dynamo.trtllm \
+    --model-path $model_path \
+    --served-model-name $model_name \
+    --max-num-tokens ${max_num_tokens} \
+    --max-batch-size ${max_batch} \
+    --max-seq-len ${max_seq_len} \
+    --extra-engine-args ${extra_llm_api_file}
+

--- a/components/backends/trtllm/performance_sweeps/scripts/start_disagg_worker.sh
+++ b/components/backends/trtllm/performance_sweeps/scripts/start_disagg_worker.sh
+#! /bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+config_file=$1
+enable_pdl=$2
+ctx_gpus=$3
+model_name=$4
+model_path=$5
+disaggregation_mode=$6
+unset UCX_TLS
+echo "config_file: ${config_file}, enable_pdl: ${enable_pdl}, ctx_gpus: ${ctx_gpus}, disaggregation_mode: ${disaggregation_mode}"
+
+# Read configuration values from the YAML config file
+if [ ! -f "${config_file}" ]; then
+    echo "Error: Config file ${config_file} not found"
+    exit 1
+fi
+
+# Note: TensorRT-LLM may not respect max_num_tokens, max_batch_size, and
+# max_seq_len when they are provided only in the YAML config file, so these
+# values are also passed on the command line to make sure they are respected.
+max_num_tokens=$(grep "^max_num_tokens:" "${config_file}" | sed 's/.*: *//')
+max_batch_size=$(grep "^max_batch_size:" "${config_file}" | sed 's/.*: *//')
+max_seq_len=$(grep "^max_seq_len:" "${config_file}" | sed 's/.*: *//')
+
+
+# Validate that we got the values
+if [ -z "${max_num_tokens}" ] || [ -z "${max_batch_size}" ] || [ -z "${max_seq_len}" ]; then
+    echo "Error: Failed to read required configuration values from ${config_file}"
+    echo "max_num_tokens: ${max_num_tokens}"
+    echo "max_batch_size: ${max_batch_size}"
+    echo "max_seq_len: ${max_seq_len}"
+    exit 1
+fi
+
+echo "Configuration loaded from ${config_file}:"
+echo "  max_num_tokens: ${max_num_tokens}"
+echo "  max_batch_size: ${max_batch_size}"
+echo "  max_seq_len: ${max_seq_len}"
+
+export TLLM_LOG_LEVEL=INFO
+export TRTLLM_MOE_ENABLE_ALLTOALL_WITHOUT_ALLGATHER=1
+
+if [ "${enable_pdl}" = "true" ]; then
+    export TRTLLM_ENABLE_PDL=1
+fi
+
+trtllm-llmapi-launch python3 -m dynamo.trtllm \
+    --model-path ${model_path} \
+    --served-model-name ${model_name} \
+    --max-num-tokens ${max_num_tokens} \
+    --max-batch-size ${max_batch_size} \
+    --max-seq-len ${max_seq_len} \
+    --disaggregation-mode ${disaggregation_mode} \
+    --extra-engine-args ${config_file}
--- a/components/backends/trtllm/performance_sweeps/scripts/start_frontend.sh
+++ b/components/backends/trtllm/performance_sweeps/scripts/start_frontend.sh
@@ -17,7 +17,7 @@ nats-server -js &
 etcd --listen-client-urls http://0.0.0.0:2379 --advertise-client-urls http://0.0.0.0:2379 --data-dir /tmp/etcd &

 # Wait for NATS/etcd to startup
-sleep 3
+sleep 2

 # Start OpenAI Frontend which will dynamically discover workers when they startup
 # NOTE: This is a blocking call.

--- a/components/backends/trtllm/performance_sweeps/scripts/start_worker.sh
+++ b/components/backends/trtllm/performance_sweeps/scripts/start_worker.sh
-#! /bin/bash
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-config_file=$1
-enable_pdl=$2
-ctx_gpus=$3
-model_name=$4
-model_path=$5
-disaggregation_mode=$6
-unset UCX_TLS
-echo "config_file: ${config_file}, enable_pdl: ${enable_pdl}, ctx_gpus: ${ctx_gpus}, disaggregation_mode: ${disaggregation_mode}"
-
-export TLLM_LOG_LEVEL=INFO
-export TRTLLM_MOE_ENABLE_ALLTOALL_WITHOUT_ALLGATHER=1
-
-if [ "${enable_pdl}" = "true" ]; then
-    export TRTLLM_ENABLE_PDL=1
-fi
-
-trtllm-llmapi-launch python3 -m dynamo.trtllm --model-path $model_path --served-model-name $model_name --disaggregation-mode $disaggregation_mode  --extra-engine-args $config_file
--- a/components/backends/trtllm/performance_sweeps/submit.sh
+++ b/components/backends/trtllm/performance_sweeps/submit.sh
@@ -94,7 +94,7 @@ run_single() {
    total_nodes=$((ctx_num + gen_nodes))
    total_tasks=$((total_nodes * 4))
    set -x
-    sbatch --nodes=${total_nodes} --ntasks=${total_tasks} --ntasks-per-node=${NTASKS_PER_NODE} --segment=${total_nodes} ${slurm_args} benchmark.slurm ${ctx_num} 4 1 8448 true ${gen_num} ${gen_tp_size} ${gen_batch_size} ${gen_max_num_tokens} ${gen_enable_attention_dp} ${gen_gpu_memory_fraction} ${gen_eplb_num_slots} ${gen_mtp_size} "${gen_concurrency_list}" ${gen_nodes} ${kind} ${MODEL_PATH} ${SERVED_MODEL_NAME} ${IMAGE} ${ISL} ${OSL}
+    sbatch --nodes=${total_nodes} --ntasks=${total_tasks} --ntasks-per-node=${NTASKS_PER_NODE} --segment=${total_nodes} ${slurm_args} benchmark_disagg.slurm ${ctx_num} 4 1 8448 true ${gen_num} ${gen_tp_size} ${gen_batch_size} ${gen_max_num_tokens} ${gen_enable_attention_dp} ${gen_gpu_memory_fraction} ${gen_eplb_num_slots} ${gen_mtp_size} "${gen_concurrency_list}" ${gen_nodes} ${kind} ${MODEL_PATH} ${SERVED_MODEL_NAME} ${IMAGE} ${ISL} ${OSL}
    set +x
 }