feat: sglang multiple frontend slurm deployment scripts (#2623)

Signed-off-by: ishandhanani <ishandhanani@gmail.com> Signed-off-by: Elnifio <elnifio0519@gmail.com> Co-authored-by: Yunzhou Liu <46603306+Elnifio@users.noreply.github.com>

feat: sglang multiple frontend slurm deployment scripts (#2623)
Signed-off-by: ishandhanani <ishandhanani@gmail.com> Signed-off-by: Elnifio <elnifio0519@gmail.com> Co-authored-by: Yunzhou Liu <46603306+Elnifio@users.noreply.github.com>
4b3a2c1a · ishandhanani · GitHub · cfb7aed7 · 4b3a2c1a · 4b3a2c1a
Unverified Commit 4b3a2c1a authored Sep 10, 2025 by ishandhanani Committed by GitHub Sep 10, 2025
14 changed files
--- a/components/backends/sglang/slurm_jobs/job_script_template.j2
+++ b/components/backends/sglang/slurm_jobs/job_script_template.j2
@@ -10,6 +10,7 @@
 #SBATCH --partition={{ partition }}

 # Constants
+set -x
 PREFILL_NODES={{ prefill_nodes }}
 DECODE_NODES={{ decode_nodes }}
 PREFILL_WORKERS={{ prefill_workers }}
@@ -18,7 +19,7 @@ TOTAL_NODES=$((PREFILL_NODES + DECODE_NODES))
 GPUS_PER_NODE={{ gpus_per_node }}
 PREFILL_NODES_PER_WORKER=$((PREFILL_NODES / PREFILL_WORKERS))
 DECODE_NODES_PER_WORKER=$((DECODE_NODES / DECODE_WORKERS))
-LOG_DIR="${SLURM_SUBMIT_DIR}/logs/${SLURM_JOB_ID}/"
+LOG_DIR="${SLURM_SUBMIT_DIR}/logs/${SLURM_JOB_ID}"
 SCRIPT_DIR="${SLURM_SUBMIT_DIR}/scripts"
 OUTPUT_DIR="${SLURM_SUBMIT_DIR}/outputs"
 MODEL_DIR="{{ model_dir }}"
@@ -26,6 +27,7 @@ CONFIG_DIR="{{ config_dir }}"
 CONTAINER_IMAGE="{{ container_image }}"
 NETWORK_INTERFACE="{{ network_interface }}"
 GPU_TYPE="{{ gpu_type | default('h100') }}"
+set +x

 {% raw %}

@@ -42,24 +44,114 @@ for i in "${!nodes[@]}"; do
    echo "Node $i: ${nodes[$i]}"
 done

-# Get IP address of the master node (first prefill node) for NATS/ETCD
+{% endraw %}
+{% if enable_multiple_frontends %}
+{% raw %}
+# Multiple frontend architecture
+# Node 0: nginx only
+# Node 1: NATS/ETCD + first frontend + prefill worker
+# Node 2+: prefill/decode workers + optional additional frontends
+
+NGINX_NODE=${nodes[0]}
+MASTER_NODE=${nodes[1]}
+MASTER_IP=$(srun --nodes=1 --ntasks=1 --nodelist=$MASTER_NODE ip addr show $NETWORK_INTERFACE | grep 'inet ' | awk '{print $2}' | cut -d'/' -f1)
+if [ -z "$MASTER_IP" ]; then
+    echo "Error: Could not retrieve IP address for master host $MASTER_NODE on interface $NETWORK_INTERFACE"
+    exit 1
+fi
+echo "Master IP address (node 1): $MASTER_IP"
+echo "Nginx node (node 0): $NGINX_NODE"
+
+# Generate frontend IP list for nginx config
+frontend_hosts=()
+frontend_ips=()
+# Node 1 always has a frontend (with NATS/ETCD)
+frontend_hosts+=("$MASTER_NODE")
+frontend_ips+=("$MASTER_IP")
+
+# Add additional frontends based on num_additional_frontends
+{% endraw %}ADDITIONAL_FRONTENDS={{ num_additional_frontends }}{% raw %}
+if [ "$ADDITIONAL_FRONTENDS" -gt 0 ]; then
+    # Calculate which nodes get additional frontends
+    # We have TOTAL_NODES prefill/decode nodes, distribute additional frontends across them
+    nodes_per_frontend=$(( (TOTAL_NODES - 1 + ADDITIONAL_FRONTENDS - 1) / ADDITIONAL_FRONTENDS ))  # ceil division
+    frontend_node_idx=2  # Start from node 2 (node 1 already has frontend)
+
+    for i in $(seq 1 $ADDITIONAL_FRONTENDS); do
+        if [ $frontend_node_idx -lt $TOTAL_NODES ]; then
+            node_name=${nodes[$frontend_node_idx]}
+            node_ip=$(srun --nodes=1 --ntasks=1 --nodelist=$node_name ip addr show $NETWORK_INTERFACE | grep 'inet ' | awk '{print $2}' | cut -d'/' -f1)
+            frontend_hosts+=("$node_name")
+            frontend_ips+=("$node_ip")
+            echo "Additional frontend $i on node $frontend_node_idx: $node_name ($node_ip)"
+            frontend_node_idx=$((frontend_node_idx + nodes_per_frontend))
+        fi
+    done
+fi
+
+echo "Frontend hosts: ${frontend_hosts[@]}"
+echo "Frontend IPs: ${frontend_ips[@]}"
+
+# Generate nginx configuration
+# Build a Python list literal of frontend hosts from the bash array
+FRONTEND_LIST=$(printf "'%s'," "${frontend_ips[@]}")
+FRONTEND_LIST="[${FRONTEND_LIST%,}]"
+export FRONTEND_LIST SCRIPT_DIR LOG_DIR
+python3 - <<'PY'
+import os
+from jinja2 import Template
+
+template_path = os.path.join(os.environ['SCRIPT_DIR'], 'nginx.conf.j2')
+output_path = os.path.join(os.environ['LOG_DIR'], 'nginx.conf')
+
+with open(template_path, 'r') as f:
+    tmpl = Template(f.read())
+
+frontend_hosts = eval(os.environ['FRONTEND_LIST'])
+config = tmpl.render(frontend_hosts=frontend_hosts)
+
+with open(output_path, 'w') as f:
+    f.write(config)
+PY
+
+{% endraw %}
+{% else %}
+{% raw %}
+# Traditional architecture - first prefill node handles everything
 MASTER_IP=$(srun --nodes=1 --ntasks=1 --nodelist=${nodes[0]} ip addr show $NETWORK_INTERFACE | grep 'inet ' | awk '{print $2}' | cut -d'/' -f1)
 if [ -z "$MASTER_IP" ]; then
    echo "Error: Could not retrieve IP address for master host ${nodes[0]} on interface $NETWORK_INTERFACE"
    exit 1
 fi
 echo "Master IP address: $MASTER_IP"
+{% endraw %}
+{% endif %}
+{% raw %}

 # Compute leader nodes for each worker
+{% endraw %}
+{% if enable_multiple_frontends %}
+{% raw %}
+# With multiple frontends: keep offset 0; nginx coexists on node 0
+WORKER_NODE_OFFSET=0
+{% endraw %}
+{% else %}
+{% raw %}
+# Traditional: workers start from node 0
+WORKER_NODE_OFFSET=0
+{% endraw %}
+{% endif %}
+{% raw %}
+
 prefill_leaders=()
 for i in $(seq 0 $((PREFILL_WORKERS - 1))); do
-    leader_idx=$((i * PREFILL_NODES_PER_WORKER))
+    leader_idx=$((WORKER_NODE_OFFSET + i * PREFILL_NODES_PER_WORKER))
    prefill_leaders[$i]=$leader_idx
 done

 decode_leaders=()
 for i in $(seq 0 $((DECODE_WORKERS - 1))); do
-    leader_idx=$((PREFILL_NODES + i * DECODE_NODES_PER_WORKER))
+    leader_idx=$((WORKER_NODE_OFFSET + PREFILL_NODES + i * DECODE_NODES_PER_WORKER))
    decode_leaders[$i]=$leader_idx
 done

@@ -76,6 +168,57 @@ ENROOT_ARGS="\

 # Build common worker arguments
 WORKER_ARGS="--gpu_type ${GPU_TYPE} --gpus_per_node ${GPUS_PER_NODE} --master_ip ${MASTER_IP}"
+{% endraw %}
+{% if enable_multiple_frontends %}
+{% raw %}
+# Add multiple frontends flag for worker setup
+WORKER_ARGS="$WORKER_ARGS --multiple-frontends-enabled"
+{% endraw %}
+{% endif %}
+{% if use_init_location %}
+{% raw %}
+# Add multiple frontends flag for worker setup
+WORKER_ARGS="$WORKER_ARGS --use_init_locations"
+{% endraw %}
+{% endif %}
+{% raw %}
+
+{% endraw %}
+{% if enable_multiple_frontends %}
+{% raw %}
+# Launch nginx on node 0
+echo "Launching nginx on ${NGINX_NODE}"
+cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$NGINX_NODE --output=${LOG_DIR}/${NGINX_NODE}_nginx.out --error=${LOG_DIR}/${NGINX_NODE}_nginx.err python /scripts/worker_setup.py --worker_type nginx --nginx_config /logs/nginx.conf ${WORKER_ARGS}"
+echo "$cmd"
+$cmd &
+
+# Launch frontend on master node (node 1) - this will also start NATS/ETCD
+echo "Launching frontend + NATS/ETCD on master node ${MASTER_NODE}"
+cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$MASTER_NODE --output=${LOG_DIR}/${MASTER_NODE}_frontend_0.out --error=${LOG_DIR}/${MASTER_NODE}_frontend.err python /scripts/worker_setup.py --worker_type frontend --worker_idx 0 ${WORKER_ARGS}"
+echo "$cmd"
+$cmd &
+
+# Launch additional frontends on designated nodes
+if [ "$ADDITIONAL_FRONTENDS" -gt 0 ]; then
+    frontend_idx=1  # Start from 1 since node 1 is frontend 0
+    nodes_per_frontend=$(( (TOTAL_NODES - 2 + ADDITIONAL_FRONTENDS - 1) / ADDITIONAL_FRONTENDS ))
+    frontend_node_idx=2
+
+    for i in $(seq 1 $ADDITIONAL_FRONTENDS); do
+        if [ $frontend_node_idx -lt $TOTAL_NODES ]; then
+            node=${nodes[$frontend_node_idx]}
+            echo "Launching additional frontend $frontend_idx on node $frontend_node_idx: $node"
+            cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_frontend_${frontend_idx}.out --error=${LOG_DIR}/${node}_frontend_${frontend_idx}.err python /scripts/worker_setup.py --worker_type frontend --worker_idx ${frontend_idx} ${WORKER_ARGS}"
+            echo "$cmd"
+            $cmd &
+            frontend_idx=$((frontend_idx + 1))
+            frontend_node_idx=$((frontend_node_idx + nodes_per_frontend))
+        fi
+    done
+fi
+{% endraw %}
+{% endif %}
+{% raw %}

 # Launch prefill workers
 for worker_idx in $(seq 0 $((PREFILL_WORKERS - 1))); do
@@ -83,7 +226,7 @@ for worker_idx in $(seq 0 $((PREFILL_WORKERS - 1))); do
    leader_node=${nodes[$leader_idx]}

    # Get leader IP for this worker group
-    LEADER_IP=$(srun --nodes=1 --ntasks=1 --nodelist=$leader_node ip route get $(getent ahosts $leader_node | grep STREAM | head -1 | awk '{print $1}') | awk '{for(i=1;i<=NF;i++) if($i=="src") print $(i+1)}')
+    LEADER_IP=$(srun --nodes=1 --ntasks=1 --nodelist=$leader_node ip addr show $NETWORK_INTERFACE | grep 'inet ' | awk '{print $2}' | cut -d'/' -f1)
    echo "Prefill worker $worker_idx leader: $leader_node ($LEADER_IP)"

    # Launch all nodes for this worker
@@ -94,7 +237,7 @@ for worker_idx in $(seq 0 $((PREFILL_WORKERS - 1))); do

        echo "Launching prefill worker $worker_idx, node $global_node_idx (local_rank $local_rank): $node"

-        cmd="srun $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_prefill_w${worker_idx}.out --error=${LOG_DIR}/${node}_prefill_w${worker_idx}.err python /scripts/worker_setup.py --leader_ip ${LEADER_IP} --worker_idx ${worker_idx} --local_rank ${local_rank} --nodes_per_worker ${PREFILL_NODES_PER_WORKER} --worker_type prefill --gpu_utilization_log /logs/${node}_prefill_w${worker_idx}_gpu_utilization.log ${WORKER_ARGS}"
+        cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_prefill_w${worker_idx}.out --error=${LOG_DIR}/${node}_prefill_w${worker_idx}.err python /scripts/worker_setup.py --leader_ip ${LEADER_IP} --worker_idx ${worker_idx} --local_rank ${local_rank} --nodes_per_worker ${PREFILL_NODES_PER_WORKER} --worker_type prefill --gpu_utilization_log /logs/${node}_prefill_w${worker_idx}_gpu_utilization.log ${WORKER_ARGS}"
        echo "$cmd"
        $cmd &
    done
@@ -106,7 +249,7 @@ for worker_idx in $(seq 0 $((DECODE_WORKERS - 1))); do
    leader_node=${nodes[$leader_idx]}

    # Get leader IP for this worker group
-    LEADER_IP=$(srun --nodes=1 --ntasks=1 --nodelist=$leader_node ip route get $(getent ahosts $leader_node | grep STREAM | head -1 | awk '{print $1}') | awk '{for(i=1;i<=NF;i++) if($i=="src") print $(i+1)}')
+    LEADER_IP=$(srun --nodes=1 --ntasks=1 --nodelist=$leader_node ip addr show $NETWORK_INTERFACE | grep 'inet ' | awk '{print $2}' | cut -d'/' -f1)
    echo "Decode worker $worker_idx leader: $leader_node ($LEADER_IP)"

    # Launch all nodes for this worker
@@ -117,22 +260,50 @@ for worker_idx in $(seq 0 $((DECODE_WORKERS - 1))); do

        echo "Launching decode worker $worker_idx, node $global_node_idx (local_rank $local_rank): $node"

-        cmd="srun $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_decode_w${worker_idx}.out --error=${LOG_DIR}/${node}_decode_w${worker_idx}.err python /scripts/worker_setup.py --leader_ip ${LEADER_IP} --worker_idx ${worker_idx} --local_rank ${local_rank} --nodes_per_worker ${DECODE_NODES_PER_WORKER} --worker_type decode --gpu_utilization_log /logs/${node}_decode_w${worker_idx}_gpu_utilization.log ${WORKER_ARGS}"
+        cmd="srun --overlap $ENROOT_ARGS --nodes=1 --ntasks=1 --nodelist=$node --output=${LOG_DIR}/${node}_decode_w${worker_idx}.out --error=${LOG_DIR}/${node}_decode_w${worker_idx}.err python /scripts/worker_setup.py --leader_ip ${LEADER_IP} --worker_idx ${worker_idx} --local_rank ${local_rank} --nodes_per_worker ${DECODE_NODES_PER_WORKER} --worker_type decode --gpu_utilization_log /logs/${node}_decode_w${worker_idx}_gpu_utilization.log ${WORKER_ARGS}"
        echo "$cmd"
        $cmd &
    done
 done

 echo ""
+{% endraw %}
+{% if enable_multiple_frontends %}
+{% raw %}
+echo "Frontend available at: http://${NGINX_NODE}:8000"
+echo "To connect to the nginx node:"
+echo "srun $ENROOT_ARGS --jobid $SLURM_JOB_ID -w ${NGINX_NODE} --overlap --pty bash"
+echo "To connect to the master node (NATS/ETCD):"
+echo "srun $ENROOT_ARGS --jobid $SLURM_JOB_ID -w ${MASTER_NODE} --overlap --pty bash"
+{% endraw %}
+{% else %}
+{% raw %}
 echo "To connect to the host prefill node:"
 echo "srun $ENROOT_ARGS --jobid $SLURM_JOB_ID -w ${nodes[0]} --overlap --pty bash"
+{% endraw %}
+{% endif %}
+{% raw %}

 echo ""
 echo "Make sure to cancel the job at the end:"
 echo "scancel $SLURM_JOB_ID"

-# Wait for all tasks to complete
-wait
-echo "Script finished at $(date)"
+# Instead of waiting for all tasks to complete, wait for profile.sh to complete and then exit.

 {% endraw %}
+
+PROFILER_TYPE={{ profiler_type }}
+PROFILER_ARGS="{{ profiler_arg }}"
+
+{% if do_profile %}
+{% raw %}
+srun --nodes=1 --ntasks=1 $ENROOT_ARGS --jobid $SLURM_JOB_ID -w ${nodes[0]} --output=${LOG_DIR}/profile.out --error=${LOG_DIR}/profile.err --overlap bash /scripts/${PROFILER_TYPE}/bench.sh $PREFILL_WORKERS $DECODE_WORKERS ${PROFILER_ARGS} &
+{% endraw %}
+{% endif %}
+
+{% raw %}
+wait -n
+first_exit_code=$?
+echo "Script finished at $(date) with exit code ${first_exit_code}"
+exit $first_exit_code
+{% endraw %}
--- a/components/backends/sglang/slurm_jobs/scripts/benchmark_utils.sh
+++ b/components/backends/sglang/slurm_jobs/scripts/benchmark_utils.sh
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+wait_for_model() {
+
+    local model_host=$1
+    local model_port=$2
+    local poll=${3:-1}
+    local timeout=${4:-600}
+    local report_every=${5:-60}
+
+    local health_addr="http://${model_host}:${model_port}/health"
+    echo "Polling ${health_addr} every ${poll} seconds"
+
+    local start_ts=$(date +%s)
+    local report_ts=$(date +%s)
+
+    while :; do
+        curl_result=$(curl ${health_addr} 2>/dev/null)
+        health=$(grep '"status":"healthy"' <<< $curl_result)
+        if [[ -n $health ]]; then
+            echo "Model is alive. Health response: ${curl_result}; "
+            return 0;
+        fi
+
+        time_now=$(date +%s)
+        if [[ $((time_now - start_ts)) -ge $timeout ]]; then
+            echo "Model did not get healthy in ${timeout} seconds"
+            exit 2;
+        fi
+
+        if [[ $((time_now - report_ts)) -ge $report_every ]]; then
+            echo "Waiting for model to come alive. Current result: ${curl_result}"
+            report_ts=$time_now
+        fi
+
+        sleep $poll
+    done
+}
+
+warmup_model() {
+    service_host=$1
+    service_port=$2
+    served_model_name=$3
+    model_path=$4
+    config=$5
+
+    IFS='x' read -r -a config_list <<< "$config"
+    isl=${config_list[0]}
+    osl=${config_list[1]}
+    num_prompts=${config_list[2]}
+    concurrency=${config_list[3]}
+    request_rate=${config_list[4]}
+
+    command=(
+        python3 -m sglang.bench_serving
+        --base-url "http://${service_host}:${service_port}"
+        --model ${served_model_name} --tokenizer ${model_path}
+        --backend sglang-oai
+        --dataset-name random --random-input ${isl} --random-output ${osl}
+        --random-range-ratio 1
+        --num-prompts ${num_prompts} --request-rate ${request_rate} --max-concurrency ${concurrency}
+    )
+
+    echo "Config ${config}. Running command ${command[@]}"
+
+    ${command[@]}
+}
--- a/components/backends/sglang/slurm_jobs/scripts/gap/bench.sh
+++ b/components/backends/sglang/slurm_jobs/scripts/gap/bench.sh
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+prefill_workers=$1
+decode_workers=$2
+
+chosen_isl=$3
+chosen_osl=$4
+chosen_concurrencies=$5
+
+echo "Profiling for model with PrefillDP=${prefill_workers}, DecodeDP=${decode_workers}"
+
+head_node="localhost"
+head_port="8000"
+
+SERVED_MODEL_NAME="deepseek-ai/DeepSeek-R1"
+MODEL_PATH=/model/
+
+random_seed=$(python3 -c "import random; print(random.randint(0, 65535))")
+random_seed=$RANDOM
+echo "Chosen random seed ${random_seed}"
+
+source /scripts/benchmark_utils.sh
+
+wait_for_model $head_node $head_port 5 2400 60
+
+set -e
+warmup_model $head_node $head_port $SERVED_MODEL_NAME $MODEL_PATH "${chosen_isl}x${chosen_osl}x10000x10000x250"
+set +e
+
+genai_perf_warmup_workers=$(python3 -c "print(max(${DP:-0}, ${prefill_workers:-0}, ${decode_workers:-0}))")
+
+IFS='x' read -r -a concurrency_list <<< "$chosen_concurrencies"
+
+profile_folder="/logs/gap_isl_${chosen_isl}_osl_${chosen_osl}"
+mkdir -p $profile_folder
+
+tmp_work_dir=$(mktemp -d -t genai-perf-XXXXXXXX)
+for concurrency in ${concurrency_list[@]}; do
+    export_folder="${tmp_work_dir}/concurrency_${concurrency}"
+    mkdir -p $export_folder
+    export_model_name=${SERVED_MODEL_NAME//\//_}
+    export_file="${export_model_name}_generation_${concurrency}.json"
+
+    echo "Run benchmark for concurrency $concurrency; ISL $chosen_isl; OSL $chosen_osl"
+    command=(
+        genai-perf profile
+        -m ${SERVED_MODEL_NAME}
+        --tokenizer ${MODEL_PATH}
+        --endpoint-type chat
+        --endpoint /v1/chat/completions
+        --url "${head_node}:${head_port}"
+        --streaming
+
+        --concurrency ${concurrency}
+        --warmup-request-count $(( 2*genai_perf_warmup_workers ))
+        --request-count $(( 5*concurrency ))
+
+        --synthetic-input-tokens-mean ${chosen_isl} --synthetic-input-tokens-stddev 0
+        --output-tokens-mean ${chosen_osl} --output-tokens-stddev 0
+        --extra-inputs "max_tokens:${chosen_osl}" --extra-inputs "min_tokens:${chosen_osl}"
+
+        --artifact-dir ${export_folder}
+        --profile-export-file ${export_file}
+
+        --random-seed ${random_seed}
+
+        --tokenizer-trust-remote-code
+        --num-dataset-entries 3000
+        --
+        --max-threads ${concurrency}
+    )
+
+    set -e
+    ${command[@]}
+    set +e
+
+    cp $export_folder/*/*_genai_perf.json $profile_folder
+done
--- a/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh
+++ b/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh
@@ -57,11 +57,20 @@ if [ -z "$TOTAL_NODES" ]; then
    exit 1
 fi

+if [ -z "$USE_INIT_LOCATIONS" ]; then
+    echo "Error: USE_INIT_LOCATIONS environment variable is not set"
+    exit 1
+fi
+
 # Construct command based on mode
 if [ "$mode" = "prefill" ]; then
    # GB200 dynamo prefill command
+    set -x
+    # SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=2048 \
+
+    if [[ "${USE_INIT_LOCATIONS,,}" == "true" ]]; then command_suffix="--init-expert-location /configs/prefill_dsr1-0528_in1000out1000_num40000.json"; fi
+
    DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
-    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=2048 \
    MC_TE_METRIC=true \
    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE=100000 \
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=100000 \
@@ -87,12 +96,12 @@ if [ "$mode" = "prefill" ]; then
        --dp-size "$TOTAL_GPUS" \
        --enable-dp-attention \
        --host 0.0.0.0 \
-        --decode-log-interval 1 \
+        --decode-log-interval 1000 \
        --max-running-requests 12288 \
        --context-length 9600 \
        --disable-radix-cache \
        --enable-deepep-moe \
-        --deepep-mode low_latency \
+        --deepep-mode normal \
        --ep-dispatch-algorithm dynamic \
        --moe-dense-tp-size 1 \
        --enable-dp-lm-head \
@@ -101,15 +110,18 @@ if [ "$mode" = "prefill" ]; then
        --eplb-algorithm deepseek \
        --attention-backend cutlass_mla \
        --watchdog-timeout 1000000 \
-        --init-expert-location /configs/prefill_dsr1-0528_in1000out1000_num40000.json  \
        --disable-cuda-graph \
-        --chunked-prefill-size 16384 \
-        --max-total-tokens 65536 \
+        --chunked-prefill-size 131072 \
+        --max-total-tokens 524288 \
        --deepep-config /configs/deepep_config.json \
        --stream-interval 50 \
-        --log-level debug
+        --log-level debug ${command_suffix}

 elif [ "$mode" = "decode" ]; then
+    set -x
+    command_suffix=""
+    if [[ "${USE_INIT_LOCATIONS,,}" == "true" ]]; then command_suffix="--init-expert-location /configs/decode_dsr1-0528_loadgen_in1024out1024_num2000_2p12d.json"; fi
+
    # GB200 dynamo decode command
    DYN_SKIP_SGLANG_LOG_FORMATTING=1 \
    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=512 \
@@ -139,7 +151,7 @@ elif [ "$mode" = "decode" ]; then
        --dp-size "$TOTAL_GPUS" \
        --enable-dp-attention \
        --host 0.0.0.0 \
-        --decode-log-interval 1 \
+        --decode-log-interval 1000 \
        --max-running-requests 36864 \
        --context-length 9600 \
        --disable-radix-cache \
@@ -155,8 +167,7 @@ elif [ "$mode" = "decode" ]; then
        --eplb-algorithm deepseek \
        --attention-backend cutlass_mla \
        --watchdog-timeout 1000000 \
-        --init-expert-location /configs/decode_dsr1-0528_loadgen_in1024out1024_num2000_2p12d.json \
        --chunked-prefill-size 36864 \
        --stream-interval 50 \
-        --mem-fraction-static 0.82
+        --mem-fraction-static 0.82 ${command_suffix}
 fi
--- a/components/backends/sglang/slurm_jobs/scripts/nginx.conf.j2
+++ b/components/backends/sglang/slurm_jobs/scripts/nginx.conf.j2
+# Defines the group of servers to which NGINX will proxy requests.
+# NGINX will cycle through these servers in a round-robin fashion by default.
+worker_processes auto;
+
+http {
+    access_log off;
+    upstream backend_servers {
+        {% for frontend_host in frontend_hosts %}
+        server {{ frontend_host }}:8000;
+        {% endfor %}
+    }
+
+    # The main server block that listens for incoming traffic.
+    server {
+        listen 8000; # Listen on port 8000 for incoming HTTP requests.
+
+        location / {
+            # Pass the request to the upstream group defined above.
+            proxy_pass http://backend_servers;
+
+            proxy_buffering off;
+            proxy_read_timeout 24h;
+            proxy_send_timeout 24h;
+        }
+    }
+}
+
+events {
+
+    #
+    # Determines how many clients will be served by each worker process.
+    # (Max clients = worker_connections * worker_processes)
+    # Should be equal to `ulimit -n / worker_processes`
+    #
+    worker_connections 65535;
+
+    #
+    # Let each process accept multiple connections.
+    # Accept as many connections as possible, after nginx gets notification
+    # about a new connection.
+    # May flood worker_connections, if that option is set too low.
+    #
+    multi_accept on;
+
+    #
+    # Preferred connection method for newer linux versions.
+    # Essential for linux, optmized to serve many clients with each thread.
+    #
+    use epoll;
+}
--- a/components/backends/sglang/slurm_jobs/scripts/sglang/bench.sh
+++ b/components/backends/sglang/slurm_jobs/scripts/sglang/bench.sh
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+chosen_isl=$3
+chosen_osl=$4
+
+concurrency_list=$5
+IFS='x' read -r -a chosen_concurrencies <<< "$concurrency_list"
+chosen_req_rate=$6
+
+echo "Config ${chosen_isl}; ${chosen_osl}; ${chosen_concurrencies[@]}; ${chosen_req_rate}"
+
+head_node="localhost"
+head_port="8000"
+
+SERVED_MODEL_NAME="deepseek-ai/DeepSeek-R1"
+MODEL_PATH=/model/
+
+source /scripts/benchmark_utils.sh
+
+wait_for_model $head_node $head_port 5 2400 60
+
+sleep 300
+
+set -e
+warmup_model $head_node $head_port $SERVED_MODEL_NAME $MODEL_PATH "${chosen_isl}x${chosen_osl}x10000x10000x${chosen_req_rate}"
+set +e
+
+profile_folder="/logs/sglang_isl_${chosen_isl}_osl_${chosen_osl}"
+mkdir -p $profile_folder
+
+for max_concurrency in ${chosen_concurrencies[@]}; do
+
+    chosen_n_requests=$((5*max_concurrency))
+
+    export_file="${profile_folder}/concurrency_${max_concurrency}_req_rate_${chosen_req_rate}.json"
+
+    command=(
+        python3 -m sglang.bench_serving
+        --base-url "http://${head_node}:${head_port}"
+        --model ${SERVED_MODEL_NAME} --tokenizer ${MODEL_PATH}
+        --backend sglang-oai
+        --dataset-name random --random-input ${chosen_isl} --random-output ${chosen_osl}
+        --random-range-ratio 1
+        --num-prompts ${chosen_n_requests} --request-rate ${chosen_req_rate} --max-concurrency ${max_concurrency}
+        --output-file $export_file
+    )
+
+    echo "Running command ${command[@]}"
+
+    ${command[@]}
+
+    echo "-----------------------------------------"
+done
--- a/components/backends/sglang/slurm_jobs/scripts/sglang_bench_serving.sh
+++ b/components/backends/sglang/slurm_jobs/scripts/sglang_bench_serving.sh
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+### Benchmark configuration and setup
+# Benchmarking script setup - ISL/OSL/concurrencies/request_rate
+chosen_isl=1024
+chosen_osl=1024
+chosen_req_rate=250
+chosen_concurrencies=(2 10 20 50 100 200 500 1000 2000 2500 3000 3500 4000 4500 5000 7500 10000 12500 15000 16250 17500 18750 20000)
+
+# Model config setup - frontend URL, model name, and path
+head_node="localhost"
+head_port="8000"
+SERVED_MODEL_NAME="deepseek-ai/DeepSeek-R1"
+MODEL_PATH=/model/
+
+# This file contains `wait_for_model` and `warmup_model`
+source /scripts/benchmark_utils.sh
+
+### Benchmark runs
+# 1. wait for model to come alive - `wait_for_model`
+# 2. warms up the model - `warmup_model`
+# 3. benchmark model - for concurrency in concurrencies; do <benchmark script>; done
+wait_for_model $head_node $head_port 5 2400 60
+
+set -e
+warmup_model $head_node $head_port $SERVED_MODEL_NAME $MODEL_PATH "${chosen_isl}x${chosen_osl}x10000x10000x${chosen_req_rate}"
+set +e
+
+for max_concurrency in ${chosen_concurrencies[@]}; do
+
+    chosen_n_requests=$((5*max_concurrency))
+
+    command=(
+        python3 -m sglang.bench_serving
+        --base-url "http://${head_node}:${head_port}"
+        --model ${SERVED_MODEL_NAME} --tokenizer ${MODEL_PATH}
+        --backend sglang-oai
+        --dataset-name random --random-input ${chosen_isl} --random-output ${chosen_osl}
+        --random-range-ratio 1
+        --num-prompts ${chosen_n_requests} --request-rate ${chosen_req_rate} --max-concurrency ${max_concurrency}
+    )
+
+    echo "Running command ${command[@]}"
+
+    ${command[@]}
+
+    echo "-----------------------------------------"
+done
+
--- a/components/backends/sglang/slurm_jobs/scripts/vllm/backend_request_func.py
+++ b/components/backends/sglang/slurm_jobs/scripts/vllm/backend_request_func.py
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+# pytest: skip-file
+
+import json
+import os
+import sys
+import time
+import traceback
+from dataclasses import dataclass, field
+from typing import List, Optional, Union
+
+import aiohttp
+import huggingface_hub.constants
+from tqdm.asyncio import tqdm
+from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
+
+AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
+
+
+@dataclass
+class RequestFuncInput:
+    prompt: str
+    api_url: str
+    prompt_len: int
+    output_len: int
+    model: str
+    model_name: Optional[str] = None
+    best_of: int = 1
+    logprobs: Optional[int] = None
+    extra_body: Optional[dict] = None
+    multi_modal_content: Optional[dict] = None
+    ignore_eos: bool = False
+
+
+@dataclass
+class RequestFuncOutput:
+    generated_text: str = ""
+    success: bool = False
+    latency: float = 0.0
+    # output_tokens: int = 0
+    output_tokens: Optional[int] = None
+    ttft: float = 0.0  # Time to first token
+    itl: List[float] = field(default_factory=list)  # List of inter-token latencies
+    tpot: float = 0.0  # avg next-token latencies
+    prompt_len: int = 0
+    error: str = ""
+
+
+async def async_request_tgi(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    api_url = request_func_input.api_url
+    assert api_url.endswith("generate_stream")
+
+    async with aiohttp.ClientSession(
+        trust_env=True, timeout=AIOHTTP_TIMEOUT
+    ) as session:
+        params = {
+            "best_of": request_func_input.best_of,
+            "max_new_tokens": request_func_input.output_len,
+            "do_sample": True,
+            "temperature": 0.01,  # TGI does not accept 0.0 temperature.
+            "top_p": 0.99,  # TGI does not accept 1.0 top_p.
+            "truncate": request_func_input.prompt_len,
+            # TGI does not accept ignore_eos flag.
+        }
+        payload = {
+            "inputs": request_func_input.prompt,
+            "parameters": params,
+        }
+        output = RequestFuncOutput()
+        output.prompt_len = request_func_input.prompt_len
+
+        ttft = 0.0
+        st = time.perf_counter()
+        most_recent_timestamp = st
+        try:
+            async with session.post(url=api_url, json=payload) as response:
+                if response.status == 200:
+                    async for chunk_bytes in response.content:
+                        chunk_bytes = chunk_bytes.strip()
+                        if not chunk_bytes:
+                            continue
+                        chunk_bytes = chunk_bytes.decode("utf-8")
+
+                        # NOTE: Sometimes TGI returns a ping response without
+                        # any data, we should skip it.
+                        if chunk_bytes.startswith(":"):
+                            continue
+                        chunk = chunk_bytes.removeprefix("data:")
+
+                        data = json.loads(chunk)
+                        timestamp = time.perf_counter()
+                        # First token
+                        if ttft == 0.0:
+                            ttft = time.perf_counter() - st
+                            output.ttft = ttft
+
+                        # Decoding phase
+                        else:
+                            output.itl.append(timestamp - most_recent_timestamp)
+
+                        most_recent_timestamp = timestamp
+
+                    output.latency = most_recent_timestamp - st
+                    output.success = True
+                    output.generated_text = data["generated_text"]
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+        if pbar:
+            pbar.update(1)
+        return output
+
+
+async def async_request_trt_llm(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    api_url = request_func_input.api_url
+    assert api_url.endswith("generate_stream")
+
+    async with aiohttp.ClientSession(
+        trust_env=True, timeout=AIOHTTP_TIMEOUT
+    ) as session:
+        assert request_func_input.best_of == 1
+        payload = {
+            "accumulate_tokens": True,
+            "text_input": request_func_input.prompt,
+            "temperature": 0.0,
+            "top_p": 1.0,
+            "max_tokens": request_func_input.output_len,
+            "stream": True,
+        }
+        if request_func_input.ignore_eos:
+            payload["min_length"] = request_func_input.output_len
+        output = RequestFuncOutput()
+        output.prompt_len = request_func_input.prompt_len
+
+        ttft = 0.0
+        st = time.perf_counter()
+        most_recent_timestamp = st
+        try:
+            async with session.post(url=api_url, json=payload) as response:
+                if response.status == 200:
+                    async for chunk_bytes in response.content:
+                        chunk_bytes = chunk_bytes.strip()
+                        if not chunk_bytes:
+                            continue
+
+                        chunk = chunk_bytes.decode("utf-8").removeprefix("data:")
+
+                        data = json.loads(chunk)
+                        output.generated_text += data["text_output"]
+                        timestamp = time.perf_counter()
+                        # First token
+                        if ttft == 0.0:
+                            ttft = timestamp - st
+                            output.ttft = ttft
+
+                        # Decoding phase
+                        else:
+                            output.itl.append(timestamp - most_recent_timestamp)
+
+                        most_recent_timestamp = timestamp
+
+                    output.latency = most_recent_timestamp - st
+                    output.success = True
+
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+        if pbar:
+            pbar.update(1)
+        return output
+
+
+async def async_request_deepspeed_mii(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    async with aiohttp.ClientSession(
+        trust_env=True, timeout=AIOHTTP_TIMEOUT
+    ) as session:
+        assert request_func_input.best_of == 1
+
+        payload = {
+            "prompt": request_func_input.prompt,
+            "max_tokens": request_func_input.output_len,
+            "temperature": 0.01,  # deepspeed-mii does not accept 0.0 temp.
+            "top_p": 1.0,
+        }
+        output = RequestFuncOutput()
+        output.prompt_len = request_func_input.prompt_len
+
+        # NOTE: DeepSpeed-MII doesn't support streaming as of Jan 28 2024,
+        # will use 0 as placeholder.
+        # See https://github.com/microsoft/DeepSpeed-MII/pull/311
+        output.ttft = 0
+
+        st = time.perf_counter()
+        try:
+            async with session.post(
+                url=request_func_input.api_url, json=payload
+            ) as response:
+                if response.status == 200:
+                    parsed_resp = await response.json()
+                    output.latency = time.perf_counter() - st
+                    output.generated_text = parsed_resp["text"][0]
+                    output.success = True
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+        if pbar:
+            pbar.update(1)
+        return output
+
+
+async def async_request_openai_completions(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    api_url = request_func_input.api_url
+    assert api_url.endswith(
+        ("completions", "profile")
+    ), "OpenAI Completions API URL must end with 'completions' or 'profile'."
+
+    async with aiohttp.ClientSession(
+        trust_env=True, timeout=AIOHTTP_TIMEOUT
+    ) as session:
+        payload = {
+            "model": request_func_input.model_name
+            if request_func_input.model_name
+            else request_func_input.model,
+            "prompt": request_func_input.prompt,
+            "temperature": 0.0,
+            "best_of": request_func_input.best_of,
+            "max_tokens": request_func_input.output_len,
+            "logprobs": request_func_input.logprobs,
+            "stream": True,
+            "stream_options": {
+                "include_usage": True,
+            },
+        }
+        if request_func_input.ignore_eos:
+            payload["ignore_eos"] = request_func_input.ignore_eos
+        if request_func_input.extra_body:
+            payload.update(request_func_input.extra_body)
+        headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
+
+        output = RequestFuncOutput()
+        output.prompt_len = request_func_input.prompt_len
+
+        generated_text = ""
+        st = time.perf_counter()
+        most_recent_timestamp = st
+        try:
+            async with session.post(
+                url=api_url, json=payload, headers=headers
+            ) as response:
+                if response.status == 200:
+                    first_chunk_received = False
+                    async for chunk_bytes in response.content:
+                        chunk_bytes = chunk_bytes.strip()
+                        if not chunk_bytes:
+                            continue
+
+                        chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
+                        if chunk != "[DONE]":
+                            data = json.loads(chunk)
+
+                            # NOTE: Some completion API might have a last
+                            # usage summary response without a token so we
+                            # want to check a token was generated
+                            if choices := data.get("choices"):
+                                # Note that text could be empty here
+                                # e.g. for special tokens
+                                text = choices[0].get("text")
+                                timestamp = time.perf_counter()
+                                # First token
+                                if not first_chunk_received:
+                                    first_chunk_received = True
+                                    ttft = time.perf_counter() - st
+                                    output.ttft = ttft
+
+                                # Decoding phase
+                                else:
+                                    output.itl.append(timestamp - most_recent_timestamp)
+
+                                most_recent_timestamp = timestamp
+                                generated_text += text or ""
+                            elif usage := data.get("usage"):
+                                output.output_tokens = usage.get("completion_tokens")
+                    if first_chunk_received:
+                        output.success = True
+                    else:
+                        output.success = False
+                        output.error = (
+                            "Never received a valid chunk to calculate TTFT."
+                            "This response will be marked as failed!"
+                        )
+                    output.generated_text = generated_text
+                    output.latency = most_recent_timestamp - st
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+    if pbar:
+        pbar.update(1)
+    return output
+
+
+async def async_request_openai_chat_completions(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    api_url = request_func_input.api_url
+    assert api_url.endswith(
+        "chat/completions"
+    ), "OpenAI Chat Completions API URL must end with 'chat/completions'."
+
+    async with aiohttp.ClientSession(
+        trust_env=True, timeout=AIOHTTP_TIMEOUT
+    ) as session:
+        content = [{"type": "text", "text": request_func_input.prompt}]
+        if request_func_input.multi_modal_content:
+            content.append(request_func_input.multi_modal_content)
+        payload = {
+            "model": request_func_input.model_name
+            if request_func_input.model_name
+            else request_func_input.model,
+            "messages": [
+                {"role": "user", "content": content},
+            ],
+            "temperature": 0.0,
+            "max_completion_tokens": request_func_input.output_len,
+            "stream": True,
+            "stream_options": {
+                "include_usage": True,
+            },
+        }
+        if request_func_input.ignore_eos:
+            payload["ignore_eos"] = request_func_input.ignore_eos
+        if request_func_input.extra_body:
+            payload.update(request_func_input.extra_body)
+        headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
+        }
+
+        output = RequestFuncOutput()
+        output.prompt_len = request_func_input.prompt_len
+
+        generated_text = ""
+        ttft = 0.0
+        st = time.perf_counter()
+        most_recent_timestamp = st
+        try:
+            async with session.post(
+                url=api_url, json=payload, headers=headers
+            ) as response:
+                if response.status == 200:
+                    async for chunk_bytes in response.content:
+                        chunk_bytes = chunk_bytes.strip()
+                        if not chunk_bytes:
+                            continue
+
+                        chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
+                        if chunk != "[DONE]":
+                            timestamp = time.perf_counter()
+                            data = json.loads(chunk)
+
+                            if choices := data.get("choices"):
+                                content = choices[0]["delta"].get("content")
+                                # First token
+                                if ttft == 0.0:
+                                    ttft = timestamp - st
+                                    output.ttft = ttft
+
+                                # Decoding phase
+                                else:
+                                    output.itl.append(timestamp - most_recent_timestamp)
+
+                                generated_text += content or ""
+                            elif usage := data.get("usage"):
+                                output.output_tokens = usage.get("completion_tokens")
+
+                            most_recent_timestamp = timestamp
+
+                    output.generated_text = generated_text
+                    output.success = True
+                    output.latency = most_recent_timestamp - st
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+    if pbar:
+        pbar.update(1)
+    return output
+
+
+def get_model(pretrained_model_name_or_path: str) -> str:
+    if os.getenv("VLLM_USE_MODELSCOPE", "False").lower() == "true":
+        from modelscope import snapshot_download
+
+        model_path = snapshot_download(
+            model_id=pretrained_model_name_or_path,
+            local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
+            ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"],
+        )
+
+        return model_path
+    return pretrained_model_name_or_path
+
+
+def get_tokenizer(
+    pretrained_model_name_or_path: str,
+    tokenizer_mode: str = "auto",
+    trust_remote_code: bool = False,
+    **kwargs,
+) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
+    if pretrained_model_name_or_path is not None and not os.path.exists(
+        pretrained_model_name_or_path
+    ):
+        pretrained_model_name_or_path = get_model(pretrained_model_name_or_path)
+    if tokenizer_mode == "slow":
+        if kwargs.get("use_fast", False):
+            raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.")
+        kwargs["use_fast"] = False
+    if tokenizer_mode == "mistral":
+        try:
+            from vllm.transformers_utils.tokenizer import MistralTokenizer
+        except ImportError as e:
+            raise ImportError(
+                "MistralTokenizer requires vllm package.\n"
+                "Please install it with `pip install vllm` "
+                "to use mistral tokenizer mode."
+            ) from e
+        return MistralTokenizer.from_pretrained(str(pretrained_model_name_or_path))
+    else:
+        return AutoTokenizer.from_pretrained(
+            pretrained_model_name_or_path,
+            trust_remote_code=trust_remote_code,
+            **kwargs,
+        )
+
+
+async def async_request_dynamo_chat_completions(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    api_url = request_func_input.api_url
+    assert api_url.endswith(
+        "chat/completions"
+    ), "Dynamo Chat API URL must end with 'chat/completions'."
+
+    headers = {"Content-Type": "application/json"}
+    user_content = request_func_input.prompt
+    if request_func_input.multi_modal_content:
+        pass
+    payload = {
+        "model": (
+            request_func_input.model_name
+            if request_func_input.model_name
+            else request_func_input.model
+        ),
+        "messages": [{"role": "user", "content": user_content}],
+        "temperature": 0.0,  # keep deterministic for benchmarks unless you want entropy
+        "max_tokens": request_func_input.output_len,
+        "stream": True,
+    }
+    if request_func_input.ignore_eos:
+        payload["ignore_eos"] = True  # if unsupported, we’ll retry w/o it below
+
+    output = RequestFuncOutput(prompt_len=request_func_input.prompt_len)
+
+    async with aiohttp.ClientSession(
+        trust_env=True, timeout=AIOHTTP_TIMEOUT
+    ) as session:
+        st = time.perf_counter()
+        most_recent_timestamp = st
+        ttft = 0.0
+        got_stream_chunk = False
+        parts: list[str] = []
+
+        async def _do_request(json_payload):
+            return await session.post(url=api_url, json=json_payload, headers=headers)
+
+        try:
+            # First try with the requested flags
+            response = await _do_request(payload)
+
+            # If server errors, fall back progressively by removing the usual culprits.
+            if response.status >= 500:
+                # remove ignore_eos
+                if "ignore_eos" in payload:
+                    payload.pop("ignore_eos", None)
+                    response = await _do_request(payload)
+            if response.status >= 500:
+                # disable streaming
+                payload["stream"] = False
+                response = await _do_request(payload)
+
+            # --- STREAMING (SSE) ---
+            if response.status == 200 and response.headers.get(
+                "content-type", ""
+            ).startswith("text/event-stream"):
+                async for chunk_bytes in response.content:
+                    chunk_bytes = chunk_bytes.strip()
+                    if not chunk_bytes:
+                        continue
+                    s = chunk_bytes.decode("utf-8").lstrip()
+                    if s.startswith(":"):  # ping/keepalive
+                        continue
+                    if s.startswith("data:"):
+                        s = s.partition("data:")[2].strip()
+                    if s == "[DONE]":
+                        continue
+
+                    data = json.loads(s)
+                    timestamp = time.perf_counter()
+
+                    if choices := data.get("choices"):
+                        got_stream_chunk = True
+                        delta = choices[0].get("delta", {})
+                        # preserve order; keep <think> intact
+                        rc = delta.get("reasoning_content")
+                        if rc is not None:
+                            parts.append(rc)
+                        c = delta.get("content")
+                        if c is not None:
+                            parts.append(c)
+
+                        if ttft == 0.0:
+                            ttft = timestamp - st
+                            output.ttft = ttft
+                        else:
+                            output.itl.append(timestamp - most_recent_timestamp)
+                        most_recent_timestamp = timestamp
+
+                    elif usage := data.get("usage"):
+                        output.output_tokens = usage.get(
+                            "completion_tokens", output.output_tokens
+                        )
+
+                output.generated_text = "".join(parts)
+                output.latency = (
+                    most_recent_timestamp if got_stream_chunk else time.perf_counter()
+                ) - st
+                output.success = got_stream_chunk
+                if not got_stream_chunk:
+                    # capture error body if available
+                    try:
+                        output.error = await response.text()
+                    except Exception:
+                        output.error = "No stream token chunks received."
+
+            # --- NON-STREAMING JSON ---
+            elif response.status == 200:
+                body = await response.json()
+                ch0 = (body.get("choices") or [{}])[0]
+                msg = ch0.get("message") or {}
+
+                # Prefer reasoning + content from message
+                if msg.get("reasoning_content") is not None:
+                    parts.append(msg["reasoning_content"])
+                if msg.get("content") is not None:
+                    parts.append(msg["content"])
+
+                # Fallbacks (some implementations put text/rc at choice-level)
+                if not parts:
+                    if ch0.get("reasoning_content") is not None:
+                        parts.append(ch0["reasoning_content"])
+                    if ch0.get("text") is not None:
+                        parts.append(ch0["text"])
+
+                output.generated_text = "".join(parts)
+                output.latency = time.perf_counter() - st
+                output.ttft = 0.0
+                if usage := body.get("usage"):
+                    output.output_tokens = usage.get("completion_tokens", 0)
+                output.success = True
+
+            else:
+                # Better error visibility for your “Initial test run failed”
+                output.success = False
+                output.error = f"HTTP {response.status}: " + (await response.text())
+
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+    if pbar:
+        pbar.update(1)
+    return output
+
+
+ASYNC_REQUEST_FUNCS = {
+    "tgi": async_request_tgi,
+    "vllm": async_request_openai_completions,
+    "lmdeploy": async_request_openai_completions,
+    "deepspeed-mii": async_request_deepspeed_mii,
+    "openai": async_request_openai_completions,
+    "openai-chat": async_request_openai_chat_completions,
+    "tensorrt-llm": async_request_trt_llm,
+    "scalellm": async_request_openai_completions,
+    "sglang": async_request_openai_completions,
+    "dynamo": async_request_dynamo_chat_completions,
+}
--- a/components/backends/sglang/slurm_jobs/scripts/vllm/bench.sh
+++ b/components/backends/sglang/slurm_jobs/scripts/vllm/bench.sh
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Example script adapted from https://github.com/kedarpotdar-nv/bench_serving/tree/dynamo-fix.
+
+model_name="deepseek-ai/DeepSeek-R1"
+model_path="/model/"
+head_node="localhost"
+head_port=8000
+
+source /scripts/benchmark_utils.sh
+work_dir="/scripts/vllm/"
+cd $work_dir
+
+chosen_isl=$3
+chosen_osl=$4
+concurrency_list=$5
+IFS='x' read -r -a chosen_concurrencies <<< "$concurrency_list"
+chosen_req_rate=$6
+
+echo "Config ${chosen_isl}; ${chosen_osl}; ${chosen_concurrencies[@]}; ${chosen_req_rate}"
+
+wait_for_model $head_node $head_port 5 2400 60
+
+set -e
+warmup_model $head_node $head_port $model_name $model_path "${chosen_isl}x${chosen_osl}x10000x10000x${chosen_req_rate}"
+set +e
+
+result_dir="/logs/vllm_isl_${chosen_isl}_osl_${chosen_osl}"
+mkdir -p $result_dir
+
+set -e
+for concurrency in "${chosen_concurrencies[@]}"
+do
+    num_prompts=$((concurrency * 5))
+    echo "Running benchmark with concurrency: $concurrency and num-prompts: $num_prompts, writing to file ${result_dir}"
+    result_filename="isl_${chosen_isl}_osl_${chosen_osl}_concurrency_${concurrency}_req_rate_${chosen_req_rate}.json"
+
+    set -x
+    python3 benchmark_serving.py \
+        --model ${model_name} --tokenizer ${model_path} \
+        --host $head_node --port $head_port \
+        --backend "dynamo" --endpoint /v1/chat/completions \
+        --disable-tqdm \
+        --dataset-name random \
+        --num-prompts "$num_prompts" \
+        --random-input-len $chosen_isl \
+        --random-output-len $chosen_osl \
+        --random-range-ratio 0.8 \
+        --ignore-eos \
+        --request-rate ${chosen_req_rate} \
+        --percentile-metrics ttft,tpot,itl,e2el \
+        --max-concurrency "$concurrency" \
+        --save-result --result-dir $result_dir --result-filename $result_filename
+    set +x
+
+    echo "Completed benchmark with concurrency: $concurrency"
+    echo "-----------------------------------------"
+done
+set +e
--- a/components/backends/sglang/slurm_jobs/scripts/vllm/benchmark_serving.py
+++ b/components/backends/sglang/slurm_jobs/scripts/vllm/benchmark_serving.py
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+# pytest: skip-file
+
+r"""Benchmark online serving throughput.
+
+On the server side, run one of the following commands:
+    vLLM OpenAI API server
+    vllm serve <your_model> \
+        --swap-space 16 \
+        --disable-log-requests
+
+    (TGI backend)
+    ./launch_tgi_server.sh <your_model> <max_batch_total_tokens>
+
+On the client side, run:
+    python benchmarks/benchmark_serving.py \
+        --backend <backend> \
+        --model <your_model> \
+        --dataset-name sharegpt \
+        --dataset-path <path to dataset> \
+        --request-rate <request_rate> \ # By default <request_rate> is inf
+        --num-prompts <num_prompts> # By default <num_prompts> is 1000
+
+    when using tgi backend, add
+        --endpoint /generate_stream
+    to the end of the command above.
+"""
+import argparse
+import asyncio
+import base64
+import gc
+import io
+import json
+import os
+import random
+import time
+import warnings
+from dataclasses import dataclass
+from datetime import datetime
+from typing import Any, AsyncGenerator, Collection, Dict, List, Optional, Tuple
+
+import numpy as np
+import pandas as pd
+from backend_request_func import (
+    ASYNC_REQUEST_FUNCS,
+    RequestFuncInput,
+    RequestFuncOutput,
+)
+from datasets import load_dataset
+from PIL.Image import Image
+from tqdm.asyncio import tqdm
+from transformers import PreTrainedTokenizerBase
+
+try:
+    from vllm.transformers_utils.tokenizer import get_tokenizer
+except ImportError:
+    from backend_request_func import get_tokenizer
+
+try:
+    from vllm.utils import FlexibleArgumentParser
+except ImportError:
+    from argparse import ArgumentParser as FlexibleArgumentParser
+
+from benchmark_utils import convert_to_pytorch_benchmark_format
+
+MILLISECONDS_TO_SECONDS_CONVERSION = 1000
+
+
+@dataclass
+class BenchmarkMetrics:
+    completed: int
+    total_input: int
+    total_output: int
+    request_throughput: float
+    request_goodput: float
+    output_throughput: float
+    total_token_throughput: float
+    mean_ttft_ms: float
+    median_ttft_ms: float
+    std_ttft_ms: float
+    percentiles_ttft_ms: List[Tuple[float, float]]
+    mean_tpot_ms: float
+    median_tpot_ms: float
+    std_tpot_ms: float
+    percentiles_tpot_ms: List[Tuple[float, float]]
+    mean_itl_ms: float
+    median_itl_ms: float
+    std_itl_ms: float
+    percentiles_itl_ms: List[Tuple[float, float]]
+    # E2EL stands for end-to-end latency per request.
+    # It is the time taken on the client side from sending
+    # a request to receiving a complete response.
+    mean_e2el_ms: float
+    median_e2el_ms: float
+    std_e2el_ms: float
+    percentiles_e2el_ms: List[Tuple[float, float]]
+
+
+def sample_sharegpt_requests(
+    dataset_path: str,
+    num_requests: int,
+    tokenizer: PreTrainedTokenizerBase,
+    fixed_output_len: Optional[int] = None,
+) -> List[Tuple[str, int, int, None]]:
+    # Load the dataset.
+    with open(dataset_path, encoding="utf-8") as f:
+        dataset = json.load(f)
+    # Filter out the conversations with less than 2 turns.
+    dataset = [data for data in dataset if len(data["conversations"]) >= 2]
+    # Only keep the first two turns of each conversation.
+    dataset = [
+        (data["conversations"][0]["value"], data["conversations"][1]["value"])
+        for data in dataset
+    ]
+
+    # Shuffle the dataset.
+    random.shuffle(dataset)
+
+    # Filter out sequences that are too long or too short
+    filtered_dataset: List[Tuple[str, int, int]] = []
+    for i in range(len(dataset)):
+        if len(filtered_dataset) == num_requests:
+            break
+
+        # Tokenize the prompts and completions.
+        prompt = dataset[i][0]
+        prompt_token_ids = tokenizer(prompt).input_ids
+        completion = dataset[i][1]
+        completion_token_ids = tokenizer(completion).input_ids
+        prompt_len = len(prompt_token_ids)
+        output_len = (
+            len(completion_token_ids) if fixed_output_len is None else fixed_output_len
+        )
+        if prompt_len < 4 or (fixed_output_len is None and output_len < 4):
+            # Prune too short sequences.
+            continue
+        if prompt_len > 1024 or prompt_len + output_len > 2048:
+            # Prune too long sequences.
+            continue
+        filtered_dataset.append((prompt, prompt_len, output_len, None))
+
+    return filtered_dataset
+
+
+def sample_burstgpt_requests(
+    dataset_path: str,
+    num_requests: int,
+    random_seed: int,
+    tokenizer: PreTrainedTokenizerBase,
+) -> List[Tuple[str, int, int, None]]:
+    df = pd.read_csv(dataset_path)
+    gpt4_df = df[df["Model"] == "GPT-4"]
+    # Remove the failed requests (i.e., response length is 0)
+    gpt4_df = gpt4_df[gpt4_df["Response tokens"] > 0]
+    # Randomly sample num_requests from the dataset
+    if num_requests <= len(gpt4_df):
+        gpt4_df = gpt4_df.sample(n=num_requests, random_state=random_seed)
+    else:
+        gpt4_df = gpt4_df.sample(n=num_requests, random_state=random_seed, replace=True)
+    # Convert the dataframe to a list of tuples
+    dataset = gpt4_df.values.tolist()
+    input_requests = []
+    for i in range(num_requests):
+        input_len = int(dataset[i][2])
+        output_len = int(dataset[i][3])
+        prompt = tokenizer.decode(
+            [(i + j) % tokenizer.vocab_size for j in range(input_len)]
+        )
+        input_requests.append((prompt, input_len, output_len, None))
+    return input_requests
+
+
+def sample_sonnet_requests(
+    dataset_path: str,
+    num_requests: int,
+    input_len: int,
+    output_len: int,
+    prefix_len: int,
+    tokenizer: PreTrainedTokenizerBase,
+) -> List[Tuple[str, str, int, int, None]]:
+    assert (
+        input_len > prefix_len
+    ), "'args.sonnet-input-len' must be greater than 'args.prefix-input-len'."
+
+    # Load the dataset.
+    with open(dataset_path, encoding="utf-8") as f:
+        poem_lines = f.readlines()
+
+    # Tokenize the poem lines.
+    poem_token_ids = tokenizer(poem_lines).input_ids
+    average_poem_len = sum(len(token_ids) for token_ids in poem_token_ids) / len(
+        poem_token_ids
+    )
+
+    # Base prefix for all requests.
+    base_prompt = "Pick as many lines as you can from these poem lines:\n"
+    base_message = [
+        {
+            "role": "user",
+            "content": base_prompt,
+        }
+    ]
+    base_prompt_formatted = tokenizer.apply_chat_template(
+        base_message, add_generation_prompt=True, tokenize=False
+    )
+    base_prompt_offset = len(tokenizer(base_prompt_formatted).input_ids)
+
+    assert (
+        input_len > base_prompt_offset
+    ), f"Please set 'args.sonnet-input-len' higher than {base_prompt_offset}."
+    num_input_lines = round((input_len - base_prompt_offset) / average_poem_len)
+
+    # First approximately `prefix_len` number of tokens in the
+    # prompt are fixed poem lines.
+    assert (
+        prefix_len > base_prompt_offset
+    ), f"Please set 'args.sonnet-prefix-len' higher than {base_prompt_offset}."
+
+    num_prefix_lines = round((prefix_len - base_prompt_offset) / average_poem_len)
+    prefix_lines = poem_lines[:num_prefix_lines]
+
+    # Sample the rest of lines per request.
+    sampled_requests: List[Tuple[str, int, int]] = []
+    for _ in range(num_requests):
+        num_lines_needed = num_input_lines - num_prefix_lines
+        sampled_lines = "".join(
+            prefix_lines + random.choices(poem_lines, k=num_lines_needed)
+        )
+
+        prompt = f"{base_prompt}{sampled_lines}"
+        message = [
+            {
+                "role": "user",
+                "content": prompt,
+            },
+        ]
+        prompt_formatted = tokenizer.apply_chat_template(
+            message, add_generation_prompt=True, tokenize=False
+        )
+        prompt_len = len(tokenizer(prompt_formatted).input_ids)
+        sampled_requests.append(
+            (prompt, prompt_formatted, prompt_len, output_len, None)
+        )
+
+    return sampled_requests
+
+
+def sample_vision_arena_requests(
+    dataset,
+    num_requests: int,
+    tokenizer: PreTrainedTokenizerBase,
+    fixed_output_len: Optional[int] = None,
+) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]:
+    sampled_requests: List[Tuple[str, int, int, Dict[str, Collection[str]]]] = []
+    for data in dataset:
+        if len(sampled_requests) == num_requests:
+            break
+
+        prompt = data["turns"][0][0]["content"]
+
+        prompt_token_ids = tokenizer(prompt).input_ids
+        if fixed_output_len is None:
+            # Default max output len is set to 128
+            print("--hf-output-len is not provided. Using default value 128.")
+            fixed_output_len = 128
+
+        prompt_len = len(prompt_token_ids)
+        output_len = fixed_output_len
+
+        assert isinstance(data["images"][0], Image), (
+            "Input image format must be `PIL.Image.Image`, "
+            f"given {type(data['image'])}."
+        )
+        image: Image = data["images"][0]
+        image = image.convert("RGB")
+        image_data = io.BytesIO()
+        image.save(image_data, format="JPEG")
+        image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8")
+        mm_content = {
+            "type": "image_url",
+            "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"},
+        }
+
+        sampled_requests.append((prompt, prompt_len, output_len, mm_content))
+
+    return sampled_requests
+
+
+def sample_hf_requests(
+    dataset_path: str,
+    dataset_subset: Optional[str],
+    dataset_split: str,
+    num_requests: int,
+    tokenizer: PreTrainedTokenizerBase,
+    random_seed: int,
+    fixed_output_len: Optional[int] = None,
+) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]:
+    # Special case for vision_arena dataset
+    if dataset_path == "lmarena-ai/vision-arena-bench-v0.1" and dataset_subset is None:
+        assert dataset_split == "train"
+        dataset = load_dataset(
+            dataset_path, name=dataset_subset, split=dataset_split, streaming=True
+        )
+        dataset = dataset.shuffle(seed=random_seed)
+        return sample_vision_arena_requests(
+            dataset, num_requests, tokenizer, fixed_output_len
+        )
+
+    dataset = load_dataset(
+        dataset_path, name=dataset_subset, split=dataset_split, streaming=True
+    )
+    assert (
+        "conversations" in dataset.features
+    ), "HF Dataset must have 'conversations' column."
+    filter_func = lambda x: len(x["conversations"]) >= 2  # noqa: E731
+    filtered_dataset = dataset.shuffle(seed=random_seed).filter(filter_func)
+    sampled_requests: List[Tuple[str, int, int, Dict[str, Collection[str]]]] = []
+    for data in filtered_dataset:
+        if len(sampled_requests) == num_requests:
+            break
+
+        # Tokenize the prompts and completions.
+        prompt = data["conversations"][0]["value"]
+        prompt_token_ids = tokenizer(prompt).input_ids
+        completion = data["conversations"][1]["value"]
+        completion_token_ids = tokenizer(completion).input_ids
+        prompt_len = len(prompt_token_ids)
+        output_len = (
+            len(completion_token_ids) if fixed_output_len is None else fixed_output_len
+        )
+        if fixed_output_len is None and (prompt_len < 4 or output_len < 4):
+            # Prune too short sequences.
+            continue
+        if fixed_output_len is None and (
+            prompt_len > 1024 or prompt_len + output_len > 2048
+        ):
+            # Prune too long sequences.
+            continue
+
+        if "image" in data and isinstance(data["image"], Image):
+            image: Image = data["image"]
+            image = image.convert("RGB")
+            image_data = io.BytesIO()
+            image.save(image_data, format="JPEG")
+            image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8")
+            mm_content = {
+                "type": "image_url",
+                "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"},
+            }
+        elif "image" in data and isinstance(data["image"], str):
+            if data["image"].startswith("http://") or data["image"].startswith(
+                "file://"
+            ):
+                image_url = data["image"]
+            else:
+                image_url = f"file://{data['image']}"
+
+            mm_content = {
+                "type": "image_url",
+                "image_url": {"url": image_url},
+            }
+        else:
+            mm_content = None
+
+        sampled_requests.append((prompt, prompt_len, output_len, mm_content))
+
+    return sampled_requests
+
+
+def sample_random_requests(
+    prefix_len: int,
+    input_len: int,
+    output_len: int,
+    num_prompts: int,
+    range_ratio: float,
+    tokenizer: PreTrainedTokenizerBase,
+    use_chat_template: bool = False,
+) -> List[Tuple[str, int, int]]:
+    prefix_token_ids = np.random.randint(
+        0, tokenizer.vocab_size, size=prefix_len
+    ).tolist()
+    if use_chat_template:
+        chat_template_dummy = tokenizer.apply_chat_template(
+            [{"role": "user", "content": "a"}],
+            add_generation_prompt=True,
+            tokenize=False,
+        )
+        tokenized_chat_template_dummy = tokenizer.encode(
+            chat_template_dummy, add_special_tokens=False
+        )
+        chat_template_len = len(tokenized_chat_template_dummy) - 1
+        input_len = input_len - chat_template_len
+
+    input_lens = np.random.randint(
+        int(input_len * range_ratio),
+        input_len + 1,
+        size=num_prompts,
+    )
+    output_lens = np.random.randint(
+        int(output_len * range_ratio),
+        output_len + 1,
+        size=num_prompts,
+    )
+    offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts)
+    input_requests = []
+    for i in range(num_prompts):
+        prompt = tokenizer.decode(
+            prefix_token_ids
+            + [
+                (offsets[i] + i + j) % tokenizer.vocab_size
+                for j in range(input_lens[i])
+            ]
+        )
+        re_encoded_sequence = tokenizer.encode(prompt, add_special_tokens=False)[
+            : (prefix_len + input_lens[i])
+        ]
+        prompt = tokenizer.decode(re_encoded_sequence)
+        if use_chat_template:
+            prompt = tokenizer.apply_chat_template(
+                [{"role": "user", "content": prompt}],
+                add_generation_prompt=True,
+                tokenize=False,
+            )
+            input_lens[i] += chat_template_len
+
+        input_requests.append(
+            (prompt, int(prefix_len + input_lens[i]), int(output_lens[i]), None)
+        )
+
+    return input_requests
+
+
+async def get_request(
+    input_requests: List[Tuple[str, int, int]],
+    request_rate: float,
+    burstiness: float = 1.0,
+) -> AsyncGenerator[Tuple[str, int, int], None]:
+    """
+    Asynchronously generates requests at a specified rate
+    with OPTIONAL burstiness.
+
+    Args:
+        input_requests:
+            A list of input requests, each represented as a tuple.
+        request_rate:
+            The rate at which requests are generated (requests/s).
+        burstiness (optional):
+            The burstiness factor of the request generation.
+            Only takes effect when request_rate is not inf.
+            Default value is 1, which follows a Poisson process.
+            Otherwise, the request intervals follow a gamma distribution.
+            A lower burstiness value (0 < burstiness < 1) results
+            in more bursty requests, while a higher burstiness value
+            (burstiness > 1) results in a more uniform arrival of requests.
+    """
+    input_requests = iter(input_requests)
+
+    # Calculate scale parameter theta to maintain the desired request_rate.
+    assert (
+        burstiness > 0
+    ), f"A positive burstiness factor is expected, but given {burstiness}."
+    theta = 1.0 / (request_rate * burstiness)
+
+    for request in input_requests:
+        yield request
+
+        if request_rate == float("inf"):
+            # If the request rate is infinity, then we don't need to wait.
+            continue
+
+        # Sample the request interval from the gamma distribution.
+        # If burstiness is 1, it follows exponential distribution.
+        interval = np.random.gamma(shape=burstiness, scale=theta)
+        # The next request will be sent after the interval.
+        await asyncio.sleep(interval)
+
+
+def calculate_metrics(
+    input_requests: List[Tuple[str, int, int]],
+    outputs: List[RequestFuncOutput],
+    dur_s: float,
+    tokenizer: PreTrainedTokenizerBase,
+    selected_percentile_metrics: List[str],
+    selected_percentiles: List[float],
+    goodput_config_dict: Dict[str, float],
+) -> Tuple[BenchmarkMetrics, List[int]]:
+    actual_output_lens: List[int] = []
+    total_input = 0
+    completed = 0
+    good_completed = 0
+    itls: List[float] = []
+    tpots: List[float] = []
+    all_tpots: List[float] = []
+    ttfts: List[float] = []
+    e2els: List[float] = []
+    for i in range(len(outputs)):
+        if outputs[i].success:
+            output_len = outputs[i].output_tokens
+
+            if output_len is None:
+                # We use the tokenizer to count the number of output tokens
+                # for some serving backends instead of looking at
+                # len(outputs[i].itl) since multiple output tokens may be
+                # bundled together
+                # Note : this may inflate the output token count slightly
+                output_len = len(
+                    tokenizer(
+                        outputs[i].generated_text, add_special_tokens=False
+                    ).input_ids
+                )
+            actual_output_lens.append(output_len)
+            total_input += input_requests[i][1]
+            tpot = 0
+            if output_len > 1:
+                latency_minus_ttft = outputs[i].latency - outputs[i].ttft
+                tpot = latency_minus_ttft / (output_len - 1)
+                tpots.append(tpot)
+            # Note: if output_len <= 1, we regard tpot as 0 for goodput
+            all_tpots.append(tpot)
+            itls += outputs[i].itl
+            ttfts.append(outputs[i].ttft)
+            e2els.append(outputs[i].latency)
+            completed += 1
+        else:
+            actual_output_lens.append(0)
+
+    if goodput_config_dict:
+        valid_metrics = []
+        slo_values = []
+
+        if "ttft" in goodput_config_dict:
+            valid_metrics.append(ttfts)
+            slo_values.append(
+                goodput_config_dict["ttft"] / MILLISECONDS_TO_SECONDS_CONVERSION
+            )
+        if "tpot" in goodput_config_dict:
+            valid_metrics.append(all_tpots)
+            slo_values.append(
+                goodput_config_dict["tpot"] / MILLISECONDS_TO_SECONDS_CONVERSION
+            )
+        if "e2el" in goodput_config_dict:
+            valid_metrics.append(e2els)
+            slo_values.append(
+                goodput_config_dict["e2el"] / MILLISECONDS_TO_SECONDS_CONVERSION
+            )
+
+        for req_metric in zip(*valid_metrics):
+            is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)])
+            if is_good_req:
+                good_completed += 1
+
+    if completed == 0:
+        warnings.warn(
+            "All requests failed. This is likely due to a misconfiguration "
+            "on the benchmark arguments.",
+            stacklevel=2,
+        )
+    metrics = BenchmarkMetrics(
+        completed=completed,
+        total_input=total_input,
+        total_output=sum(actual_output_lens),
+        request_throughput=completed / dur_s,
+        request_goodput=good_completed / dur_s,
+        output_throughput=sum(actual_output_lens) / dur_s,
+        total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s,
+        mean_ttft_ms=np.mean(ttfts or 0)
+        * 1000,  # ttfts is empty if streaming is not supported by backend
+        std_ttft_ms=np.std(ttfts or 0) * 1000,
+        median_ttft_ms=np.median(ttfts or 0) * 1000,
+        percentiles_ttft_ms=[
+            (p, np.percentile(ttfts or 0, p) * 1000) for p in selected_percentiles
+        ],
+        mean_tpot_ms=np.mean(tpots or 0) * 1000,
+        std_tpot_ms=np.std(tpots or 0) * 1000,
+        median_tpot_ms=np.median(tpots or 0) * 1000,
+        percentiles_tpot_ms=[
+            (p, np.percentile(tpots or 0, p) * 1000) for p in selected_percentiles
+        ],
+        mean_itl_ms=np.mean(itls or 0) * 1000,
+        std_itl_ms=np.std(itls or 0) * 1000,
+        median_itl_ms=np.median(itls or 0) * 1000,
+        percentiles_itl_ms=[
+            (p, np.percentile(itls or 0, p) * 1000) for p in selected_percentiles
+        ],
+        mean_e2el_ms=np.mean(e2els or 0) * 1000,
+        std_e2el_ms=np.std(e2els or 0) * 1000,
+        median_e2el_ms=np.median(e2els or 0) * 1000,
+        percentiles_e2el_ms=[
+            (p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles
+        ],
+    )
+
+    return metrics, actual_output_lens
+
+
+async def benchmark(
+    backend: str,
+    api_url: str,
+    base_url: str,
+    model_id: str,
+    model_name: str,
+    tokenizer: PreTrainedTokenizerBase,
+    input_requests: List[Tuple[str, int, int]],
+    logprobs: Optional[int],
+    best_of: int,
+    request_rate: float,
+    burstiness: float,
+    disable_tqdm: bool,
+    profile: bool,
+    selected_percentile_metrics: List[str],
+    selected_percentiles: List[str],
+    ignore_eos: bool,
+    goodput_config_dict: Dict[str, float],
+    max_concurrency: Optional[int],
+    lora_modules: Optional[List[str]],
+):
+    if backend in ASYNC_REQUEST_FUNCS:
+        request_func = ASYNC_REQUEST_FUNCS[backend]
+    else:
+        raise ValueError(f"Unknown backend: {backend}")
+
+    print("Starting initial single prompt test run...")
+    test_prompt, test_prompt_len, test_output_len, test_mm_content = input_requests[0]
+    if backend != "openai-chat" and test_mm_content is not None:
+        # multi-modal benchmark is only available on OpenAI Chat backend.
+        raise ValueError(
+            "Multi-modal content is only supported on 'openai-chat' backend."
+        )
+    test_input = RequestFuncInput(
+        model=model_id,
+        model_name=model_name,
+        prompt=test_prompt,
+        api_url=api_url,
+        prompt_len=test_prompt_len,
+        output_len=test_output_len,
+        logprobs=logprobs,
+        best_of=best_of,
+        multi_modal_content=test_mm_content,
+        ignore_eos=ignore_eos,
+    )
+
+    test_output = await request_func(request_func_input=test_input)
+    if not test_output.success:
+        raise ValueError(
+            "Initial test run failed - Please make sure benchmark arguments "
+            f"are correctly specified. Error: {test_output.error}"
+        )
+    else:
+        print("Initial test run completed. Starting main benchmark run...")
+
+    if lora_modules:
+        # For each input request, choose a LoRA module at random.
+        lora_modules = iter(
+            [random.choice(lora_modules) for _ in range(len(input_requests))]
+        )
+
+    if profile:
+        print("Starting profiler...")
+        profile_input = RequestFuncInput(
+            model=model_id,
+            model_name=model_name,
+            prompt=test_prompt,
+            api_url=base_url + "/start_profile",
+            prompt_len=test_prompt_len,
+            output_len=test_output_len,
+            logprobs=logprobs,
+            best_of=best_of,
+            multi_modal_content=test_mm_content,
+            ignore_eos=ignore_eos,
+        )
+        profile_output = await request_func(request_func_input=profile_input)
+        if profile_output.success:
+            print("Profiler started")
+
+    if burstiness == 1.0:
+        distribution = "Poisson process"
+    else:
+        distribution = "Gamma distribution"
+
+    print(f"Traffic request rate: {request_rate}")
+    print(f"Burstiness factor: {burstiness} ({distribution})")
+    print(f"Maximum request concurrency: {max_concurrency}")
+
+    pbar = None if disable_tqdm else tqdm(total=len(input_requests))
+
+    # This can be used once the minimum Python version is 3.10 or higher,
+    # and it will simplify the code in limited_request_func.
+    #    semaphore = (asyncio.Semaphore(max_concurrency)
+    #                 if max_concurrency else contextlib.nullcontext())
+    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
+
+    async def limited_request_func(request_func_input, pbar):
+        if semaphore is None:
+            return await request_func(request_func_input=request_func_input, pbar=pbar)
+        async with semaphore:
+            return await request_func(request_func_input=request_func_input, pbar=pbar)
+
+    benchmark_start_time = time.perf_counter()
+    tasks: List[asyncio.Task] = []
+    async for request in get_request(input_requests, request_rate, burstiness):
+        prompt, prompt_len, output_len, mm_content = request
+        req_model_id, req_model_name = model_id, model_name
+        if lora_modules:
+            req_lora_module = next(lora_modules)
+            req_model_id, req_model_name = req_lora_module, req_lora_module
+
+        request_func_input = RequestFuncInput(
+            model=req_model_id,
+            model_name=req_model_name,
+            prompt=prompt,
+            api_url=api_url,
+            prompt_len=prompt_len,
+            output_len=output_len,
+            logprobs=logprobs,
+            best_of=best_of,
+            multi_modal_content=mm_content,
+            ignore_eos=ignore_eos,
+        )
+        tasks.append(
+            asyncio.create_task(
+                limited_request_func(request_func_input=request_func_input, pbar=pbar)
+            )
+        )
+    outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
+
+    if profile:
+        print("Stopping profiler...")
+        profile_input = RequestFuncInput(
+            model=model_id,
+            prompt=test_prompt,
+            api_url=base_url + "/stop_profile",
+            prompt_len=test_prompt_len,
+            output_len=test_output_len,
+            logprobs=logprobs,
+            best_of=best_of,
+        )
+        profile_output = await request_func(request_func_input=profile_input)
+        if profile_output.success:
+            print("Profiler stopped")
+
+    if pbar is not None:
+        pbar.close()
+
+    benchmark_duration = time.perf_counter() - benchmark_start_time
+
+    metrics, actual_output_lens = calculate_metrics(
+        input_requests=input_requests,
+        outputs=outputs,
+        dur_s=benchmark_duration,
+        tokenizer=tokenizer,
+        selected_percentile_metrics=selected_percentile_metrics,
+        selected_percentiles=selected_percentiles,
+        goodput_config_dict=goodput_config_dict,
+    )
+
+    print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
+    print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
+    print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
+    print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
+    print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Request throughput (req/s):", metrics.request_throughput
+        )
+    )
+    if goodput_config_dict:
+        print(
+            "{:<40} {:<10.2f}".format(
+                "Request goodput (req/s):", metrics.request_goodput
+            )
+        )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Output token throughput (tok/s):", metrics.output_throughput
+        )
+    )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Total Token throughput (tok/s):", metrics.total_token_throughput
+        )
+    )
+
+    result = {
+        "duration": benchmark_duration,
+        "completed": metrics.completed,
+        "total_input_tokens": metrics.total_input,
+        "total_output_tokens": metrics.total_output,
+        "request_throughput": metrics.request_throughput,
+        "request_goodput:": metrics.request_goodput if goodput_config_dict else None,
+        "output_throughput": metrics.output_throughput,
+        "total_token_throughput": metrics.total_token_throughput,
+        "input_lens": [output.prompt_len for output in outputs],
+        "output_lens": actual_output_lens,
+        "ttfts": [output.ttft for output in outputs],
+        "itls": [output.itl for output in outputs],
+        "generated_texts": [output.generated_text for output in outputs],
+        "errors": [output.error for output in outputs],
+    }
+
+    def process_one_metric(
+        # E.g., "ttft"
+        metric_attribute_name: str,
+        # E.g., "TTFT"
+        metric_name: str,
+        # E.g., "Time to First Token"
+        metric_header: str,
+    ):
+        # This function prints and adds statistics of the specified
+        # metric.
+        if metric_attribute_name not in selected_percentile_metrics:
+            return
+        print("{s:{c}^{n}}".format(s=metric_header, n=50, c="-"))
+        print(
+            "{:<40} {:<10.2f}".format(
+                f"Mean {metric_name} (ms):",
+                getattr(metrics, f"mean_{metric_attribute_name}_ms"),
+            )
+        )
+        print(
+            "{:<40} {:<10.2f}".format(
+                f"Median {metric_name} (ms):",
+                getattr(metrics, f"median_{metric_attribute_name}_ms"),
+            )
+        )
+        result[f"mean_{metric_attribute_name}_ms"] = getattr(
+            metrics, f"mean_{metric_attribute_name}_ms"
+        )
+        result[f"median_{metric_attribute_name}_ms"] = getattr(
+            metrics, f"median_{metric_attribute_name}_ms"
+        )
+        result[f"std_{metric_attribute_name}_ms"] = getattr(
+            metrics, f"std_{metric_attribute_name}_ms"
+        )
+        for p, value in getattr(metrics, f"percentiles_{metric_attribute_name}_ms"):
+            p_word = str(int(p)) if int(p) == p else str(p)
+            print("{:<40} {:<10.2f}".format(f"P{p_word} {metric_name} (ms):", value))
+            result[f"p{p_word}_{metric_attribute_name}_ms"] = value
+
+    process_one_metric("ttft", "TTFT", "Time to First Token")
+    process_one_metric("tpot", "TPOT", "Time per Output Token (excl. 1st token)")
+    process_one_metric("itl", "ITL", "Inter-token Latency")
+    process_one_metric("e2el", "E2EL", "End-to-end Latency")
+
+    print("=" * 50)
+
+    return result
+
+
+def check_goodput_args(args):
+    # Check and parse goodput arguments
+    goodput_config_dict = {}
+    VALID_NAMES = ["ttft", "tpot", "e2el"]
+    if args.goodput:
+        goodput_config_dict = parse_goodput(args.goodput)
+        for slo_name, slo_val in goodput_config_dict.items():
+            if slo_name not in VALID_NAMES:
+                raise ValueError(
+                    f"Invalid metric name found, {slo_name}: {slo_val}. "
+                    "The service level objective name should be one of "
+                    f"{str(VALID_NAMES)}. "
+                )
+            if slo_val < 0:
+                raise ValueError(
+                    f"Invalid value found, {slo_name}: {slo_val}. "
+                    "The service level objective value should be "
+                    "non-negative."
+                )
+    return goodput_config_dict
+
+
+def parse_goodput(slo_pairs):
+    goodput_config_dict = {}
+    try:
+        for slo_pair in slo_pairs:
+            slo_name, slo_val = slo_pair.split(":")
+            goodput_config_dict[slo_name] = float(slo_val)
+    except ValueError as err:
+        raise argparse.ArgumentTypeError(
+            "Invalid format found for service level objectives. "
+            'Specify service level objectives for goodput as "KEY:VALUE" '
+            "pairs, where the key is a metric name, and the value is a "
+            "number in milliseconds."
+        ) from err
+    return goodput_config_dict
+
+
+def save_to_pytorch_benchmark_format(
+    args: argparse.Namespace, results: Dict[str, Any], file_name: str
+) -> None:
+    metrics = [
+        "median_ttft_ms",
+        "mean_ttft_ms",
+        "std_ttft_ms",
+        "p99_ttft_ms",
+        "mean_tpot_ms",
+        "median_tpot_ms",
+        "std_tpot_ms",
+        "p99_tpot_ms",
+        "median_itl_ms",
+        "mean_itl_ms",
+        "std_itl_ms",
+        "p99_itl_ms",
+    ]
+    # These raw data might be useful, but they are rather big. They can be added
+    # later if needed
+    ignored_metrics = ["ttfts", "itls", "generated_texts", "errors"]
+    pt_records = convert_to_pytorch_benchmark_format(
+        args=args,
+        metrics={k: [results[k]] for k in metrics},
+        extra_info={
+            k: results[k]
+            for k in results
+            if k not in metrics and k not in ignored_metrics
+        },
+    )
+    if pt_records:
+        # Don't use json suffix here as we don't want CI to pick it up
+        pt_file = f"{os.path.splitext(file_name)[0]}.pytorch.json"
+        with open(pt_file, "w") as f:
+            json.dump(pt_records, f)
+
+
+def main(args: argparse.Namespace):
+    print(args)
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+
+    backend = args.backend
+    model_id = args.model
+    model_name = args.served_model_name
+    tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
+    tokenizer_mode = args.tokenizer_mode
+
+    if args.base_url is not None:
+        api_url = f"{args.base_url}{args.endpoint}"
+        base_url = f"{args.base_url}"
+    else:
+        api_url = f"http://{args.host}:{args.port}{args.endpoint}"
+        base_url = f"http://{args.host}:{args.port}"
+
+    tokenizer = get_tokenizer(
+        tokenizer_id,
+        tokenizer_mode=tokenizer_mode,
+        trust_remote_code=args.trust_remote_code,
+    )
+
+    if args.dataset is not None:
+        warnings.warn(
+            "The '--dataset' argument will be deprecated in the next "
+            "release. Please use '--dataset-name' and "
+            "'--dataset-path' in the future runs.",
+            stacklevel=2,
+        )
+        input_requests = sample_sharegpt_requests(
+            dataset_path=args.dataset,
+            num_requests=args.num_prompts,
+            tokenizer=tokenizer,
+            fixed_output_len=args.sharegpt_output_len,
+        )
+
+    elif args.dataset_name == "sharegpt":
+        input_requests = sample_sharegpt_requests(
+            dataset_path=args.dataset_path,
+            num_requests=args.num_prompts,
+            tokenizer=tokenizer,
+            fixed_output_len=args.sharegpt_output_len,
+        )
+
+    elif args.dataset_name == "burstgpt":
+        input_requests = sample_burstgpt_requests(
+            dataset_path=args.dataset_path,
+            num_requests=args.num_prompts,
+            random_seed=args.seed,
+            tokenizer=tokenizer,
+        )
+
+    elif args.dataset_name == "sonnet":
+        # Do not format the prompt, pass to message directly
+        if args.backend == "openai-chat":
+            input_requests = sample_sonnet_requests(
+                dataset_path=args.dataset_path,
+                num_requests=args.num_prompts,
+                input_len=args.sonnet_input_len,
+                output_len=args.sonnet_output_len,
+                prefix_len=args.sonnet_prefix_len,
+                tokenizer=tokenizer,
+            )
+            input_requests = [
+                (prompt, prompt_len, output_len, None)
+                for prompt, prompt_formatted, prompt_len, output_len, _ in input_requests
+            ]
+        else:
+            assert (
+                tokenizer.chat_template or tokenizer.default_chat_template
+            ), "Tokenizer/model must have chat template for sonnet dataset."
+            input_requests = sample_sonnet_requests(
+                dataset_path=args.dataset_path,
+                num_requests=args.num_prompts,
+                input_len=args.sonnet_input_len,
+                output_len=args.sonnet_output_len,
+                prefix_len=args.sonnet_prefix_len,
+                tokenizer=tokenizer,
+            )
+            input_requests = [
+                (prompt_formatted, prompt_len, output_len, None)
+                for prompt, prompt_formatted, prompt_len, output_len, _ in input_requests
+            ]
+
+    elif args.dataset_name == "hf":
+        input_requests = sample_hf_requests(
+            dataset_path=args.dataset_path,
+            dataset_subset=args.hf_subset,
+            dataset_split=args.hf_split,
+            num_requests=args.num_prompts,
+            tokenizer=tokenizer,
+            random_seed=args.seed,
+            fixed_output_len=args.hf_output_len,
+        )
+
+    elif args.dataset_name == "random":
+        input_requests = sample_random_requests(
+            prefix_len=args.random_prefix_len,
+            input_len=args.random_input_len,
+            output_len=args.random_output_len,
+            num_prompts=args.num_prompts,
+            range_ratio=args.random_range_ratio,
+            tokenizer=tokenizer,
+            use_chat_template=args.use_chat_template,
+        )
+
+    else:
+        raise ValueError(f"Unknown dataset: {args.dataset_name}")
+
+    goodput_config_dict = check_goodput_args(args)
+
+    # Avoid GC processing "static" data - reduce pause times.
+    gc.collect()
+    gc.freeze()
+
+    benchmark_result = asyncio.run(
+        benchmark(
+            backend=backend,
+            api_url=api_url,
+            base_url=base_url,
+            model_id=model_id,
+            model_name=model_name,
+            tokenizer=tokenizer,
+            input_requests=input_requests,
+            logprobs=args.logprobs,
+            best_of=args.best_of,
+            request_rate=args.request_rate,
+            burstiness=args.burstiness,
+            disable_tqdm=args.disable_tqdm,
+            profile=args.profile,
+            selected_percentile_metrics=args.percentile_metrics.split(","),
+            selected_percentiles=[float(p) for p in args.metric_percentiles.split(",")],
+            ignore_eos=args.ignore_eos,
+            goodput_config_dict=goodput_config_dict,
+            max_concurrency=args.max_concurrency,
+            lora_modules=args.lora_modules,
+        )
+    )
+
+    # Save config and results to json
+    if args.save_result:
+        result_json: Dict[str, Any] = {}
+
+        # Setup
+        current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
+        result_json["date"] = current_dt
+        result_json["backend"] = backend
+        result_json["model_id"] = model_id
+        result_json["tokenizer_id"] = tokenizer_id
+        result_json["best_of"] = args.best_of
+        result_json["num_prompts"] = args.num_prompts
+
+        # Metadata
+        if args.metadata:
+            for item in args.metadata:
+                if "=" in item:
+                    kvstring = item.split("=")
+                    result_json[kvstring[0].strip()] = kvstring[1].strip()
+                else:
+                    raise ValueError(
+                        "Invalid metadata format. Please use KEY=VALUE format."
+                    )
+
+        # Traffic
+        result_json["request_rate"] = (
+            args.request_rate if args.request_rate < float("inf") else "inf"
+        )
+        result_json["burstiness"] = args.burstiness
+        result_json["max_concurrency"] = args.max_concurrency
+
+        # Merge with benchmark result
+        result_json = {**result_json, **benchmark_result}
+
+        # Save to file
+        base_model_id = model_id.split("/")[-1]
+        max_concurrency_str = (
+            f"-concurrency{args.max_concurrency}"
+            if args.max_concurrency is not None
+            else ""
+        )
+        file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json"  # noqa
+        if args.result_filename:
+            file_name = args.result_filename
+        if args.result_dir:
+            file_name = os.path.join(args.result_dir, file_name)
+        with open(file_name, "w", encoding="utf-8") as outfile:
+            json.dump(result_json, outfile)
+        save_to_pytorch_benchmark_format(args, result_json, file_name)
+
+
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser(
+        description="Benchmark the online serving throughput."
+    )
+    parser.add_argument(
+        "--backend",
+        type=str,
+        default="vllm",
+        choices=list(ASYNC_REQUEST_FUNCS.keys()),
+    )
+    parser.add_argument(
+        "--base-url",
+        type=str,
+        default=None,
+        help="Server or API base url if not using http host and port.",
+    )
+    # Use 127.0.0.1 here instead of localhost to force the use of ipv4
+    parser.add_argument("--host", type=str, default="127.0.0.1")
+    parser.add_argument("--port", type=int, default=8000)
+    parser.add_argument(
+        "--endpoint",
+        type=str,
+        default="/v1/completions",
+        help="API endpoint.",
+    )
+    parser.add_argument(
+        "--dataset",
+        type=str,
+        default=None,
+        help="Path to the ShareGPT dataset, will be deprecated in the " "next release.",
+    )
+    parser.add_argument(
+        "--dataset-name",
+        type=str,
+        default="sharegpt",
+        choices=["sharegpt", "burstgpt", "sonnet", "random", "hf"],
+        help="Name of the dataset to benchmark on.",
+    )
+    parser.add_argument(
+        "--dataset-path",
+        type=str,
+        default=None,
+        help="Path to the sharegpt/sonnet dataset. "
+        "Or the huggingface dataset ID if using HF dataset.",
+    )
+    parser.add_argument(
+        "--max-concurrency",
+        type=int,
+        default=None,
+        help="Maximum number of concurrent requests. This can be used "
+        "to help simulate an environment where a higher level component "
+        "is enforcing a maximum number of concurrent requests. While the "
+        "--request-rate argument controls the rate at which requests are "
+        "initiated, this argument will control how many are actually allowed "
+        "to execute at a time. This means that when used in combination, the "
+        "actual request rate may be lower than specified with --request-rate, "
+        "if the server is not processing requests fast enough to keep up.",
+    )
+
+    parser.add_argument(
+        "--model",
+        type=str,
+        required=True,
+        help="Name of the model.",
+    )
+    parser.add_argument(
+        "--tokenizer",
+        type=str,
+        help="Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
+    )
+    parser.add_argument(
+        "--best-of",
+        type=int,
+        default=1,
+        help="Generates `best_of` sequences per prompt and " "returns the best one.",
+    )
+    parser.add_argument("--use-beam-search", action="store_true")
+    parser.add_argument(
+        "--num-prompts",
+        type=int,
+        default=1000,
+        help="Number of prompts to process.",
+    )
+    parser.add_argument(
+        "--logprobs",
+        type=int,
+        default=None,
+        help=(
+            "Number of logprobs-per-token to compute & return as part of "
+            "the request. If unspecified, then either (1) if beam search "
+            "is disabled, no logprobs are computed & a single dummy "
+            "logprob is returned for each token; or (2) if beam search "
+            "is enabled 1 logprob per token is computed"
+        ),
+    )
+    parser.add_argument(
+        "--request-rate",
+        type=float,
+        default=float("inf"),
+        help="Number of requests per second. If this is inf, "
+        "then all the requests are sent at time 0. "
+        "Otherwise, we use Poisson process or gamma distribution "
+        "to synthesize the request arrival times.",
+    )
+    parser.add_argument(
+        "--burstiness",
+        type=float,
+        default=1.0,
+        help="Burstiness factor of the request generation. "
+        "Only take effect when request_rate is not inf. "
+        "Default value is 1, which follows Poisson process. "
+        "Otherwise, the request intervals follow a gamma distribution. "
+        "A lower burstiness value (0 < burstiness < 1) results in more "
+        "bursty requests. A higher burstiness value (burstiness > 1) "
+        "results in a more uniform arrival of requests.",
+    )
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument(
+        "--trust-remote-code",
+        action="store_true",
+        help="Trust remote code from huggingface",
+    )
+    parser.add_argument(
+        "--disable-tqdm",
+        action="store_true",
+        help="Specify to disable tqdm progress bar.",
+    )
+    parser.add_argument(
+        "--profile",
+        action="store_true",
+        help="Use Torch Profiler. The endpoint must be launched with "
+        "VLLM_TORCH_PROFILER_DIR to enable profiler.",
+    )
+    parser.add_argument(
+        "--save-result",
+        action="store_true",
+        help="Specify to save benchmark results to a json file",
+    )
+    parser.add_argument(
+        "--metadata",
+        metavar="KEY=VALUE",
+        nargs="*",
+        help="Key-value pairs (e.g, --metadata version=0.3.3 tp=1) "
+        "for metadata of this run to be saved in the result JSON file "
+        "for record keeping purposes.",
+    )
+    parser.add_argument(
+        "--result-dir",
+        type=str,
+        default=None,
+        help="Specify directory to save benchmark json results."
+        "If not specified, results are saved in the current directory.",
+    )
+    parser.add_argument(
+        "--result-filename",
+        type=str,
+        default=None,
+        help="Specify the filename to save benchmark json results."
+        "If not specified, results will be saved in "
+        "{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
+        " format.",
+    )
+    parser.add_argument(
+        "--ignore-eos",
+        action="store_true",
+        help="Set ignore_eos flag when sending the benchmark request."
+        "Warning: ignore_eos is not supported in deepspeed_mii and tgi.",
+    )
+    parser.add_argument(
+        "--percentile-metrics",
+        type=str,
+        default="ttft,tpot,itl",
+        help="Comma-seperated list of selected metrics to report percentils. "
+        "This argument specifies the metrics to report percentiles. "
+        'Allowed metric names are "ttft", "tpot", "itl", "e2el". '
+        'Default value is "ttft,tpot,itl".',
+    )
+    parser.add_argument(
+        "--metric-percentiles",
+        type=str,
+        default="99",
+        help="Comma-seperated list of percentiles for selected metrics. "
+        'To report 25-th, 50-th, and 75-th percentiles, use "25,50,75". '
+        'Default value is "99". '
+        'Use "--percentile-metrics" to select metrics.',
+    )
+    parser.add_argument(
+        "--goodput",
+        nargs="+",
+        required=False,
+        help='Specify service level objectives for goodput as "KEY:VALUE" '
+        "pairs, where the key is a metric name, and the value is in "
+        'milliseconds. Multiple "KEY:VALUE" pairs can be provided, '
+        "separated by spaces. Allowed request level metric names are "
+        '"ttft", "tpot", "e2el". For more context on the definition of '
+        "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 "
+        "and the blog: https://hao-ai-lab.github.io/blogs/distserve",
+    )
+
+    # group for dataset specific arguments
+    sonnet_group = parser.add_argument_group("sonnet dataset options")
+    sonnet_group.add_argument(
+        "--sonnet-input-len",
+        type=int,
+        default=550,
+        help="Number of input tokens per request, used only for sonnet dataset.",
+    )
+    sonnet_group.add_argument(
+        "--sonnet-output-len",
+        type=int,
+        default=150,
+        help="Number of output tokens per request, used only for sonnet dataset.",
+    )
+    sonnet_group.add_argument(
+        "--sonnet-prefix-len",
+        type=int,
+        default=200,
+        help="Number of prefix tokens per request, used only for sonnet dataset.",
+    )
+
+    sharegpt_group = parser.add_argument_group("sharegpt dataset options")
+    sharegpt_group.add_argument(
+        "--sharegpt-output-len",
+        type=int,
+        default=None,
+        help="Output length for each request. Overrides the output length "
+        "from the ShareGPT dataset.",
+    )
+
+    random_group = parser.add_argument_group("random dataset options")
+    random_group.add_argument(
+        "--random-input-len",
+        type=int,
+        default=1024,
+        help="Number of input tokens per request, used only for random sampling.",
+    )
+    random_group.add_argument(
+        "--random-output-len",
+        type=int,
+        default=128,
+        help="Number of output tokens per request, used only for random sampling.",
+    )
+    random_group.add_argument(
+        "--random-range-ratio",
+        type=float,
+        default=1.0,
+        help="Range of sampled ratio of input/output length, "
+        "used only for random sampling.",
+    )
+    random_group.add_argument(
+        "--random-prefix-len",
+        type=int,
+        default=0,
+        help="Number of fixed prefix tokens before random "
+        " context. The length range of context in a random "
+        " request is [random-prefix-len, "
+        " random-prefix-len + random-prefix-len * random-range-ratio).",
+    )
+    random_group.add_argument(
+        "--use-chat-template",
+        action="store_true",
+        help="Use chat template to format the prompt.",
+    )
+
+    hf_group = parser.add_argument_group("hf dataset options")
+    hf_group.add_argument(
+        "--hf-subset", type=str, default=None, help="Subset of the HF dataset."
+    )
+    hf_group.add_argument(
+        "--hf-split", type=str, default=None, help="Split of the HF dataset."
+    )
+    hf_group.add_argument(
+        "--hf-output-len",
+        type=int,
+        default=None,
+        help="Output length for each request. Overrides the output lengths "
+        "from the sampled HF dataset.",
+    )
+
+    parser.add_argument(
+        "--tokenizer-mode",
+        type=str,
+        default="auto",
+        choices=["auto", "slow", "mistral", "custom"],
+        help='The tokenizer mode.\n\n* "auto" will use the '
+        'fast tokenizer if available.\n* "slow" will '
+        "always use the slow tokenizer. \n* "
+        '"mistral" will always use the `mistral_common` tokenizer. \n*'
+        '"custom" will use --tokenizer to select the preregistered tokenizer.',
+    )
+
+    parser.add_argument(
+        "--served-model-name",
+        type=str,
+        default=None,
+        help="The model name used in the API. "
+        "If not specified, the model name will be the "
+        "same as the ``--model`` argument. ",
+    )
+
+    parser.add_argument(
+        "--lora-modules",
+        nargs="+",
+        default=None,
+        help="A subset of LoRA module names passed in when "
+        "launching the server. For each request, the "
+        "script chooses a LoRA module at random.",
+    )
+
+    args = parser.parse_args()
+    main(args)
--- a/components/backends/sglang/slurm_jobs/scripts/vllm/benchmark_utils.py
+++ b/components/backends/sglang/slurm_jobs/scripts/vllm/benchmark_utils.py
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+# pytest: skip-file
+
+import argparse
+import os
+from typing import Any, Dict, List
+
+
+def convert_to_pytorch_benchmark_format(
+    args: argparse.Namespace, metrics: Dict[str, List], extra_info: Dict[str, Any]
+) -> List:
+    """
+    Save the benchmark results in the format used by PyTorch OSS benchmark with
+    on metric per record
+    https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
+    """
+    records = []
+    if not os.environ.get("SAVE_TO_PYTORCH_BENCHMARK_FORMAT", False):
+        return records
+
+    for name, benchmark_values in metrics.items():
+        record = {
+            "benchmark": {
+                "name": "vLLM benchmark",
+                "extra_info": {
+                    "args": vars(args),
+                },
+            },
+            "model": {
+                "name": args.model,
+            },
+            "metric": {
+                "name": name,
+                "benchmark_values": benchmark_values,
+                "extra_info": extra_info,
+            },
+        }
+        records.append(record)
+
+    return records
--- a/components/backends/sglang/slurm_jobs/scripts/worker_setup.py
+++ b/components/backends/sglang/slurm_jobs/scripts/worker_setup.py
@@ -126,7 +126,7 @@ def _parse_command_line_args(args: list[str] | None = None) -> argparse.Namespac
    parser.add_argument(
        "--leader_ip",
        type=str,
-        required=True,
+        required=False,
        help="IP address of the leader node for this worker group",
    )
    parser.add_argument(
@@ -138,24 +138,24 @@ def _parse_command_line_args(args: list[str] | None = None) -> argparse.Namespac
    parser.add_argument(
        "--worker_idx",
        type=int,
-        required=True,
+        required=False,
        help="Index of the worker group (0-based)",
    )
    parser.add_argument(
        "--local_rank",
        type=int,
-        required=True,
+        required=False,
        help="Local rank within the worker group (0 for leader)",
    )
    parser.add_argument(
        "--nodes_per_worker",
        type=int,
-        required=True,
+        required=False,
        help="Number of nodes per worker",
    )
    parser.add_argument(
        "--worker_type",
-        choices=["decode", "prefill"],
+        choices=["decode", "prefill", "frontend", "nginx"],
        required=True,
        help="Type of worker to run",
    )
@@ -180,18 +180,40 @@ def _parse_command_line_args(args: list[str] | None = None) -> argparse.Namespac
        help="Type of GPU to use",
    )

+    parser.add_argument(
+        "--nginx_config",
+        type=str,
+        help="Path to nginx configuration file (required for nginx worker type)",
+    )
+
+    parser.add_argument(
+        "--multiple-frontends-enabled",
+        action="store_true",
+        help="Whether multiple frontend architecture is enabled (affects infrastructure setup)",
+    )
+
+    parser.add_argument(
+        "--use_init_locations",
+        action="store_true",
+        help="Whether we add --init-expert-locations to launch commands",
+    )
+
    return parser.parse_args(args)


 def _validate_args(args: argparse.Namespace) -> None:
    """Validate command line arguments"""
-    if args.worker_idx < 0:
-        raise ValueError("Worker index must be non-negative")
+    if args.worker_type in ["prefill", "decode"]:
+        if args.worker_idx is None or args.worker_idx < 0:
+            raise ValueError(
+                "Worker index must be provided and non-negative for prefill/decode"
+            )

-    if args.local_rank < 0:
+    if args.worker_type in ["prefill", "decode"]:
+        if args.local_rank is None or args.local_rank < 0:
            raise ValueError("Local rank must be non-negative")

-    if args.nodes_per_worker < 1:
+        if args.nodes_per_worker is None or args.nodes_per_worker < 1:
            raise ValueError("Nodes per worker must be at least 1")

        if args.gpus_per_node < 1:
@@ -202,6 +224,10 @@ def _validate_args(args: argparse.Namespace) -> None:
                f"Local rank ({args.local_rank}) must be less than nodes per worker ({args.nodes_per_worker})"
            )

+    # Validate nginx-specific arguments
+    if args.worker_type == "nginx" and not args.nginx_config:
+        raise ValueError("--nginx_config is required for nginx worker type")
+

 def setup_env_vars_for_gpu_script(
    host_ip: str,
@@ -209,6 +235,7 @@ def setup_env_vars_for_gpu_script(
    total_gpus: int,
    total_nodes: int,
    port: int = DIST_INIT_PORT,
+    use_init_locations: bool = True,
 ):
    """Setup environment variables required by GPU scripts (h100.sh, gb200-fp8.sh, gb200-fp4.sh)"""
    os.environ["HOST_IP"] = host_ip
@@ -216,12 +243,14 @@ def setup_env_vars_for_gpu_script(
    os.environ["TOTAL_GPUS"] = str(total_gpus)
    os.environ["RANK"] = str(local_rank)
    os.environ["TOTAL_NODES"] = str(total_nodes)
+    os.environ["USE_INIT_LOCATIONS"] = str(use_init_locations)

    logging.info(f"Set HOST_IP: {host_ip}")
    logging.info(f"Set PORT: {port}")
    logging.info(f"Set TOTAL_GPUS: {total_gpus}")
    logging.info(f"Set RANK: {local_rank}")
    logging.info(f"Set TOTAL_NODES: {total_nodes}")
+    logging.info(f"Set USE_INIT_LOCATIONS: {use_init_locations}")


 def get_gpu_command(worker_type: str, gpu_type: str) -> str:
@@ -255,20 +284,33 @@ def setup_head_prefill_node(prefill_host_ip: str) -> None:
    if not etcd_process:
        raise RuntimeError("Failed to start etcd")

-    logging.info(f"Starting ingress server on node {prefill_host_ip}")
-    ingress_process = run_command(
-        "python3 -m dynamo.frontend --http-port=8000", background=True
-    )
-    if not ingress_process:
-        raise RuntimeError("Failed to start ingress")

-    logging.info(
-        f"Starting http server on port 9001 for flush_cache endpoint on node {prefill_host_ip}"
-    )
-    cache_flush_server_cmd = "python3 utils/sgl_http_server.py --ns dynamo"
-    cache_flush_server_process = run_command(cache_flush_server_cmd, background=True)
-    if not cache_flush_server_process:
-        raise RuntimeError("Failed to start cache flush server")
+def setup_nginx_worker(master_ip: str, nginx_config: str) -> int:
+    """Setup nginx load balancer"""
+    logging.info("Setting up nginx load balancer")
+
+    if not nginx_config or not os.path.exists(nginx_config):
+        raise ValueError(f"Nginx config file not found: {nginx_config}")
+
+    nginx_cmd = f"apt-get update && apt-get install -y nginx && nginx -c {nginx_config} && sleep 86400"
+    return run_command(nginx_cmd)
+
+
+def setup_frontend_worker(worker_idx: int, master_ip: str) -> int:
+    """Setup a frontend worker"""
+    logging.info(f"Setting up frontend worker {worker_idx}")
+
+    # First frontend (worker_idx 0) also sets up NATS/ETCD
+    if worker_idx == 0:
+        setup_head_prefill_node(master_ip)
+    else:
+        logging.info(f"Setting up additional frontend worker {worker_idx}")
+        if not wait_for_etcd(f"http://{master_ip}:{ETCD_CLIENT_PORT}"):
+            raise RuntimeError("Failed to connect to etcd")
+
+    # All frontends run the ingress server
+    frontend_cmd = "python3 -m dynamo.frontend --http-port=8000"
+    return run_command(frontend_cmd)


 def setup_prefill_worker(
@@ -279,24 +321,29 @@ def setup_prefill_worker(
    nodes_per_worker: int,
    gpus_per_node: int,
    gpu_type: str,
+    multiple_frontends_enabled: bool = False,
+    use_init_locations: bool = True,
 ) -> int:
    """
    Setup the prefill worker.
    """
    total_gpus = nodes_per_worker * gpus_per_node
-
-    # Only the first prefill worker's leader node sets up NATS/ETCD/Frontend
-    if worker_idx == 0 and local_rank == 0:
+    # Only setup infrastructure in traditional mode (not multiple frontends)
+    if not multiple_frontends_enabled and worker_idx == 0 and local_rank == 0:
        setup_head_prefill_node(master_ip)
    else:
-        logging.info(
-            f"Setting up child prefill worker {worker_idx}, local rank {local_rank}"
-        )
+        logging.info(f"Setting up prefill worker {worker_idx}, local rank {local_rank}")
        if not wait_for_etcd(f"http://{master_ip}:{ETCD_CLIENT_PORT}"):
            raise RuntimeError("Failed to connect to etcd")

    # Setup environment variables for GPU script - use leader_ip as dist-init-addr
-    setup_env_vars_for_gpu_script(leader_ip, local_rank, total_gpus, nodes_per_worker)
+    setup_env_vars_for_gpu_script(
+        leader_ip,
+        local_rank,
+        total_gpus,
+        nodes_per_worker,
+        use_init_locations=use_init_locations,
+    )

    # Use appropriate GPU script instead of generating command directly
    cmd_to_run = get_gpu_command("prefill", gpu_type)
@@ -311,19 +358,25 @@ def setup_decode_worker(
    nodes_per_worker: int,
    gpus_per_node: int,
    gpu_type: str,
+    use_init_locations: bool = True,
 ) -> int:
    """
    Setup the decode worker.
    """
    total_gpus = nodes_per_worker * gpus_per_node
-
    logging.info(f"Setting up decode worker {worker_idx}, local rank {local_rank}")

    if not wait_for_etcd(f"http://{master_ip}:{ETCD_CLIENT_PORT}"):
        raise RuntimeError("Failed to connect to etcd")

    # Setup environment variables for GPU script - use leader_ip as dist-init-addr
-    setup_env_vars_for_gpu_script(leader_ip, local_rank, total_gpus, nodes_per_worker)
+    setup_env_vars_for_gpu_script(
+        leader_ip,
+        local_rank,
+        total_gpus,
+        nodes_per_worker,
+        use_init_locations=use_init_locations,
+    )

    # Use appropriate GPU script instead of generating command directly
    cmd_to_run = get_gpu_command("decode", gpu_type)
@@ -357,9 +410,17 @@ def main(input_args: list[str] | None = None):
    logging.info(f"Leader IP: {args.leader_ip}")
    logging.info(f"Master IP: {args.master_ip}")
    logging.info(f"Nodes per worker: {args.nodes_per_worker}")
+    logging.info(f"Use init locations?: {args.use_init_locations}")

    setup_env(args.master_ip)
-    if args.worker_type == "prefill":
+
+    if args.worker_type == "nginx":
+        if not args.nginx_config:
+            raise ValueError("--nginx_config is required for nginx worker type")
+        setup_nginx_worker(args.master_ip, args.nginx_config)
+    elif args.worker_type == "frontend":
+        setup_frontend_worker(args.worker_idx, args.master_ip)
+    elif args.worker_type == "prefill":
        setup_prefill_worker(
            args.worker_idx,
            args.local_rank,
@@ -368,8 +429,10 @@ def main(input_args: list[str] | None = None):
            args.nodes_per_worker,
            args.gpus_per_node,
            args.gpu_type,
+            args.multiple_frontends_enabled,
+            args.use_init_locations,
        )
-    else:
+    elif args.worker_type == "decode":
        setup_decode_worker(
            args.worker_idx,
            args.local_rank,
@@ -378,6 +441,7 @@ def main(input_args: list[str] | None = None):
            args.nodes_per_worker,
            args.gpus_per_node,
            args.gpu_type,
+            args.use_init_locations,
        )

    logging.info(f"{args.worker_type.capitalize()} worker setup complete")

--- a/components/backends/sglang/slurm_jobs/submit_job_script.py
+++ b/components/backends/sglang/slurm_jobs/submit_job_script.py
@@ -25,6 +25,35 @@ import tempfile
 from jinja2 import Template


+def print_welcome_message(job_ids: list[str]):
+    """Print a clean welcome message with job information."""
+
+    job_id = f"<{', '.join(job_ids)}>"
+    print(
+        f"""
+🚀 Welcome! We hope you enjoy your time on our GB200 NVL72.
+
+Your logs for this submitted job will be available in logs/{job_id}
+You can access them by running:
+
+    cd logs/{job_id}
+
+You can view all of the prefill/decode worker logs by running:
+
+    tail -f *_decode_*.err *_prefill_*.err
+
+To kick off the benchmark we suggest opening up a new terminal, SSH-ing
+into the login node, and running the srun command that is found at the
+bottom of the log.out. You can find it by running:
+
+    cat log.out
+
+Enjoy :)
+- NVIDIA
+"""
+    )
+
+
 def setup_logging(level: int = logging.INFO) -> None:
    logging.basicConfig(
        level=level,
@@ -45,7 +74,7 @@ def generate_job_script(template_path, output_path, **kwargs):
    return output_path


-def submit_job(job_script_path):
+def submit_job(job_script_path, extra_slurm_args=[]):
    """
    Submit the job script to SLURM and extract the job ID from the output.

@@ -53,9 +82,14 @@ def submit_job(job_script_path):
        The job ID of the submitted job.
    """
    try:
-        result = subprocess.run(
-            ["sbatch", job_script_path], capture_output=True, text=True, check=True
+        command = (
+            ["sbatch"]
+            + ["--" + x for x in extra_slurm_args]
+            + [
+                job_script_path,
+            ]
        )
+        result = subprocess.run(command, capture_output=True, text=True, check=True)
        output_lines = result.stdout.strip().split("\n")

        # sbatch typically outputs: "Submitted batch job JOBID"
@@ -118,6 +152,45 @@ def _parse_command_line_args(args: list[str] | None = None) -> argparse.Namespac
        default="batch",
        help="SLURM partition to use",
    )
+    parser.add_argument(
+        "--enable-multiple-frontends",
+        action="store_true",
+        help="Enable multiple frontend architecture with nginx load balancer",
+    )
+    parser.add_argument(
+        "--num-additional-frontends",
+        type=int,
+        default=0,
+        help="Number of additional frontend nodes (beyond the first frontend on node 1)",
+    )
+
+    parser.add_argument(
+        "--use-init-location",
+        action="store_true",
+        help="Whether we use '--init-expert-locations' json files",
+    )
+
+    parser.add_argument(
+        "--profiler",
+        type=str,
+        help="Profiler configurations. Example: "
+        + '"type=vllm; isl=8192; osl=1024; concurrencies=16x2048x4096x8192; req-rate=inf"',
+    )
+
+    parser.add_argument(
+        "--extra-slurm-args",
+        action="append",
+        default=[],
+        help="Extra slurm arguments, remove the '--' prefix. Example: --extra-slurm-args dependency=afterok:<x>",
+    )
+
+    parser.add_argument(
+        "--retries",
+        type=int,
+        default=0,
+        help="Tries to launch the job multiple times to catch transient errors",
+    )
+
    return parser.parse_args(args)


@@ -136,7 +209,45 @@ def main(input_args: list[str] | None = None):
            f"Decode nodes ({args.decode_nodes}) must be divisible by decode workers ({args.decode_workers})"
        )

+    # Validation for multiple frontends
+    if args.enable_multiple_frontends:
+        if args.num_additional_frontends < 0:
+            raise ValueError("Number of additional frontends cannot be negative")
+
    total_nodes = args.prefill_nodes + args.decode_nodes
+
+    # parse profiler configs
+    profiler_config = {}
+    if args.profiler:
+        for key_val_pair in args.profiler.split("; "):
+            key, val = key_val_pair.split("=")
+            profiler_config[key] = val
+
+    # validate profiler configs
+    if profiler_config == {} or profiler_config["type"] == "manual":
+        parsable_config = ""
+        profiler_config["type"] = "manual"
+    elif profiler_config["type"] in ["sglang", "vllm", "gap"]:
+        parsable_config = ""
+        need_keys = ["isl", "osl", "concurrencies"]
+        assert all([key in profiler_config for key in need_keys])
+        assert profiler_config["isl"].isnumeric()
+        parsable_config = f"{parsable_config} {profiler_config['isl']}"
+        assert profiler_config["osl"].isnumeric()
+        parsable_config = f"{parsable_config} {profiler_config['osl']}"
+        assert all([x.isnumeric() for x in profiler_config["concurrencies"].split("x")])
+        parsable_config = f"{parsable_config} {profiler_config['concurrencies']}"
+
+        if profiler_config["type"] in ["sglang", "vllm"]:
+            assert "req-rate" in profiler_config
+            assert (
+                profiler_config["req-rate"] == "inf"
+                or profiler_config["req-rate"].isnumeric()
+            )
+            parsable_config = f"{parsable_config} {profiler_config['req-rate']}"
+    else:
+        assert False, profiler_config["type"]
+
    template_vars = {
        "job_name": args.job_name,
        "total_nodes": total_nodes,
@@ -153,12 +264,33 @@ def main(input_args: list[str] | None = None):
        "network_interface": args.network_interface,
        "gpu_type": args.gpu_type,
        "partition": args.partition,
+        "enable_multiple_frontends": args.enable_multiple_frontends,
+        "num_additional_frontends": args.num_additional_frontends,
+        "use_init_location": args.use_init_location,
+        "do_profile": profiler_config["type"] != "manual",
+        "profiler_type": profiler_config["type"],
+        "profiler_arg": parsable_config,
    }

    with tempfile.NamedTemporaryFile(mode="w", suffix=".sh") as temp_file:
        generate_job_script(args.template, temp_file.name, **template_vars)
-        job_id = submit_job(temp_file.name)
-        logging.info(f"Job logs will be available in: logs/{job_id}/")
+
+        submitted_job_ids = []
+        job_id = submit_job(temp_file.name, args.extra_slurm_args)
+        submitted_job_ids.append(job_id)
+        # retries logic
+        extra_slurm_args_without_dependencies = [
+            x for x in args.extra_slurm_args if "dependency" not in x
+        ]
+        for _ in range(args.retries):
+            dependencies = ",".join([f"afternotok:{job}" for job in submitted_job_ids])
+            slurm_args = extra_slurm_args_without_dependencies + [
+                f"dependency={dependencies}"
+            ]
+            job_id = submit_job(temp_file.name, slurm_args)
+            submitted_job_ids.append(job_id)
+
+        print_welcome_message(submitted_job_ids)


 if __name__ == "__main__":

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -139,6 +139,7 @@ addopts = [
    "--ignore-glob=*/llm/tensorrtllm*",
    "--ignore-glob=docs/*",
    "--ignore-glob=components/backends/sglang/src/dynamo/sglang/request_handlers/*",
+    "--ignore-glob=components/backends/sglang/slurm_jobs/*",
    # FIXME: Get relative/generic blob paths to work here
 ]
 xfail_strict = true