Merge branch 'v0.5.4-dtk24.04.1'

e7c1b7f3 · zhuwenwen · 7462218e · 04c62b93 · e7c1b7f3 · e7c1b7f3
Commit e7c1b7f3 authored Sep 06, 2024 by zhuwenwen
20 changed files
--- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
+#!/bin/bash
+
+set -o pipefail
+
+check_gpus() {
+  # check the number of GPUs and GPU type.
+  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
+  if [[ $gpu_count -gt 0 ]]; then
+    echo "GPU found."
+  else
+    echo "Need at least 1 GPU to run benchmarking."
+    exit 1
+  fi
+  declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
+  echo "GPU type is $gpu_type"
+}
+
+kill_gpu_processes() {
+  pkill lmdeploy || true
+  # waiting for GPU processes to be fully killed
+  sleep 10
+  # Print the GPU memory usage
+  # so that we know if all GPU processes are killed.
+  gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
+  # The memory usage should be 0 MB.
+  echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
+}
+
+json2args() {
+  # transforms the JSON string to command line args, and '_' is replaced to '-'
+  # example:
+  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
+  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
+  local json_string=$1
+  local args=$(
+    echo "$json_string" | jq -r '
+      to_entries |
+      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
+      join(" ")
+    '
+  )
+  echo "$args"
+}
+
+wait_for_server() {
+  # wait for vllm server to start
+  # return 1 if vllm server crashes
+  timeout 1200 bash -c '
+    until curl -s localhost:8000/v1/completions > /dev/null; do
+      sleep 1
+    done' && return 0 || return 1
+}
+
+run_serving_tests() {
+  # run serving tests using `benchmark_serving.py`
+  # $1: a json file specifying serving test cases
+
+  local serving_test_file
+  serving_test_file=$1
+
+  # Iterate over serving tests
+  jq -c '.[]' "$serving_test_file" | while read -r params; do
+    # get the test name, and append the GPU type back to it.
+    test_name=$(echo "$params" | jq -r '.test_name')
+
+    # if TEST_SELECTOR is set, only run the test cases that match the selector
+    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+      echo "Skip test case $test_name."
+      continue
+    fi
+    
+    # append lmdeploy to the test name
+    test_name=lmdeploy_$test_name
+    
+    # get common parameters
+    common_params=$(echo "$params" | jq -r '.common_parameters')
+    model=$(echo "$common_params" | jq -r '.model')
+    tp=$(echo "$common_params" | jq -r '.tp')
+    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
+    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
+    port=$(echo "$common_params" | jq -r '.port')
+    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
+
+    
+
+    # get client and server arguments
+    server_params=$(echo "$params" | jq -r '.lmdeploy_server_parameters')
+    client_params=$(echo "$params" | jq -r '.lmdeploy_client_parameters')
+    server_args=$(json2args "$server_params")
+    client_args=$(json2args "$client_params")
+    qps_list=$(echo "$params" | jq -r '.qps_list')
+    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
+    echo "Running over qps list $qps_list"
+
+    # check if there is enough GPU to run the test
+    if [[ $gpu_count -lt $tp ]]; then
+      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
+      continue
+    fi
+
+    # prepare tokenizer
+    rm -rf /tokenizer_cache
+    mkdir /tokenizer_cache
+    python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
+      --model "$model" \
+      --cachedir /tokenizer_cache
+
+    server_command="lmdeploy serve api_server $model \
+      --tp $tp \
+      --server-port $port \
+      $server_args"
+
+    # run the server
+    echo "Running test case $test_name"
+    echo "Server command: $server_command"
+    bash -c "$server_command" &
+
+    # wait until the server is alive
+    wait_for_server
+    if [ $? -eq 0 ]; then
+      echo ""
+      echo "lmdeploy server is up and running."
+    else
+      echo ""
+      echo "lmdeploy failed to start within the timeout period."
+      break
+    fi
+
+    # get model name
+    model_name=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py)
+
+    # iterate over different QPS
+    for qps in $qps_list; do
+      # remove the surrounding single quote from qps
+      if [[ "$qps" == *"inf"* ]]; then
+        echo "qps was $qps"
+        qps="inf"
+        echo "now qps is $qps"
+      fi
+
+      new_test_name=$test_name"_qps_"$qps
+
+      client_command="python3 benchmark_serving.py \
+        --backend lmdeploy \
+        --tokenizer /tokenizer_cache \
+        --dataset-name $dataset_name \
+        --dataset-path $dataset_path \
+        --num-prompts $num_prompts \
+        --port $port \
+        --save-result \
+        --result-dir $RESULTS_FOLDER \
+        --result-filename ${new_test_name}.json \
+        --request-rate $qps \
+        --model \"$model_name\" \
+        $client_args"
+
+      echo "Running test case $test_name with qps $qps"
+      echo "Client command: $client_command"
+
+      eval "$client_command"
+
+      # record the benchmarking commands
+      jq_output=$(jq -n \
+        --arg server "$server_command" \
+        --arg client "$client_command" \
+        --arg gpu "$gpu_type" \
+        --arg engine "lmdeploy" \
+        '{
+          server_command: $server,
+          client_command: $client,
+          gpu_type: $gpu,
+          engine: $engine
+        }')
+      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
+
+    done
+
+    # clean up
+    kill_gpu_processes
+    rm -rf /root/.cache/huggingface/*
+  done
+}
+
+
+upload_to_buildkite() {
+  # upload the benchmarking results to buildkite
+
+  # if the agent binary is not found, skip uploading the results, exit 0
+  if [ ! -f /workspace/buildkite-agent ]; then
+    echo "buildkite-agent binary not found. Skip uploading the results."
+    return 0
+  fi
+  # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
+  /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
+}
+
+
+main() {
+
+  check_gpus
+  # enter vllm directory
+  cd $VLLM_SOURCE_CODE_LOC/benchmarks
+
+  declare -g RESULTS_FOLDER=results/
+  mkdir -p $RESULTS_FOLDER
+  BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
+
+  python -m pip install transformers==4.41.2
+
+  export CURRENT_LLM_SERVING_ENGINE=lmdeploy
+  run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
+  python -m pip install tabulate pandas
+  python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
+  upload_to_buildkite
+
+}
+
+main "$@"
--- a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
+#!/bin/bash
+
+set -o pipefail
+
+check_gpus() {
+  # check the number of GPUs and GPU type.
+  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
+  if [[ $gpu_count -gt 0 ]]; then
+    echo "GPU found."
+  else
+    echo "Need at least 1 GPU to run benchmarking."
+    exit 1
+  fi
+  declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
+  echo "GPU type is $gpu_type"
+}
+
+kill_gpu_processes() {
+  pkill text-generation || true
+  # waiting for GPU processes to be fully killed
+  sleep 10
+  # Print the GPU memory usage
+  # so that we know if all GPU processes are killed.
+  gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
+  # The memory usage should be 0 MB.
+  echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
+}
+
+json2args() {
+  # transforms the JSON string to command line args, and '_' is replaced to '-'
+  # example:
+  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
+  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
+  local json_string=$1
+  local args=$(
+    echo "$json_string" | jq -r '
+      to_entries |
+      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
+      join(" ")
+    '
+  )
+  echo "$args"
+}
+
+wait_for_server() {
+  timeout 1200 bash -c '
+    until curl -s localhost:8000/generate_stream > /dev/null; do
+      sleep 1
+    done' && return 0 || return 1
+}
+
+run_serving_tests() {
+  # run serving tests using `benchmark_serving.py`
+  # $1: a json file specifying serving test cases
+
+  local serving_test_file
+  serving_test_file=$1
+
+  # Iterate over serving tests
+  jq -c '.[]' "$serving_test_file" | while read -r params; do
+    # get the test name, and append the GPU type back to it.
+    test_name=$(echo "$params" | jq -r '.test_name')
+    
+
+    # if TEST_SELECTOR is set, only run the test cases that match the selector
+    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+      echo "Skip test case $test_name."
+      continue
+    fi
+
+    # append tgi to the test name
+    test_name=tgi_$test_name
+
+    # get common parameters
+    common_params=$(echo "$params" | jq -r '.common_parameters')
+    model=$(echo "$common_params" | jq -r '.model')
+    tp=$(echo "$common_params" | jq -r '.tp')
+    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
+    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
+    port=$(echo "$common_params" | jq -r '.port')
+    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
+
+    # get client and server arguments
+    server_params=$(echo "$params" | jq -r '.tgi_server_parameters')
+    client_params=$(echo "$params" | jq -r '.tgi_client_parameters')
+    server_args=$(json2args "$server_params")
+    client_args=$(json2args "$client_params")
+    qps_list=$(echo "$params" | jq -r '.qps_list')
+    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
+    echo "Running over qps list $qps_list"
+
+    # check if there is enough GPU to run the test
+    if [[ $gpu_count -lt $tp ]]; then
+      echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
+      continue
+    fi
+
+    if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
+      echo "Key 'fp8' exists in common params."
+      server_command="/tgi-entrypoint.sh \
+        --model-id $model \
+        --num-shard $tp \
+        --port $port \
+        --quantize fp8 \
+        $server_args"
+    else
+      echo "Key 'fp8' does not exist in common params."
+      server_command="/tgi-entrypoint.sh \
+        --model-id $model \
+        --num-shard $tp \
+        --port $port \
+        $server_args"
+    fi
+
+
+    
+
+    # run the server
+    echo "Running test case $test_name"
+    echo "Server command: $server_command"
+    eval "$server_command" &
+
+    # wait until the server is alive
+    wait_for_server
+    if [ $? -eq 0 ]; then
+      echo ""
+      echo "tgi server is up and running."
+    else
+      echo ""
+      echo "tgi failed to start within the timeout period."
+      break
+    fi
+
+    # iterate over different QPS
+    for qps in $qps_list; do
+      # remove the surrounding single quote from qps
+      if [[ "$qps" == *"inf"* ]]; then
+        echo "qps was $qps"
+        qps="inf"
+        echo "now qps is $qps"
+      fi
+
+      new_test_name=$test_name"_qps_"$qps
+
+      client_command="python3 benchmark_serving.py \
+        --backend tgi \
+        --model $model \
+        --dataset-name $dataset_name \
+        --dataset-path $dataset_path \
+        --num-prompts $num_prompts \
+        --port $port \
+        --save-result \
+        --result-dir $RESULTS_FOLDER \
+        --result-filename ${new_test_name}.json \
+        --request-rate $qps \
+        $client_args"
+
+      echo "Running test case $test_name with qps $qps"
+      echo "Client command: $client_command"
+
+      eval "$client_command"
+
+      # record the benchmarking commands
+      jq_output=$(jq -n \
+        --arg server "$server_command" \
+        --arg client "$client_command" \
+        --arg gpu "$gpu_type" \
+        --arg engine "tgi" \
+        '{
+          server_command: $server,
+          client_command: $client,
+          gpu_type: $gpu,
+          engine: $engine
+        }')
+      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
+
+    done
+
+    # clean up
+    kill_gpu_processes
+    rm -rf /root/.cache/huggingface/*
+  done
+}
+
+
+
+upload_to_buildkite() {
+  # upload the benchmarking results to buildkite
+
+  # if the agent binary is not found, skip uploading the results, exit 0
+  if [ ! -f /workspace/buildkite-agent ]; then
+    echo "buildkite-agent binary not found. Skip uploading the results."
+    return 0
+  fi
+  # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
+  /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
+}
+
+main() {
+
+  check_gpus
+  # enter vllm directory
+  cd $VLLM_SOURCE_CODE_LOC/benchmarks
+  declare -g RESULTS_FOLDER=results/
+  mkdir -p $RESULTS_FOLDER
+  BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
+
+  export CURRENT_LLM_SERVING_ENGINE=tgi
+  run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
+  python -m pip install tabulate pandas
+  python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
+  upload_to_buildkite
+
+}
+
+main "$@"
--- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
+#!/bin/bash
+
+set -o pipefail
+
+check_gpus() {
+  # check the number of GPUs and GPU type.
+  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
+  if [[ $gpu_count -gt 0 ]]; then
+    echo "GPU found."
+  else
+    echo "Need at least 1 GPU to run benchmarking."
+    exit 1
+  fi
+  declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
+  echo "GPU type is $gpu_type"
+}
+
+kill_gpu_processes() {
+  pkill tritonserver || true
+  # waiting for GPU processes to be fully killed
+  sleep 20
+  # Print the GPU memory usage
+  # so that we know if all GPU processes are killed.
+  gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
+  # The memory usage should be 0 MB.
+  echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
+}
+
+json2args() {
+  # transforms the JSON string to command line args, and '_' is replaced to '-'
+  # example:
+  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
+  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
+  local json_string=$1
+  local args=$(
+    echo "$json_string" | jq -r '
+      to_entries |
+      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
+      join(" ")
+    '
+  )
+  echo "$args"
+}
+
+wait_for_server() {
+  timeout 1200 bash -c '
+    until curl -s localhost:8000/generate_stream > /dev/null; do
+      sleep 1
+    done' && return 0 || return 1
+}
+
+run_serving_tests() {
+  # run serving tests using `benchmark_serving.py`
+  # $1: a json file specifying serving test cases
+
+  local serving_test_file
+  serving_test_file=$1
+
+  # Iterate over serving tests
+  jq -c '.[]' "$serving_test_file" | while read -r params; do
+    # get the test name, and append the GPU type back to it.
+    test_name=$(echo "$params" | jq -r '.test_name')
+    
+    # if TEST_SELECTOR is set, only run the test cases that match the selector
+    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+      echo "Skip test case $test_name."
+      continue
+    fi
+
+    # append trt to the test name
+    test_name=trt_$test_name
+
+    # get common parameters
+    common_params=$(echo "$params" | jq -r '.common_parameters')
+    model=$(echo "$common_params" | jq -r '.model')
+    tp=$(echo "$common_params" | jq -r '.tp')
+    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
+    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
+    port=$(echo "$common_params" | jq -r '.port')
+    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
+
+    # get client and server arguments
+    server_params=$(echo "$params" | jq -r '.trt_server_parameters')
+    client_params=$(echo "$params" | jq -r '.trt_client_parameters')
+    client_args=$(json2args "$client_params")
+    qps_list=$(echo "$params" | jq -r '.qps_list')
+    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
+    echo "Running over qps list $qps_list"
+
+    # check if there is enough GPU to run the test
+    if [[ $gpu_count -lt $tp ]]; then
+      echo "Required model_tp_size $tp but only $gpu_count GPU found. Skip testcase $test_name."
+      continue
+    fi
+
+
+
+    cd $VLLM_SOURCE_CODE_LOC/benchmarks
+
+
+    echo "Running test case $test_name"
+    bash ../.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh "$server_params" "$common_params"
+
+    # wait until the server is alive
+    wait_for_server
+    if [ $? -eq 0 ]; then
+      echo ""
+      echo "trt server is up and running."
+    else
+      echo ""
+      echo "trt failed to start within the timeout period."
+      break
+    fi
+
+    # prepare tokenizer
+    cd $VLLM_SOURCE_CODE_LOC/benchmarks
+    rm -rf /tokenizer_cache
+    mkdir /tokenizer_cache
+    python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
+      --model "$model" \
+      --cachedir /tokenizer_cache
+    cd $VLLM_SOURCE_CODE_LOC/benchmarks
+    
+
+    # iterate over different QPS
+    for qps in $qps_list; do
+      # remove the surrounding single quote from qps
+      if [[ "$qps" == *"inf"* ]]; then
+        echo "qps was $qps"
+        qps="inf"
+        echo "now qps is $qps"
+      fi
+
+      new_test_name=$test_name"_qps_"$qps
+
+      client_command="python3 benchmark_serving.py \
+        --backend tensorrt-llm \
+        --tokenizer /tokenizer_cache \
+        --model $model \
+        --dataset-name $dataset_name \
+        --dataset-path $dataset_path \
+        --num-prompts $num_prompts \
+        --port $port \
+        --save-result \
+        --result-dir $RESULTS_FOLDER \
+        --result-filename ${new_test_name}.json \
+        --request-rate $qps \
+        $client_args"
+
+      echo "Running test case $test_name with qps $qps"
+      echo "Client command: $client_command"
+
+      eval "$client_command"
+
+      server_command=""
+      # record the benchmarking commands
+      jq_output=$(jq -n \
+        --arg server "$server_command" \
+        --arg client "$client_command" \
+        --arg gpu "$gpu_type" \
+        --arg engine "trt" \
+        '{
+          server_command: $server,
+          client_command: $client,
+          gpu_type: $gpu,
+          engine: $engine
+        }')
+      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
+
+    done
+
+    # clean up
+    kill_gpu_processes
+    rm -rf /root/.cache/huggingface/*
+  done
+}
+
+upload_to_buildkite() {
+  # upload the benchmarking results to buildkite
+
+  # if the agent binary is not found, skip uploading the results, exit 0
+  if [ ! -f /workspace/buildkite-agent ]; then
+    echo "buildkite-agent binary not found. Skip uploading the results."
+    return 0
+  fi
+  # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
+  /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
+}
+
+
+main() {
+
+  check_gpus
+
+
+  # enter vllm directory
+  cd $VLLM_SOURCE_CODE_LOC/benchmarks
+
+  declare -g RESULTS_FOLDER=results/
+  mkdir -p $RESULTS_FOLDER
+  BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
+
+  # update transformers package, to make sure mixtral tokenizer is available
+  python -m pip install transformers -U
+
+  export CURRENT_LLM_SERVING_ENGINE=trt
+  run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
+  python -m pip install tabulate pandas
+  python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
+  upload_to_buildkite
+
+}
+
+main "$@"
--- a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
+#!/bin/bash
+
+set -o pipefail
+
+check_gpus() {
+  # check the number of GPUs and GPU type.
+  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
+  if [[ $gpu_count -gt 0 ]]; then
+    echo "GPU found."
+  else
+    echo "Need at least 1 GPU to run benchmarking."
+    exit 1
+  fi
+  declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
+  echo "GPU type is $gpu_type"
+}
+
+kill_gpu_processes() {
+  # kill all processes on GPU.
+  pkill pt_main_thread
+  sleep 10
+
+  # remove vllm config file
+  rm -rf ~/.config/vllm
+
+  # Print the GPU memory usage
+  # so that we know if all GPU processes are killed.
+  gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
+  # The memory usage should be 0 MB.
+  echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
+}
+
+json2args() {
+  # transforms the JSON string to command line args, and '_' is replaced to '-'
+  # example:
+  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
+  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
+  local json_string=$1
+  local args=$(
+    echo "$json_string" | jq -r '
+      to_entries |
+      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
+      join(" ")
+    '
+  )
+  echo "$args"
+}
+
+wait_for_server() {
+  # wait for vllm server to start
+  # return 1 if vllm server crashes
+  timeout 1200 bash -c '
+    until curl -s localhost:8000/v1/completions > /dev/null; do
+      sleep 1
+    done' && return 0 || return 1
+}
+
+run_serving_tests() {
+  # run serving tests using `benchmark_serving.py`
+  # $1: a json file specifying serving test cases
+
+  local serving_test_file
+  serving_test_file=$1
+
+  # Iterate over serving tests
+  jq -c '.[]' "$serving_test_file" | while read -r params; do
+    # get the test name, and append the GPU type back to it.
+    test_name=$(echo "$params" | jq -r '.test_name')
+
+    # if TEST_SELECTOR is set, only run the test cases that match the selector
+    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+      echo "Skip test case $test_name."
+      continue
+    fi
+
+    # append vllm to the test name
+    test_name=vllm_$test_name
+
+
+    # get common parameters
+    common_params=$(echo "$params" | jq -r '.common_parameters')
+    model=$(echo "$common_params" | jq -r '.model')
+    tp=$(echo "$common_params" | jq -r '.tp')
+    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
+    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
+    port=$(echo "$common_params" | jq -r '.port')
+    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
+
+    # get client and server arguments
+    server_params=$(echo "$params" | jq -r '.vllm_server_parameters')
+    client_params=$(echo "$params" | jq -r '.vllm_client_parameters')
+    server_args=$(json2args "$server_params")
+    client_args=$(json2args "$client_params")
+    qps_list=$(echo "$params" | jq -r '.qps_list')
+    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
+    echo "Running over qps list $qps_list"
+
+    # check if there is enough GPU to run the test
+    if [[ $gpu_count -lt $tp ]]; then
+      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
+      continue
+    fi
+
+    if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
+      echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
+      model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
+      server_command="python3 \
+        -m vllm.entrypoints.openai.api_server \
+        -tp $tp \
+        --model $model \
+        --port $port \
+        $server_args"
+    else
+      echo "Key 'fp8' does not exist in common params."
+      server_command="python3 \
+        -m vllm.entrypoints.openai.api_server \
+        -tp $tp \
+        --model $model \
+        --port $port \
+        $server_args"
+    fi
+
+    # run the server
+    echo "Running test case $test_name"
+    echo "Server command: $server_command"
+    eval "$server_command" &
+
+    # wait until the server is alive
+    wait_for_server
+    if [ $? -eq 0 ]; then
+      echo ""
+      echo "vllm server is up and running."
+    else
+      echo ""
+      echo "vllm failed to start within the timeout period."
+      break
+    fi
+
+    # iterate over different QPS
+    for qps in $qps_list; do
+      # remove the surrounding single quote from qps
+      if [[ "$qps" == *"inf"* ]]; then
+        echo "qps was $qps"
+        qps="inf"
+        echo "now qps is $qps"
+      fi
+
+      new_test_name=$test_name"_qps_"$qps
+
+      client_command="python3 benchmark_serving.py \
+        --backend vllm \
+        --model $model \
+        --dataset-name $dataset_name \
+        --dataset-path $dataset_path \
+        --num-prompts $num_prompts \
+        --port $port \
+        --save-result \
+        --result-dir $RESULTS_FOLDER \
+        --result-filename ${new_test_name}.json \
+        --request-rate $qps \
+        $client_args"
+
+      echo "Running test case $test_name with qps $qps"
+      echo "Client command: $client_command"
+
+      eval "$client_command"
+
+      # record the benchmarking commands
+      jq_output=$(jq -n \
+        --arg server "$server_command" \
+        --arg client "$client_command" \
+        --arg gpu "$gpu_type" \
+        --arg engine "vllm" \
+        '{
+          server_command: $server,
+          client_command: $client,
+          gpu_type: $gpu,
+          engine: $engine
+        }')
+      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
+
+    done
+
+    # clean up
+    kill_gpu_processes
+    rm -rf /root/.cache/huggingface/*
+  done
+}
+
+
+upload_to_buildkite() {
+  # upload the benchmarking results to buildkite
+
+  # if the agent binary is not found, skip uploading the results, exit 0
+  if [ ! -f /workspace/buildkite-agent ]; then
+    echo "buildkite-agent binary not found. Skip uploading the results."
+    return 0
+  fi
+  # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
+  /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
+}
+
+main() {
+
+  check_gpus
+  # enter vllm directory
+  cd $VLLM_SOURCE_CODE_LOC/benchmarks
+  declare -g RESULTS_FOLDER=results/
+  mkdir -p $RESULTS_FOLDER
+  BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
+
+  export CURRENT_LLM_SERVING_ENGINE=vllm
+  run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
+
+  python3 -m pip install tabulate pandas
+  python3 $BENCHMARK_ROOT/scripts/summary-nightly-results.py
+  upload_to_buildkite
+
+}
+
+main "$@"
--- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
+import datetime
+import json
+import os
+from pathlib import Path
+
+import pandas as pd
+from tabulate import tabulate
+
+results_folder = Path("results/")
+
+# serving results and the keys that will be printed into markdown
+serving_results = []
+serving_column_mapping = {
+    "test_name": "Test name",
+    "gpu_type": "GPU",
+    "completed": "Successful req.",
+    "request_throughput": "Tput (req/s)",
+    "mean_ttft_ms": "Mean TTFT (ms)",
+    "std_ttft_ms": "Std TTFT (ms)",
+    "mean_itl_ms": "Mean ITL (ms)",
+    "std_itl_ms": "Std ITL (ms)",
+    "input_throughput": "Input Tput (tok/s)",
+    "output_throughput": "Output Tput (tok/s)",
+    "engine": "Engine",
+}
+
+if __name__ == "__main__":
+
+    # collect results
+    for test_file in results_folder.glob("*.json"):
+
+        with open(test_file, "r") as f:
+            raw_result = json.loads(f.read())
+
+        # attach the benchmarking command to raw_result
+        with open(test_file.with_suffix(".commands"), "r") as f:
+            command = json.loads(f.read())
+        raw_result.update(command)
+
+        # update the test name of this result
+        raw_result.update({"test_name": test_file.stem})
+
+        # add the result to raw_result
+        serving_results.append(raw_result)
+        continue
+
+    serving_results = pd.DataFrame.from_dict(serving_results)
+
+    if not serving_results.empty:
+        serving_results = serving_results[list(
+            serving_column_mapping.keys())].rename(
+                columns=serving_column_mapping)
+
+    serving_md_table_with_headers = tabulate(serving_results,
+                                             headers='keys',
+                                             tablefmt='pipe',
+                                             showindex=False)
+    # remove the first line of header
+    serving_md_table_lines = serving_md_table_with_headers.split('\n')
+    serving_md_table_without_header = '\n'.join(serving_md_table_lines[2:])
+
+    prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+    prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE")
+
+    # document benchmarking results in markdown
+    with open(results_folder / f"{prefix}_nightly_results.md", "w") as f:
+        # document results with header.
+        # for those who wants to reproduce our benchmark.
+        f.write(serving_md_table_with_headers)
+        f.write('\n')
+
+    # document benchmarking results in json
+    with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:
+
+        results = serving_results.to_dict(orient='records')
+        f.write(json.dumps(results))
--- a/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
+++ b/.buildkite/nightly-benchmarks/scripts/wait-for-image.sh
+#!/bin/sh
+TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull" | jq -r .token)
+URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
+
+retries=0
+while [ $retries -lt 1000 ]; do
+    if [ $(curl -s -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" $URL) -eq 200 ]; then
+        exit 0
+    fi
+
+    echo "Waiting for image to be available..."
+
+    retries=$((retries + 1))
+    sleep 5
+done
+
+exit 1
\ No newline at end of file
--- a/.buildkite/nightly-benchmarks/tests/descriptions.md
+++ b/.buildkite/nightly-benchmarks/tests/descriptions.md
+
+## Latency tests
+
+This test suite aims to test vllm's end-to-end latency under a controlled setup.
+
+- Input length: 32 tokens.
+- Output length: 128 tokens.
+- Batch size: fixed (8).
+- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
+- Evaluation metrics: end-to-end latency (mean, median, p99).
+
+### Latency benchmarking results
+
+{latency_tests_markdown_table}
+
+## Throughput tests
+
+This test suite aims to test vllm's throughput.
+
+- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
+- Output length: the corresponding output length of these 200 prompts.
+- Batch size: dynamically determined by vllm to achieve maximum throughput.
+- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
+- Evaluation metrics: throughput.
+
+### Throughput benchmarking results
+
+{throughput_tests_markdown_table}
+
+## Serving tests
+
+This test suite aims to test vllm's real serving metrics.
+
+- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
+- Output length: the corresponding output length of these 200 prompts.
+- Batch size: dynamically determined by vllm and the arrival pattern of the requests.
+- **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
+- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
+- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
+
+### Serving benchmarking results
+
+{serving_tests_markdown_table}
+
+## json version of the benchmarking tables
+
+This section contains the data of the markdown tables above in JSON format. 
+You can load the benchmarking tables into pandas dataframes as follows:
+
+```python
+import json
+import pandas as pd
+
+benchmarking_results_json = """The json string"""
+benchmarking_results = json.loads(benchmarking_results_json)
+latency_results = pd.DataFrame.from_dict(benchmarking_results["latency"])
+throughput_results = pd.DataFrame.from_dict(benchmarking_results["throughput"])
+serving_results = pd.DataFrame.from_dict(benchmarking_results["serving"])
+```
+
+The json string for all benchmarking tables:
+```json
+{benchmarking_results_in_json_string}
+```
+
+You can also check the raw experiment data in the Artifact tab of the Buildkite page.
+
--- a/.buildkite/nightly-benchmarks/tests/latency-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/latency-tests.json
+[
+    {
+        "test_name": "latency_llama8B_tp1",
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3-8B",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "num_iters_warmup": 5,
+            "num_iters": 15
+        }
+    },
+    {
+        "test_name": "latency_llama70B_tp4",
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+            "tensor_parallel_size": 4,
+            "load_format": "dummy",
+            "num-iters-warmup": 5,
+            "num-iters": 15
+        }
+    },
+    {
+        "test_name": "latency_mixtral8x7B_tp2",
+        "parameters": {
+            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "tensor_parallel_size": 2,
+            "load_format": "dummy",
+            "num-iters-warmup": 5,
+            "num-iters": 15
+        }
+    }
+]
\ No newline at end of file
--- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+[
+    {
+        "test_name": "llama8B_tp1",
+        "qps_list": [4],
+        "common_parameters": {
+            "model": "meta-llama/Meta-Llama-3-8B",
+            "tp": 1,
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 500,
+            "port": 8000
+        },
+        "lmdeploy_server_parameters": {
+        },
+        "lmdeploy_client_parameters": {
+        },
+        "tgi_server_parameters": {
+        },
+        "tgi_client_parameters": {
+            "endpoint": "/generate_stream"
+        },
+        "trt_server_parameters": {
+            "model_type": "llama",
+            "model_dtype": "float16",
+            "max_batch_size": 256,
+            "max_input_len": 4096,
+            "max_output_len": 4096,
+            "trt_llm_version": "r24.04"
+        },
+        "trt_client_parameters": {
+            "endpoint": "/v2/models/ensemble/generate_stream"
+        },
+        "vllm_server_parameters": {
+            "disable_log_stats": "",
+            "disable_log_requests": ""
+        },
+        "vllm_client_parameters": {
+        }
+    },
+    {
+        "test_name": "llama70B_tp4",
+        "qps_list": [2],
+        "common_parameters": {
+            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+            "tp": 4,
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 500,
+            "port": 8000
+        },
+        "lmdeploy_server_parameters": {
+        },
+        "lmdeploy_client_parameters": {
+        },
+        "tgi_server_parameters": {
+        },
+        "tgi_client_parameters": {
+            "endpoint": "/generate_stream"
+        },
+        "trt_server_parameters": {
+            "model_type": "llama",
+            "model_dtype": "float16",
+            "max_batch_size": 256,
+            "max_input_len": 4096,
+            "max_output_len": 4096,
+            "trt_llm_version": "r24.04"
+        },
+        "trt_client_parameters": {
+            "endpoint": "/v2/models/ensemble/generate_stream"
+        },
+        "vllm_server_parameters": {
+            "disable_log_stats": "",
+            "disable_log_requests": ""
+        },
+        "vllm_client_parameters": {
+        }
+    },
+    {
+        "test_name": "mixtral8x7B_tp2",
+        "qps_list": [2],
+        "common_parameters": {
+            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "tp": 2,
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 500,
+            "port": 8000
+        },
+        "lmdeploy_server_parameters": {
+        },
+        "lmdeploy_client_parameters": {
+        },
+        "tgi_server_parameters": {
+        },
+        "tgi_client_parameters": {
+            "endpoint": "/generate_stream"
+        },
+        "trt_server_parameters": {
+            "model_type": "llama",
+            "model_dtype": "float16",
+            "max_batch_size": 256,
+            "max_input_len": 4096,
+            "max_output_len": 4096,
+            "trt_llm_version": "r24.04"
+        },
+        "trt_client_parameters": {
+            "endpoint": "/v2/models/ensemble/generate_stream"
+        },
+        "vllm_server_parameters": {
+            "disable_log_stats": "",
+            "disable_log_requests": ""
+        },
+        "vllm_client_parameters": {
+        }
+    }
+]
\ No newline at end of file
--- a/.buildkite/nightly-benchmarks/tests/serving-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/serving-tests.json
+[
+    {
+        "test_name": "serving_llama8B_tp1_sharegpt",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_parameters": {
+            "model": "meta-llama/Meta-Llama-3-8B",
+            "tensor_parallel_size": 1,
+            "swap_space": 16,
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "meta-llama/Meta-Llama-3-8B",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama70B_tp4_sharegpt",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_parameters": {
+            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+            "tensor_parallel_size": 4,
+            "swap_space": 16,
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_mixtral8x7B_tp2_sharegpt",
+        "qps_list": [1, 4, 16, "inf"],
+        "server_parameters": {
+            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "tensor_parallel_size": 2,
+            "swap_space": 16,
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama70B_tp4_sharegpt_specdecode",
+        "qps_list": [2],
+        "server_parameters": {
+            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+            "disable_log_requests": "", 
+            "tensor_parallel_size": 4,
+            "swap_space": 16, 
+            "speculative_model": "turboderp/Qwama-0.5B-Instruct",
+            "num_speculative_tokens": 4,
+            "speculative_draft_tensor_parallel_size": 1,
+            "use_v2_block_manager": ""
+        },
+        "client_parameters": {
+            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200 
+        }
+    }
+]
--- a/.buildkite/nightly-benchmarks/tests/throughput-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/throughput-tests.json
+[
+    {
+        "test_name": "throughput_llama8B_tp1",
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3-8B",
+            "tensor_parallel_size": 1,
+            "load_format": "dummy",
+            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200,
+            "backend": "vllm"
+        }
+    },
+    {
+        "test_name": "throughput_llama70B_tp4",
+        "parameters": {
+            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+            "tensor_parallel_size": 4,
+            "load_format": "dummy",
+            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200,
+            "backend": "vllm"
+        }
+    },
+    {
+        "test_name": "throughput_mixtral8x7B_tp2",
+        "parameters": {
+            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "tensor_parallel_size": 2,
+            "load_format": "dummy",
+            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200,
+            "backend": "vllm"
+        }
+    }
+]
\ No newline at end of file
--- a/.buildkite/release-pipeline.yaml
+++ b/.buildkite/release-pipeline.yaml
+steps:
+  - label: "Build wheel - CUDA {{matrix.cuda_version}}"
+    agents:
+      queue: cpu_queue
+    commands:
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION={{matrix.cuda_version}} --tag vllm-ci:build-image --target build --progress plain ."
+      - "mkdir artifacts"
+      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+      # rename the files to change linux -> manylinux1
+      - "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done"
+      - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
+      - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/"
+    env:
+      DOCKER_BUILDKIT: "1"
+    matrix:
+      setup:
+        cuda_version:
+          - "11.8.0"
+          - "12.1.0"
--- a/.buildkite/run-amd-test.sh
+++ b/.buildkite/run-amd-test.sh
@@ -2,6 +2,15 @@
 set -ex

 # Print ROCm version
+echo "--- Confirming Clean Initial State"
+while true; do
+        sleep 3
+        if grep -q clean /opt/amdgpu/etc/gpu_state; then
+                echo "GPUs state is \"clean\""
+                break
+        fi
+done
+
 echo "--- ROCm info"
 rocminfo

@@ -45,15 +54,10 @@ while true; do
        fi
 done

-echo "--- Building container"
-sha=$(git rev-parse --short HEAD)
-image_name=rocm_${sha}
-container_name=rocm_${sha}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)
-docker build \
-        -t ${image_name} \
-        -f Dockerfile.rocm \
-        --progress plain \
-        .
+echo "--- Pulling container" 
+image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
+container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
+docker pull ${image_name}

 remove_docker_container() {
   docker rm -f ${container_name} || docker image rm -f ${image_name} || true
@@ -62,11 +66,18 @@ trap remove_docker_container EXIT

 echo "--- Running container"

+HF_CACHE="$(realpath ~)/huggingface"
+mkdir -p ${HF_CACHE}
+HF_MOUNT="/root/.cache/huggingface"
+
 docker run \
        --device /dev/kfd --device /dev/dri \
        --network host \
+        --shm-size=16gb \
        --rm \
        -e HF_TOKEN \
+        -v ${HF_CACHE}:${HF_MOUNT} \
+        -e HF_HOME=${HF_MOUNT} \
        --name ${container_name} \
        ${image_name} \
        /bin/bash -c "${@}"

--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -3,22 +3,38 @@
 set -ex

 # Try building the docker image
-docker build -t cpu-test -f Dockerfile.cpu .
+numactl -C 48-95 -N 1 docker build -t cpu-test -f Dockerfile.cpu .
+numactl -C 48-95 -N 1 docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu .

 # Setup cleanup
-remove_docker_container() { docker rm -f cpu-test || true; }
+remove_docker_container() { docker rm -f cpu-test cpu-test-avx2 || true; }
 trap remove_docker_container EXIT
 remove_docker_container

-# Run the image
-docker run -itd -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test
+# Run the image, setting --shm-size=4g for tensor parallel.
+docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \
+ --cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
+docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \
+ --cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2

 # offline inference
-docker exec cpu-test bash -c "python3 examples/offline_inference.py"
+docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py"

 # Run basic model test
-docker exec cpu-test bash -c "cd tests;
+docker exec cpu-test bash -c "
  pip install pytest Pillow protobuf
-  bash ../.buildkite/download-images.sh
-  cd ../
-  pytest -v -s tests/models --ignore=tests/models/test_llava.py --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py"
+  pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
+
+# online inference
+docker exec cpu-test bash -c "
+  export VLLM_CPU_KVCACHE_SPACE=10 
+  export VLLM_CPU_OMP_THREADS_BIND=48-92 
+  python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m & 
+  timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
+  python3 benchmarks/benchmark_serving.py \
+    --backend vllm \
+    --dataset-name random \
+    --model facebook/opt-125m \
+    --num-prompts 20 \
+    --endpoint /v1/completions \
+    --tokenizer facebook/opt-125m"
--- a/.buildkite/run-multi-node-test.sh
+++ b/.buildkite/run-multi-node-test.sh
+#!/bin/bash
+
+set -euox pipefail
+
+if [[ $# -lt 4 ]]; then
+    echo "Usage: .buildkite/run-multi-node-test.sh WORKING_DIR NUM_NODES NUM_GPUS DOCKER_IMAGE COMMAND1 COMMAND2 ... COMMANDN"
+    exit 1
+fi
+
+WORKING_DIR=$1
+NUM_NODES=$2
+NUM_GPUS=$3
+DOCKER_IMAGE=$4
+
+shift 4
+COMMANDS=("$@")
+if [ ${#COMMANDS[@]} -ne $NUM_NODES ]; then
+    echo "The number of commands must be equal to the number of nodes."
+    echo "Number of nodes: $NUM_NODES"
+    echo "Number of commands: ${#COMMANDS[@]}"
+    exit 1
+fi
+
+echo "List of commands"
+for command in "${COMMANDS[@]}"; do
+    echo $command
+done
+
+start_network() {
+    docker network create --subnet=192.168.10.0/24 docker-net
+}
+
+start_nodes() {
+    for node in $(seq 0 $(($NUM_NODES-1))); do
+        GPU_DEVICES='"device='
+        for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
+            DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
+            GPU_DEVICES+=$(($DEVICE_NUM))
+            if [ $node_gpu -lt $(($NUM_GPUS - 1)) ]; then
+                GPU_DEVICES+=','
+            fi
+        done
+        GPU_DEVICES+='"'
+
+        # start the container in detached mode
+        # things to note:
+        # 1. --shm-size=10.24gb is required. don't use --ipc=host
+        # 2. pass HF_TOKEN to the container
+        # 3. map the huggingface cache directory to the container
+        # 3. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes:
+        #    starting from 192.168.10.11)
+        docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN -v ~/.cache/huggingface:/root/.cache/huggingface --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE /bin/bash -c "tail -f /dev/null"
+
+        # organize containers into a ray cluster
+        if [ $node -eq 0 ]; then
+            # start the ray head node
+            docker exec -d node$node /bin/bash -c "ray start --head --port=6379 --block"
+            # wait for the head node to be ready
+            sleep 10
+        else
+            # start the ray worker nodes, and connect them to the head node
+            docker exec -d node$node /bin/bash -c "ray start --address=192.168.10.10:6379 --block"
+        fi
+    done
+
+    # wait for the cluster to be ready
+    sleep 10
+
+    # print the cluster status
+    docker exec node0 /bin/bash -c "ray status"
+}
+
+run_nodes() {
+    # important: iterate in reverse order to start the head node last
+    # we start the worker nodes first, in detached mode, and then start the head node
+    # in the foreground, so that the output of the head node is visible in the buildkite logs
+    for node in $(seq $(($NUM_NODES - 1)) -1 0); do
+        GPU_DEVICES='"device='
+        for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
+            DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
+            GPU_DEVICES+=$(($DEVICE_NUM))
+            if [ $node_gpu -lt $(($NUM_GPUS - 1)) ]; then
+                GPU_DEVICES+=','
+            fi
+        done
+        GPU_DEVICES+='"'
+        echo "Running node$node with GPU devices: $GPU_DEVICES"
+        if [ $node -ne 0 ]; then
+            docker exec -d node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
+        else
+            docker exec node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
+        fi
+    done
+}
+cleanup() {
+    for node in $(seq 0 $(($NUM_NODES-1))); do
+        docker stop node$node
+    done
+    docker network rm docker-net
+}
+trap cleanup EXIT
+start_network
+start_nodes
+run_nodes
+
--- a/.buildkite/run-openvino-test.sh
+++ b/.buildkite/run-openvino-test.sh
+# This script build the OpenVINO docker image and run the offline inference inside the container.
+# It serves a sanity check for compilation and basic model usage.
+set -ex
+
+# Try building the docker image
+docker build -t openvino-test -f Dockerfile.openvino .
+
+# Setup cleanup
+remove_docker_container() { docker rm -f openvino-test || true; }
+trap remove_docker_container EXIT
+remove_docker_container
+
+# Run the image and launch offline inference
+docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/vllm/examples/offline_inference.py
--- a/.buildkite/run-tpu-test.sh
+++ b/.buildkite/run-tpu-test.sh
+set -e
+
+# Build the docker image.
+docker build -f Dockerfile.tpu -t vllm-tpu .
+
+# Set up cleanup.
+remove_docker_container() { docker rm -f tpu-test || true; }
+trap remove_docker_container EXIT
+# Remove the container that might not be cleaned up in the previous run.
+remove_docker_container
+
+# For HF_TOKEN.
+source /etc/environment
+# Run a simple end-to-end example.
+docker run --privileged --net host --shm-size=16G -it -e HF_TOKEN=$HF_TOKEN --name tpu-test vllm-tpu \
+    python3 /workspace/vllm/examples/offline_inference_tpu.py
--- a/.buildkite/run-xpu-test.sh
+++ b/.buildkite/run-xpu-test.sh
+# This script build the CPU docker image and run the offline inference inside the container.
+# It serves a sanity check for compilation and basic model usage.
+set -ex
+
+# Try building the docker image
+docker build -t xpu-test -f Dockerfile.xpu .
+
+# Setup cleanup
+remove_docker_container() { docker rm -f xpu-test || true; }
+trap remove_docker_container EXIT
+remove_docker_container
+
+# Run the image and launch offline inference
+docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path xpu-test python3 examples/offline_inference.py
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
 # In this file, you can add more tests to run either by adding a new step or
 # adding a new command to an existing step. See different options here for examples.
-# This script will be feed into Jinja template in `test-template.j2` to generate
-# the final pipeline yaml file.
+
+# This script will be feed into Jinja template in `test-template-aws.j2` at
+# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2 
+# to generate the final pipeline yaml file.
+

 steps:
+- label: Async Engine, Inputs, Utils, Worker Test
+  fast_check: true
+  fast_check_only: true
+  commands:
+  - pytest -v -s async_engine # Async Engine
+  - pytest -v -s test_inputs.py
+  - pytest -v -s multimodal
+  - pytest -v -s test_utils.py # Utils
+  - pytest -v -s worker # Worker
+
+- label: Metrics, Tracing Test
+  fast_check: true
+  fast_check_only: true
+  commands:
+  - pytest -v -s metrics # Metrics
+  - "pip install \
+      opentelemetry-sdk \
+      opentelemetry-api \
+      opentelemetry-exporter-otlp \
+      opentelemetry-semantic-conventions-ai" # Tracing
+  - pytest -v -s tracing
+
 - label: Regression Test
  mirror_hardwares: [amd]
+  fast_check: true
  command: pytest -v -s test_regression.py
  working_dir: "/vllm-workspace/tests" # optional

@@ -15,94 +41,120 @@ steps:

 - label: Basic Correctness Test
  mirror_hardwares: [amd]
+  fast_check: true
  commands:
-  - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py
-  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py
+  # This flashinfer installation will fail on AMD ROCm, so it is set as optional.
+  - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp310-cp310-linux_x86_64.whl || true
+  - pytest -v -s basic_correctness/test_basic_correctness.py
+  - pytest -v -s basic_correctness/test_cpu_offload.py
  - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py

 - label: Core Test
  mirror_hardwares: [amd]
-  command: pytest -v -s core
+  fast_check: true
+  commands:
+  - pytest -v -s core

 - label: Distributed Comm Ops Test
  #mirror_hardwares: [amd]
-  command: pytest -v -s distributed/test_comm_ops.py
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
+  commands:
+  - pytest -v -s distributed/test_comm_ops.py
+  - pytest -v -s distributed/test_shm_broadcast.py
+
+- label: 2 Node Tests (4 GPUs in total)
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  num_nodes: 2
+  commands:
+  - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
+    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
+    - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
+  - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
+    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py

- label: Distributed Tests
+- label: Distributed Tests (2 GPUs)
  mirror_hardwares: [amd]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  commands:
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
-  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
-  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
-  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
-  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
-  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
-  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
-  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
-  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
-  - pytest -v -s spec_decode/e2e/test_integration_dist.py
+  - TARGET_TEST_SUITE=L4 pytest -v -s distributed/test_basic_distributed_correctness.py
+  - pytest -v -s distributed/test_chunked_prefill_distributed.py
+  - pytest -v -s distributed/test_multimodal_broadcast.py
+  - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py

- label: Distributed Tests (Multiple Groups)
+- label: Distributed Tests (4 GPUs)
  #mirror_hardwares: [amd]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
+  fast_check: true
  commands:
  - pytest -v -s distributed/test_pynccl.py
+  - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
+
+- label: Pipeline Parallelism Test
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  commands:
+  - pytest -v -s distributed/test_pipeline_parallel.py

 - label: Engine Test
  mirror_hardwares: [amd]
-  command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py
+  commands:
+  - pytest -v -s engine test_sequence.py test_config.py test_logger.py
+  # OOM in the CI unless we run this separately
+  - pytest -v -s tokenization

 - label: Entrypoints Test
+  fast_check: true
  mirror_hardwares: [amd]

  commands:
-  - pytest -v -s entrypoints -m llm
-  - pytest -v -s entrypoints -m openai
+  - pytest -v -s entrypoints/llm
+  - pytest -v -s entrypoints/openai

 - label: Examples Test
  working_dir: "/vllm-workspace/examples"
  mirror_hardwares: [amd]
  commands:
-    # install aws cli for llava_example.py
    # install tensorizer for tensorize_vllm_model.py
    - pip install awscli tensorizer
    - python3 offline_inference.py
+    - python3 cpu_offload.py
    - python3 offline_inference_with_prefix.py
    - python3 llm_engine_example.py
-    - python3 llava_example.py
+    - python3 offline_inference_vision_language.py
    - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors

 - label: Inputs Test
  #mirror_hardwares: [amd]
  commands:
-    - bash ../.buildkite/download-images.sh
    - pytest -v -s test_inputs.py
    - pytest -v -s multimodal

- label: Kernels Test %N
-  #mirror_hardwares: [amd]
-  command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
-  parallelism: 4
+# - label: Kernels Test %N
+#   #mirror_hardwares: [amd]
+#   commands:
+#     - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
+#     - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+#   parallelism: 4

 - label: Models Test
  #mirror_hardwares: [amd]
  commands:
-    - pytest -v -s models -m \"not llava\"
+    - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp310-cp310-linux_x86_64.whl
+    - pytest -v -s models -m \"not vlm\"

- label: Llava Test
+- label: Vision Language Models Test
  mirror_hardwares: [amd]
  commands:
-    - bash ../.buildkite/download-images.sh
-    - pytest -v -s models -m llava
+    - pytest -v -s models -m vlm

 - label: Prefix Caching Test
  mirror_hardwares: [amd]
@@ -118,7 +170,9 @@ steps:
  command: pytest -v -s test_logits_processor.py

 - label: Utils Test
-  command: pytest -v -s test_utils.py
+  commands:
+    - pytest -v -s test_utils.py
+    - pytest -v -s test_embedded_commit.py

 - label: Worker Test
  mirror_hardwares: [amd]
@@ -131,21 +185,28 @@ steps:
    - export VLLM_ATTENTION_BACKEND=XFORMERS
    - pytest -v -s spec_decode

- label: LoRA Test %N
-  #mirror_hardwares: [amd]
-  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
-  parallelism: 4
+# - label: LoRA Test %N
+#   #mirror_hardwares: [amd]
+#   command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
+#   parallelism: 4

- label: LoRA Long Context (Distributed)
-  #mirror_hardwares: [amd]
-  num_gpus: 4
-  # This test runs llama 13B, so it is required to run on 4 GPUs.
-  commands:
-    - pytest -v -s -x lora/test_long_context.py
+# - label: LoRA Long Context (Distributed)
+#   #mirror_hardwares: [amd]
+#   num_gpus: 4
+#   # This test runs llama 13B, so it is required to run on 4 GPUs.
+#   commands:
+#     # FIXIT: find out which code initialize cuda before running the test
+#     # before the fix, we need to use spawn to test it
+#     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+#     - pytest -v -s -x lora/test_long_context.py

 - label: Tensorizer Test
  #mirror_hardwares: [amd]
-  command: apt-get install curl libsodium23 && pytest -v -s tensorizer_loader
+  fast_check: true
+  commands:
+    - apt-get install -y curl libsodium23
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    - pytest -v -s tensorizer_loader

 - label: Metrics Test
  mirror_hardwares: [amd]
@@ -155,6 +216,15 @@ steps:
  #mirror_hardwares: [amd]
  command: pytest -v -s quantization

+- label: Tracing Test
+  commands: 
+    - "pip install \
+        opentelemetry-sdk \
+        opentelemetry-api \
+        opentelemetry-exporter-otlp \
+        opentelemetry-semantic-conventions-ai"
+    - pytest -v -s tracing
+
 - label: Benchmarks
  working_dir: "/vllm-workspace/.buildkite"
  mirror_hardwares: [amd]
@@ -162,9 +232,37 @@ steps:
  - pip install aiohttp
  - bash run-benchmarks.sh

+- label: LM Eval Small Models
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  commands:
+  - pip install lm-eval
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - bash ./run-tests.sh -c configs/models-small.txt -t 1
+
+- label: LM Eval Large Models
+  gpu: a100
+  num_gpus: 4
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  commands:
+  - pip install lm-eval
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - bash ./run-tests.sh -c configs/models-large.txt -t 4
+
 - label: Documentation Build
  working_dir: "/vllm-workspace/test_docs/docs"
+  fast_check: true
  no_gpu: True
  commands:
  - pip install -r requirements-docs.txt
  - SPHINXOPTS=\"-W\" make html
+
+- label: Distributed Tests (A100)
+  gpu: a100
+  num_gpus: 4
+  commands: 
+  # NOTE: don't test llama model here, it seems hf implementation is buggy
+  # see https://github.com/vllm-project/vllm/pull/5689 for details
+  - pytest -v -s distributed/test_custom_all_reduce.py
+  - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp310-cp310-linux_x86_64.whl
+  - TARGET_TEST_SUITE=A100 pytest -v -s distributed/test_basic_distributed_correctness.py
+  - pytest -v -s -x lora/test_mixtral.py
--- a/.buildkite/test-template-aws.j2
+++ b/.buildkite/test-template-aws.j2
-{% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT" %}
-{% set default_working_dir = "/vllm-workspace/tests" %}
-
-steps:
-  - label: ":docker: build image"
-    agents:
-      queue: cpu_queue
-    commands:
-      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --tag {{ docker_image }} --target test --progress plain ."
-      - "docker push {{ docker_image }}"
-    env:
-      DOCKER_BUILDKIT: "1"
-    retry:
-      automatic:
-        - exit_status: -1  # Agent was lost
-          limit: 5
-        - exit_status: -10  # Agent was lost
-          limit: 5
-  - wait
-
-  - group: "AMD Tests"
-    depends_on: ~
-    steps:
-    {% for step in steps %}
-    {% if step.mirror_hardwares and "amd" in step.mirror_hardwares %}
-      - label: "AMD: {{ step.label }}"
-        agents:
-          queue: amd
-        command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe  }} ; {{ step.command  or (step.commands | join(" ; ")) | safe }}"
-        env:
-          DOCKER_BUILDKIT: "1"
-        soft_fail: true
-    {% endif %}
-    {% endfor %}
-
-  - label: "Neuron Test"
-    depends_on: ~
-    agents:
-      queue: neuron
-    command: bash .buildkite/run-neuron-test.sh
-    soft_fail: false
-
-  - label: "Intel Test"
-    depends_on: ~
-    agents:
-      queue: intel
-    command: bash .buildkite/run-cpu-test.sh
-
-  {% for step in steps %}
-  - label: "{{ step.label }}"
-    agents:
-      {% if step.label == "Documentation Build" %}
-      queue: small_cpu_queue
-      {% elif step.no_gpu %}
-      queue: cpu_queue
-      {% elif step.num_gpus == 2 or step.num_gpus == 4 %}
-      queue: gpu_4_queue
-      {% else %}
-      queue: gpu_1_queue
-      {% endif %}
-    soft_fail: {{ step.soft_fail or false }}
-    {% if step.parallelism %}
-    parallelism: {{ step.parallelism }}
-    {% endif %}
-    retry:
-      automatic:
-        - exit_status: -1  # Agent was lost
-          limit: 5
-        - exit_status: -10  # Agent was lost
-          limit: 5
-    plugins:
-      - docker#v5.2.0:
-          image: {{ docker_image }}
-          always-pull: true
-          propagate-environment: true
-          {% if not step.no_gpu %}
-          gpus: all
-          {% endif %}
-          {% if step.label == "Benchmarks" %}
-          mount-buildkite-agent: true
-          {% endif %}
-          command: ["bash", "-c", "cd {{ (step.working_dir or default_working_dir) | safe  }} && {{ step.command  or (step.commands | join(' && ')) | safe }}"]
-          environment:
-            - VLLM_USAGE_SOURCE=ci-test
-            - HF_TOKEN
-            {% if step.label == "Speculative decoding tests" %}
-            - VLLM_ATTENTION_BACKEND=XFORMERS
-            {% endif %}
-          volumes:
-            - /dev/shm:/dev/shm
-  {% endfor %}