Commit 4d3a2c28 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.6.5' into v0.6.5-dev

parents 92ec5d8e 2d1b9baa
#!/bin/bash
set -o pipefail
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
echo "GPU type is $gpu_type"
}
kill_gpu_processes() {
pkill tritonserver || true
# waiting for GPU processes to be fully killed
sleep 20
# Print the GPU memory usage
# so that we know if all GPU processes are killed.
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
# The memory usage should be 0 MB.
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
json2args() {
# transforms the JSON string to command line args, and '_' is replaced to '-'
# example:
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
local json_string=$1
local args=$(
echo "$json_string" | jq -r '
to_entries |
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
join(" ")
'
)
echo "$args"
}
wait_for_server() {
timeout 1200 bash -c '
until curl -s localhost:8000/generate_stream > /dev/null; do
sleep 1
done' && return 0 || return 1
}
run_serving_tests() {
# run serving tests using `benchmark_serving.py`
# $1: a json file specifying serving test cases
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
jq -c '.[]' "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# append trt to the test name
test_name=trt_$test_name
# get common parameters
common_params=$(echo "$params" | jq -r '.common_parameters')
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
# get client and server arguments
server_params=$(echo "$params" | jq -r '.trt_server_parameters')
client_params=$(echo "$params" | jq -r '.trt_client_parameters')
client_args=$(json2args "$client_params")
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
# check if there is enough GPU to run the test
if [[ $gpu_count -lt $tp ]]; then
echo "Required model_tp_size $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
cd $VLLM_SOURCE_CODE_LOC/benchmarks
echo "Running test case $test_name"
bash ../.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh "$server_params" "$common_params"
# wait until the server is alive
wait_for_server
if [ $? -eq 0 ]; then
echo ""
echo "trt server is up and running."
else
echo ""
echo "trt failed to start within the timeout period."
break
fi
# prepare tokenizer
cd $VLLM_SOURCE_CODE_LOC/benchmarks
rm -rf /tokenizer_cache
mkdir /tokenizer_cache
python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
--model "$model" \
--cachedir /tokenizer_cache
cd $VLLM_SOURCE_CODE_LOC/benchmarks
# iterate over different QPS
for qps in $qps_list; do
# remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf"
echo "now qps is $qps"
fi
new_test_name=$test_name"_qps_"$qps
client_command="python3 benchmark_serving.py \
--backend tensorrt-llm \
--tokenizer /tokenizer_cache \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--num-prompts $num_prompts \
--port $port \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
$client_args"
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
eval "$client_command"
server_command=""
# record the benchmarking commands
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
--arg engine "trt" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu,
engine: $engine
}')
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done
# clean up
kill_gpu_processes
rm -rf /root/.cache/huggingface/*
done
}
upload_to_buildkite() {
# upload the benchmarking results to buildkite
# if the agent binary is not found, skip uploading the results, exit 0
if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip uploading the results."
return 0
fi
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}
main() {
check_gpus
# enter vllm directory
cd $VLLM_SOURCE_CODE_LOC/benchmarks
declare -g RESULTS_FOLDER=results/
mkdir -p $RESULTS_FOLDER
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
# update transformers package, to make sure mixtral tokenizer is available
python -m pip install transformers -U
export CURRENT_LLM_SERVING_ENGINE=trt
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
python -m pip install tabulate pandas
python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
upload_to_buildkite
}
main "$@"
#!/bin/bash
set -o pipefail
check_gpus() {
# check the number of GPUs and GPU type.
declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
if [[ $gpu_count -gt 0 ]]; then
echo "GPU found."
else
echo "Need at least 1 GPU to run benchmarking."
exit 1
fi
declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
echo "GPU type is $gpu_type"
}
kill_gpu_processes() {
# kill all processes on GPU.
pkill pt_main_thread
sleep 10
# remove vllm config file
rm -rf ~/.config/vllm
# Print the GPU memory usage
# so that we know if all GPU processes are killed.
gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
# The memory usage should be 0 MB.
echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
}
json2args() {
# transforms the JSON string to command line args, and '_' is replaced to '-'
# example:
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
local json_string=$1
local args=$(
echo "$json_string" | jq -r '
to_entries |
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
join(" ")
'
)
echo "$args"
}
wait_for_server() {
# wait for vllm server to start
# return 1 if vllm server crashes
timeout 1200 bash -c '
until curl -s localhost:8000/v1/completions > /dev/null; do
sleep 1
done' && return 0 || return 1
}
run_serving_tests() {
# run serving tests using `benchmark_serving.py`
# $1: a json file specifying serving test cases
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
jq -c '.[]' "$serving_test_file" | while read -r params; do
# get the test name, and append the GPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# append vllm to the test name
test_name=vllm_$test_name
# get common parameters
common_params=$(echo "$params" | jq -r '.common_parameters')
model=$(echo "$common_params" | jq -r '.model')
tp=$(echo "$common_params" | jq -r '.tp')
dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
port=$(echo "$common_params" | jq -r '.port')
num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
# get client and server arguments
server_params=$(echo "$params" | jq -r '.vllm_server_parameters')
client_params=$(echo "$params" | jq -r '.vllm_client_parameters')
server_args=$(json2args "$server_params")
client_args=$(json2args "$client_params")
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
# check if there is enough GPU to run the test
if [[ $gpu_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
continue
fi
if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
server_command="python3 \
-m vllm.entrypoints.openai.api_server \
-tp $tp \
--model $model \
--port $port \
$server_args"
else
echo "Key 'fp8' does not exist in common params."
server_command="python3 \
-m vllm.entrypoints.openai.api_server \
-tp $tp \
--model $model \
--port $port \
$server_args"
fi
# run the server
echo "Running test case $test_name"
echo "Server command: $server_command"
eval "$server_command" &
# wait until the server is alive
wait_for_server
if [ $? -eq 0 ]; then
echo ""
echo "vllm server is up and running."
else
echo ""
echo "vllm failed to start within the timeout period."
break
fi
# iterate over different QPS
for qps in $qps_list; do
# remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf"
echo "now qps is $qps"
fi
new_test_name=$test_name"_qps_"$qps
client_command="python3 benchmark_serving.py \
--backend vllm \
--model $model \
--dataset-name $dataset_name \
--dataset-path $dataset_path \
--num-prompts $num_prompts \
--port $port \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
$client_args"
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
eval "$client_command"
# record the benchmarking commands
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
--arg engine "vllm" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu,
engine: $engine
}')
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done
# clean up
kill_gpu_processes
rm -rf /root/.cache/huggingface/*
done
}
upload_to_buildkite() {
# upload the benchmarking results to buildkite
# if the agent binary is not found, skip uploading the results, exit 0
if [ ! -f /workspace/buildkite-agent ]; then
echo "buildkite-agent binary not found. Skip uploading the results."
return 0
fi
# /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
/workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}
main() {
check_gpus
# enter vllm directory
cd $VLLM_SOURCE_CODE_LOC/benchmarks
declare -g RESULTS_FOLDER=results/
mkdir -p $RESULTS_FOLDER
BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
export CURRENT_LLM_SERVING_ENGINE=vllm
run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
python3 -m pip install tabulate pandas
python3 $BENCHMARK_ROOT/scripts/summary-nightly-results.py
upload_to_buildkite
}
main "$@"
...@@ -17,10 +17,17 @@ serving_column_mapping = { ...@@ -17,10 +17,17 @@ serving_column_mapping = {
"request_throughput": "Tput (req/s)", "request_throughput": "Tput (req/s)",
"mean_ttft_ms": "Mean TTFT (ms)", "mean_ttft_ms": "Mean TTFT (ms)",
"std_ttft_ms": "Std TTFT (ms)", "std_ttft_ms": "Std TTFT (ms)",
"median_ttft_ms": "Median TTFT (ms)",
"mean_itl_ms": "Mean ITL (ms)", "mean_itl_ms": "Mean ITL (ms)",
"std_itl_ms": "Std ITL (ms)", "std_itl_ms": "Std ITL (ms)",
"input_throughput": "Input Tput (tok/s)", "median_itl_ms": "Median ITL (ms)",
"mean_tpot_ms": "Mean TPOT (ms)",
"std_tpot_ms": "Std TPOT (ms)",
"median_tpot_ms": "Median TPOT (ms)",
"total_token_throughput": "Total Token Tput (tok/s)",
"output_throughput": "Output Tput (tok/s)", "output_throughput": "Output Tput (tok/s)",
"total_input_tokens": "Total input tokens",
"total_output_tokens": "Total output tokens",
"engine": "Engine", "engine": "Engine",
} }
...@@ -29,11 +36,11 @@ if __name__ == "__main__": ...@@ -29,11 +36,11 @@ if __name__ == "__main__":
# collect results # collect results
for test_file in results_folder.glob("*.json"): for test_file in results_folder.glob("*.json"):
with open(test_file, "r") as f: with open(test_file) as f:
raw_result = json.loads(f.read()) raw_result = json.loads(f.read())
# attach the benchmarking command to raw_result # attach the benchmarking command to raw_result
with open(test_file.with_suffix(".commands"), "r") as f: with open(test_file.with_suffix(".commands")) as f:
command = json.loads(f.read()) command = json.loads(f.read())
raw_result.update(command) raw_result.update(command)
......
#!/bin/sh #!/bin/sh
TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-test-repo:pull" | jq -r .token) TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token)
URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT" URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT"
TIMEOUT_SECONDS=10 TIMEOUT_SECONDS=10
retries=0 retries=0
while [ $retries -lt 1000 ]; do while [ $retries -lt 1000 ]; do
if [ $(curl -s --max-time $TIMEOUT_SECONDS -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" $URL) -eq 200 ]; then if [ "$(curl -s --max-time "$TIMEOUT_SECONDS" -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" "$URL")" -eq 200 ]; then
exit 0 exit 0
fi fi
...@@ -16,4 +16,4 @@ while [ $retries -lt 1000 ]; do ...@@ -16,4 +16,4 @@ while [ $retries -lt 1000 ]; do
sleep 5 sleep 5
done done
exit 1 exit 1
\ No newline at end of file
[ [
{ {
"test_name": "llama8B_tp1", "test_name": "llama8B_tp1_sharegpt",
"qps_list": [4], "qps_list": [4,8,16,32,"inf"],
"common_parameters": { "common_parameters": {
"model": "meta-llama/Meta-Llama-3-8B", "model": "meta-llama/Meta-Llama-3-8B-Instruct",
"tp": 1, "tp": 1,
"dataset_name": "sharegpt", "dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 500, "num_prompts": 500,
"port": 8000 "port": 8000,
"reuse_server": false
}, },
"lmdeploy_server_parameters": { "lmdeploy_server_parameters": {
"dtype": "bfloat16"
}, },
"lmdeploy_client_parameters": { "lmdeploy_client_parameters": {
}, },
...@@ -21,34 +23,158 @@ ...@@ -21,34 +23,158 @@
}, },
"trt_server_parameters": { "trt_server_parameters": {
"model_type": "llama", "model_type": "llama",
"model_dtype": "float16", "model_dtype": "bfloat16",
"max_batch_size": 256, "max_batch_size": 2048,
"max_input_len": 4096, "max_input_len": 4096,
"max_output_len": 4096, "max_seq_len": 6144,
"trt_llm_version": "r24.04" "max_num_tokens": 16384,
"trt_llm_version": "v0.11.0"
}, },
"trt_client_parameters": { "trt_client_parameters": {
"endpoint": "/v2/models/ensemble/generate_stream" "endpoint": "/v2/models/ensemble/generate_stream"
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
},
"vllm_client_parameters": {
},
"sglang_server_parameters": {
"disable_radix_cache": "",
"enable_torch_compile": "",
"dtype": "bfloat16"
},
"sglang_client_parameters": {
}
},
{
"test_name": "llama8B_tp1_sonnet_512_16",
"qps_list": [4,8,16,32,"inf"],
"common_parameters": {
"model": "meta-llama/Meta-Llama-3-8B-Instruct",
"tp": 1,
"dataset_name": "sonnet",
"dataset_path": "./sonnet_4x.txt",
"num_prompts": 500,
"port": 8000,
"sonnet_input_len": 512,
"sonnet_output_len": 16,
"sonnet_prefix_len": 50,
"reuse_server": true
},
"lmdeploy_server_parameters": {
"dtype": "bfloat16"
},
"lmdeploy_client_parameters": {
},
"tgi_server_parameters": {
},
"tgi_client_parameters": {
"endpoint": "/generate_stream"
},
"trt_server_parameters": {
"model_type": "llama",
"model_dtype": "bfloat16",
"max_batch_size": 2048,
"max_input_len": 4096,
"max_seq_len": 6144,
"max_num_tokens": 16384,
"trt_llm_version": "v0.11.0"
},
"trt_client_parameters": {
"endpoint": "/v2/models/ensemble/generate_stream"
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
},
"vllm_client_parameters": {
},
"sglang_server_parameters": {
"disable_radix_cache": "",
"enable_torch_compile": "",
"dtype": "bfloat16"
},
"sglang_client_parameters": {
}
},
{
"test_name": "llama8B_tp1_sonnet_512_256",
"qps_list": [4,8,16,32,"inf"],
"common_parameters": {
"model": "meta-llama/Meta-Llama-3-8B-Instruct",
"tp": 1,
"dataset_name": "sonnet",
"dataset_path": "./sonnet_4x.txt",
"num_prompts": 500,
"port": 8000,
"sonnet_input_len": 512,
"sonnet_output_len": 256,
"sonnet_prefix_len": 50,
"reuse_server": true
},
"lmdeploy_server_parameters": {
"dtype": "bfloat16"
},
"lmdeploy_client_parameters": {
},
"tgi_server_parameters": {
},
"tgi_client_parameters": {
"endpoint": "/generate_stream"
},
"trt_server_parameters": {
"model_type": "llama",
"model_dtype": "bfloat16",
"max_batch_size": 2048,
"max_input_len": 4096,
"max_seq_len": 6144,
"max_num_tokens": 16384,
"trt_llm_version": "v0.11.0"
}, },
"trt_client_parameters": {
"endpoint": "/v2/models/ensemble/generate_stream"
},
"vllm_server_parameters": { "vllm_server_parameters": {
"disable_log_stats": "", "disable_log_stats": "",
"disable_log_requests": "" "disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
}, },
"vllm_client_parameters": { "vllm_client_parameters": {
},
"sglang_server_parameters": {
"disable_radix_cache": "",
"enable_torch_compile": "",
"dtype": "bfloat16"
},
"sglang_client_parameters": {
} }
}, },
{ {
"test_name": "llama70B_tp4", "test_name": "llama70B_tp4_sharegpt",
"qps_list": [2], "qps_list": [4,8,16,32,"inf"],
"common_parameters": { "common_parameters": {
"model": "meta-llama/Meta-Llama-3-70B-Instruct", "model": "meta-llama/Meta-Llama-3-70B-Instruct",
"tp": 4, "tp": 4,
"dataset_name": "sharegpt", "dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 500, "num_prompts": 500,
"port": 8000 "port": 8000,
"reuse_server": false
}, },
"lmdeploy_server_parameters": { "lmdeploy_server_parameters": {
"dtype": "bfloat16"
}, },
"lmdeploy_client_parameters": { "lmdeploy_client_parameters": {
}, },
...@@ -59,34 +185,50 @@ ...@@ -59,34 +185,50 @@
}, },
"trt_server_parameters": { "trt_server_parameters": {
"model_type": "llama", "model_type": "llama",
"model_dtype": "float16", "model_dtype": "bfloat16",
"max_batch_size": 256, "max_batch_size": 2048,
"max_input_len": 4096, "max_input_len": 4096,
"max_output_len": 4096, "max_seq_len": 6144,
"trt_llm_version": "r24.04" "max_num_tokens": 16384,
"trt_llm_version": "v0.11.0"
}, },
"trt_client_parameters": { "trt_client_parameters": {
"endpoint": "/v2/models/ensemble/generate_stream" "endpoint": "/v2/models/ensemble/generate_stream"
}, },
"vllm_server_parameters": { "vllm_server_parameters": {
"disable_log_stats": "", "disable_log_stats": "",
"disable_log_requests": "" "disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
}, },
"vllm_client_parameters": { "vllm_client_parameters": {
},
"sglang_server_parameters": {
"disable_radix_cache": "",
"dtype": "bfloat16"
},
"sglang_client_parameters": {
} }
}, },
{ {
"test_name": "mixtral8x7B_tp2", "test_name": "llama70B_tp4_sonnet_512_16",
"qps_list": [2], "qps_list": [4,8,16,32,"inf"],
"common_parameters": { "common_parameters": {
"model": "mistralai/Mixtral-8x7B-Instruct-v0.1", "model": "meta-llama/Meta-Llama-3-70B-Instruct",
"tp": 2, "tp": 4,
"dataset_name": "sharegpt", "dataset_name": "sonnet",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "dataset_path": "./sonnet_4x.txt",
"num_prompts": 500, "num_prompts": 500,
"port": 8000 "port": 8000,
"sonnet_input_len": 512,
"sonnet_output_len": 16,
"sonnet_prefix_len": 50,
"reuse_server": true
}, },
"lmdeploy_server_parameters": { "lmdeploy_server_parameters": {
"dtype": "bfloat16"
}, },
"lmdeploy_client_parameters": { "lmdeploy_client_parameters": {
}, },
...@@ -97,20 +239,85 @@ ...@@ -97,20 +239,85 @@
}, },
"trt_server_parameters": { "trt_server_parameters": {
"model_type": "llama", "model_type": "llama",
"model_dtype": "float16", "model_dtype": "bfloat16",
"max_batch_size": 256, "max_batch_size": 2048,
"max_input_len": 4096, "max_input_len": 4096,
"max_output_len": 4096, "max_seq_len": 6144,
"trt_llm_version": "r24.04" "max_num_tokens": 16384,
"trt_llm_version": "v0.11.0"
}, },
"trt_client_parameters": { "trt_client_parameters": {
"endpoint": "/v2/models/ensemble/generate_stream" "endpoint": "/v2/models/ensemble/generate_stream"
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
},
"vllm_client_parameters": {
}, },
"sglang_server_parameters": {
"disable_radix_cache": "",
"dtype": "bfloat16"
},
"sglang_client_parameters": {
}
},
{
"test_name": "llama70B_tp4_sonnet_512_256",
"qps_list": [4,8,16,32,"inf"],
"common_parameters": {
"model": "meta-llama/Meta-Llama-3-70B-Instruct",
"tp": 4,
"dataset_name": "sonnet",
"dataset_path": "./sonnet_4x.txt",
"num_prompts": 500,
"port": 8000,
"sonnet_input_len": 512,
"sonnet_output_len": 256,
"sonnet_prefix_len": 50,
"reuse_server": true
},
"lmdeploy_server_parameters": {
"dtype": "bfloat16"
},
"lmdeploy_client_parameters": {
},
"tgi_server_parameters": {
},
"tgi_client_parameters": {
"endpoint": "/generate_stream"
},
"trt_server_parameters": {
"model_type": "llama",
"model_dtype": "bfloat16",
"max_batch_size": 2048,
"max_input_len": 4096,
"max_seq_len": 6144,
"max_num_tokens": 16384,
"trt_llm_version": "v0.11.0"
},
"trt_client_parameters": {
"endpoint": "/v2/models/ensemble/generate_stream"
},
"vllm_server_parameters": { "vllm_server_parameters": {
"disable_log_stats": "", "disable_log_stats": "",
"disable_log_requests": "" "disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
}, },
"vllm_client_parameters": { "vllm_client_parameters": {
},
"sglang_server_parameters": {
"disable_radix_cache": "",
"dtype": "bfloat16"
},
"sglang_client_parameters": {
} }
} }
] ]
\ No newline at end of file
steps: steps:
- label: "Build wheel - CUDA 12.1" - label: "Build wheel - CUDA 12.1"
agents: agents:
queue: cpu_queue queue: cpu_queue_postmerge
commands: commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts" - "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
# rename the files to change linux -> manylinux1 - "bash .buildkite/upload-wheels.sh"
- "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done"
- "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
- "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/"
env: env:
DOCKER_BUILDKIT: "1" DOCKER_BUILDKIT: "1"
- block: "Build CUDA 11.8 wheel" # Note(simon): We can always build CUDA 11.8 wheel to ensure the build is working.
key: block-build-cu118-wheel # However, this block can be uncommented to save some compute hours.
# - block: "Build CUDA 11.8 wheel"
# key: block-build-cu118-wheel
- label: "Build wheel - CUDA 11.8" - label: "Build wheel - CUDA 11.8"
depends_on: block-build-cu118-wheel # depends_on: block-build-cu118-wheel
agents: agents:
queue: cpu_queue queue: cpu_queue_postmerge
commands: commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ." - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts" - "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
# rename the files to change linux -> manylinux1 - "bash .buildkite/upload-wheels.sh"
- "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done" env:
- "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/" DOCKER_BUILDKIT: "1"
- "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/"
- block: "Build release image"
depends_on: ~
key: block-release-image-build
- label: "Build release image"
depends_on: block-release-image-build
agents:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- label: "Build and publish TPU release image"
depends_on: ~
if: build.env("NIGHTLY") == "1"
agents:
queue: tpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ."
- "docker push vllm/vllm-tpu:nightly"
- "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
plugins:
- docker-login#v3.0.0:
username: vllm
password-env: DOCKERHUB_TOKEN
env: env:
DOCKER_BUILDKIT: "1" DOCKER_BUILDKIT: "1"
#!/bin/bash
# This script runs test inside the corresponding ROCm docker container. # This script runs test inside the corresponding ROCm docker container.
set -o pipefail set -o pipefail
...@@ -31,8 +33,8 @@ cleanup_docker() { ...@@ -31,8 +33,8 @@ cleanup_docker() {
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..." echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
# Remove dangling images (those that are not tagged and not used by any container) # Remove dangling images (those that are not tagged and not used by any container)
docker image prune -f docker image prune -f
# Remove unused volumes # Remove unused volumes / force the system prune for old images as well.
docker volume prune -f docker volume prune -f && docker system prune --force --filter "until=72h" --all
echo "Docker images and volumes cleanup completed." echo "Docker images and volumes cleanup completed."
else else
echo "Disk usage is below $threshold%. No cleanup needed." echo "Disk usage is below $threshold%. No cleanup needed."
...@@ -57,17 +59,17 @@ done ...@@ -57,17 +59,17 @@ done
echo "--- Pulling container" echo "--- Pulling container"
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}" image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
docker pull ${image_name} docker pull "${image_name}"
remove_docker_container() { remove_docker_container() {
docker rm -f ${container_name} || docker image rm -f ${image_name} || true docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
} }
trap remove_docker_container EXIT trap remove_docker_container EXIT
echo "--- Running container" echo "--- Running container"
HF_CACHE="$(realpath ~)/huggingface" HF_CACHE="$(realpath ~)/huggingface"
mkdir -p ${HF_CACHE} mkdir -p "${HF_CACHE}"
HF_MOUNT="/root/.cache/huggingface" HF_MOUNT="/root/.cache/huggingface"
commands=$@ commands=$@
...@@ -83,7 +85,6 @@ if [[ $commands == *" kernels "* ]]; then ...@@ -83,7 +85,6 @@ if [[ $commands == *" kernels "* ]]; then
--ignore=kernels/test_encoder_decoder_attn.py \ --ignore=kernels/test_encoder_decoder_attn.py \
--ignore=kernels/test_flash_attn.py \ --ignore=kernels/test_flash_attn.py \
--ignore=kernels/test_flashinfer.py \ --ignore=kernels/test_flashinfer.py \
--ignore=kernels/test_gguf.py \
--ignore=kernels/test_int8_quant.py \ --ignore=kernels/test_int8_quant.py \
--ignore=kernels/test_machete_gemm.py \ --ignore=kernels/test_machete_gemm.py \
--ignore=kernels/test_mamba_ssm.py \ --ignore=kernels/test_mamba_ssm.py \
...@@ -107,35 +108,36 @@ fi ...@@ -107,35 +108,36 @@ fi
PARALLEL_JOB_COUNT=8 PARALLEL_JOB_COUNT=8
# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs. # check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
if [[ $commands == *"--shard-id="* ]]; then if [[ $commands == *"--shard-id="* ]]; then
# assign job count as the number of shards used
commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "}
for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do
#replace shard arguments # assign shard-id for each shard
commands=${commands//"--shard-id= "/"--shard-id=${GPU} "} commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "} echo "Shard ${GPU} commands:$commands_gpu"
echo "Shard ${GPU} commands:$commands"
docker run \ docker run \
--device /dev/kfd --device /dev/dri \ --device /dev/kfd --device /dev/dri \
--network host \ --network host \
--shm-size=16gb \ --shm-size=16gb \
--rm \ --rm \
-e HIP_VISIBLE_DEVICES=${GPU} \ -e HIP_VISIBLE_DEVICES="${GPU}" \
-e HF_TOKEN \ -e HF_TOKEN \
-v ${HF_CACHE}:${HF_MOUNT} \ -v "${HF_CACHE}:${HF_MOUNT}" \
-e HF_HOME=${HF_MOUNT} \ -e "HF_HOME=${HF_MOUNT}" \
--name ${container_name}_${GPU} \ --name "${container_name}_${GPU}" \
${image_name} \ "${image_name}" \
/bin/bash -c "${commands}" \ /bin/bash -c "${commands_gpu}" \
|& while read -r line; do echo ">>Shard $GPU: $line"; done & |& while read -r line; do echo ">>Shard $GPU: $line"; done &
PIDS+=($!) PIDS+=($!)
done done
#wait for all processes to finish and collect exit codes #wait for all processes to finish and collect exit codes
for pid in ${PIDS[@]}; do for pid in "${PIDS[@]}"; do
wait ${pid} wait "${pid}"
STATUS+=($?) STATUS+=($?)
done done
for st in ${STATUS[@]}; do for st in "${STATUS[@]}"; do
if [[ ${st} -ne 0 ]]; then if [[ ${st} -ne 0 ]]; then
echo "One of the processes failed with $st" echo "One of the processes failed with $st"
exit ${st} exit "${st}"
fi fi
done done
else else
...@@ -146,9 +148,9 @@ else ...@@ -146,9 +148,9 @@ else
--rm \ --rm \
-e HIP_VISIBLE_DEVICES=0 \ -e HIP_VISIBLE_DEVICES=0 \
-e HF_TOKEN \ -e HF_TOKEN \
-v ${HF_CACHE}:${HF_MOUNT} \ -v "${HF_CACHE}:${HF_MOUNT}" \
-e HF_HOME=${HF_MOUNT} \ -e "HF_HOME=${HF_MOUNT}" \
--name ${container_name} \ --name "${container_name}" \
${image_name} \ "${image_name}" \
/bin/bash -c "${commands}" /bin/bash -c "${commands}"
fi fi
#!/bin/bash
# This script is run by buildkite to run the benchmarks and upload the results to buildkite # This script is run by buildkite to run the benchmarks and upload the results to buildkite
set -ex set -ex
......
#!/bin/bash
# This script build the CPU docker image and run the offline inference inside the container. # This script build the CPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage. # It serves a sanity check for compilation and basic model usage.
set -ex set -ex
# Try building the docker image
docker build -t cpu-test -f Dockerfile.ppc64le .
# Setup cleanup # Setup cleanup
remove_docker_container() { docker rm -f cpu-test || true; } remove_docker_container() { docker rm -f cpu-test || true; docker system prune -f; }
trap remove_docker_container EXIT trap remove_docker_container EXIT
remove_docker_container remove_docker_container
# Run the image, setting --shm-size=4g for tensor parallel. # Try building the docker image
source /etc/environment docker build -t cpu-test -f Dockerfile.ppc64le .
#docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN=$HF_TOKEN --name cpu-test cpu-test
# Run basic model test
docker exec cpu-test bash -c "
pip install pytest matplotlib einops transformers_stream_generator
pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_oot_registration.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
# online inference
docker exec cpu-test bash -c "
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
python3 benchmarks/benchmark_serving.py \
--backend vllm \
--dataset-name random \
--model facebook/opt-125m \
--num-prompts 20 \
--endpoint /v1/completions \
--tokenizer facebook/opt-125m"
#!/bin/bash
# This script build the CPU docker image and run the offline inference inside the container. # This script build the CPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage. # It serves a sanity check for compilation and basic model usage.
set -ex set -ex
# allow to bind to different cores
CORE_RANGE=${CORE_RANGE:-48-95}
NUMA_NODE=${NUMA_NODE:-1}
# Try building the docker image # Try building the docker image
numactl -C 48-95 -N 1 docker build -t cpu-test -f Dockerfile.cpu . numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test -f Dockerfile.cpu .
numactl -C 48-95 -N 1 docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu . numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu .
# Setup cleanup # Setup cleanup
remove_docker_container() { docker rm -f cpu-test cpu-test-avx2 || true; } remove_docker_container() { docker rm -f cpu-test-"$NUMA_NODE" cpu-test-avx2-"$NUMA_NODE" || true; }
trap remove_docker_container EXIT trap remove_docker_container EXIT
remove_docker_container remove_docker_container
# Run the image, setting --shm-size=4g for tensor parallel. # Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
--cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 \ docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
--cpuset-mems=1 --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2 cpu-test-avx2 --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2-"$NUMA_NODE" cpu-test-avx2
# offline inference function cpu_tests() {
docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py" set -e
export NUMA_NODE=$2
# Run basic model test
docker exec cpu-test bash -c " # offline inference
pip install pytest matplotlib einops transformers_stream_generator datamodel_code_generator docker exec cpu-test-avx2-"$NUMA_NODE" bash -c "
pytest -v -s tests/models/decoder_only/language \ set -e
--ignore=tests/models/test_fp8.py \ python3 examples/offline_inference.py"
--ignore=tests/models/decoder_only/language/test_jamba.py \
--ignore=tests/models/decoder_only/language/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported # Run basic model test
docker exec cpu-test-"$NUMA_NODE" bash -c "
# Run compressed-tensor test set -e
docker exec cpu-test bash -c " pip install pytest pytest-asyncio \
pytest -s -v \ decord einops librosa peft Pillow sentence-transformers soundfile \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \ transformers_stream_generator matplotlib datamodel_code_generator
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynanmic_per_token" pip install torchvision --index-url https://download.pytorch.org/whl/cpu
pytest -v -s tests/models/decoder_only/language -m cpu_model
# online inference pytest -v -s tests/models/embedding/language -m cpu_model
docker exec cpu-test bash -c " pytest -v -s tests/models/encoder_decoder/language -m cpu_model
export VLLM_CPU_KVCACHE_SPACE=10 pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
export VLLM_CPU_OMP_THREADS_BIND=48-92 pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1 # Run compressed-tensor test
python3 benchmarks/benchmark_serving.py \ docker exec cpu-test-"$NUMA_NODE" bash -c "
--backend vllm \ set -e
--dataset-name random \ pytest -s -v \
--model facebook/opt-125m \ tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
--num-prompts 20 \ tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
--endpoint /v1/completions \
--tokenizer facebook/opt-125m" # Run AWQ test
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -s -v \
tests/quantization/test_ipex_quant.py"
# Run chunked-prefill and prefix-cache test
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -s -v -k cpu_model \
tests/basic_correctness/test_chunked_prefill.py"
# online inference
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
export VLLM_CPU_KVCACHE_SPACE=10
export VLLM_CPU_OMP_THREADS_BIND=$1
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
python3 benchmarks/benchmark_serving.py \
--backend vllm \
--dataset-name random \
--model facebook/opt-125m \
--num-prompts 20 \
--endpoint /v1/completions \
--tokenizer facebook/opt-125m"
}
# All of CPU tests are expected to be finished less than 25 mins.
export -f cpu_tests
timeout 30m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
#!/bin/bash
# This script build the GH200 docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex
# Try building the docker image
DOCKER_BUILDKIT=1 docker build . \
--target vllm-openai \
--platform "linux/arm64" \
-t gh200-test \
--build-arg max_jobs=66 \
--build-arg nvcc_threads=2 \
--build-arg torch_cuda_arch_list="9.0+PTX" \
--build-arg vllm_fa_cmake_gpu_arches="90-real"
# Setup cleanup
remove_docker_container() { docker rm -f gh200-test || true; }
trap remove_docker_container EXIT
remove_docker_container
# Run the image and test offline inference
docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
python3 examples/offline_inference.py
'
#!/bin/bash
# This script build the CPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex
# Try building the docker image
docker build -t hpu-test-env -f Dockerfile.hpu .
# Setup cleanup
remove_docker_container() { docker rm -f hpu-test || true; }
trap remove_docker_container EXIT
remove_docker_container
# Run the image and launch offline inference
docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference.py
\ No newline at end of file
...@@ -14,7 +14,7 @@ DOCKER_IMAGE=$4 ...@@ -14,7 +14,7 @@ DOCKER_IMAGE=$4
shift 4 shift 4
COMMANDS=("$@") COMMANDS=("$@")
if [ ${#COMMANDS[@]} -ne $NUM_NODES ]; then if [ ${#COMMANDS[@]} -ne "$NUM_NODES" ]; then
echo "The number of commands must be equal to the number of nodes." echo "The number of commands must be equal to the number of nodes."
echo "Number of nodes: $NUM_NODES" echo "Number of nodes: $NUM_NODES"
echo "Number of commands: ${#COMMANDS[@]}" echo "Number of commands: ${#COMMANDS[@]}"
...@@ -23,7 +23,7 @@ fi ...@@ -23,7 +23,7 @@ fi
echo "List of commands" echo "List of commands"
for command in "${COMMANDS[@]}"; do for command in "${COMMANDS[@]}"; do
echo $command echo "$command"
done done
start_network() { start_network() {
...@@ -36,7 +36,7 @@ start_nodes() { ...@@ -36,7 +36,7 @@ start_nodes() {
for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu)) DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
GPU_DEVICES+=$(($DEVICE_NUM)) GPU_DEVICES+=$(($DEVICE_NUM))
if [ $node_gpu -lt $(($NUM_GPUS - 1)) ]; then if [ "$node_gpu" -lt $(($NUM_GPUS - 1)) ]; then
GPU_DEVICES+=',' GPU_DEVICES+=','
fi fi
done done
...@@ -49,17 +49,20 @@ start_nodes() { ...@@ -49,17 +49,20 @@ start_nodes() {
# 3. map the huggingface cache directory to the container # 3. map the huggingface cache directory to the container
# 3. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes: # 3. assign ip addresses to the containers (head node: 192.168.10.10, worker nodes:
# starting from 192.168.10.11) # starting from 192.168.10.11)
docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN -v ~/.cache/huggingface:/root/.cache/huggingface --name node$node --network docker-net --ip 192.168.10.$((10 + $node)) --rm $DOCKER_IMAGE /bin/bash -c "tail -f /dev/null" docker run -d --gpus "$GPU_DEVICES" --shm-size=10.24gb -e HF_TOKEN \
-v ~/.cache/huggingface:/root/.cache/huggingface --name "node$node" \
--network docker-net --ip 192.168.10.$((10 + $node)) --rm "$DOCKER_IMAGE" \
/bin/bash -c "tail -f /dev/null"
# organize containers into a ray cluster # organize containers into a ray cluster
if [ $node -eq 0 ]; then if [ "$node" -eq 0 ]; then
# start the ray head node # start the ray head node
docker exec -d node$node /bin/bash -c "ray start --head --port=6379 --block" docker exec -d "node$node" /bin/bash -c "ray start --head --port=6379 --block"
# wait for the head node to be ready # wait for the head node to be ready
sleep 10 sleep 10
else else
# start the ray worker nodes, and connect them to the head node # start the ray worker nodes, and connect them to the head node
docker exec -d node$node /bin/bash -c "ray start --address=192.168.10.10:6379 --block" docker exec -d "node$node" /bin/bash -c "ray start --address=192.168.10.10:6379 --block"
fi fi
done done
...@@ -79,22 +82,22 @@ run_nodes() { ...@@ -79,22 +82,22 @@ run_nodes() {
for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do for node_gpu in $(seq 0 $(($NUM_GPUS - 1))); do
DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu)) DEVICE_NUM=$(($node * $NUM_GPUS + $node_gpu))
GPU_DEVICES+=$(($DEVICE_NUM)) GPU_DEVICES+=$(($DEVICE_NUM))
if [ $node_gpu -lt $(($NUM_GPUS - 1)) ]; then if [ "$node_gpu" -lt $(($NUM_GPUS - 1)) ]; then
GPU_DEVICES+=',' GPU_DEVICES+=','
fi fi
done done
GPU_DEVICES+='"' GPU_DEVICES+='"'
echo "Running node$node with GPU devices: $GPU_DEVICES" echo "Running node$node with GPU devices: $GPU_DEVICES"
if [ $node -ne 0 ]; then if [ "$node" -ne 0 ]; then
docker exec -d node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}" docker exec -d "node$node" /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
else else
docker exec node$node /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}" docker exec "node$node" /bin/bash -c "cd $WORKING_DIR ; ${COMMANDS[$node]}"
fi fi
done done
} }
cleanup() { cleanup() {
for node in $(seq 0 $(($NUM_NODES-1))); do for node in $(seq 0 $(($NUM_NODES-1))); do
docker stop node$node docker stop "node$node"
done done
docker network rm docker-net docker network rm docker-net
} }
......
#!/bin/bash
# This script build the Neuron docker image and run the API server inside the container. # This script build the Neuron docker image and run the API server inside the container.
# It serves a sanity check for compilation and basic model usage. # It serves a sanity check for compilation and basic model usage.
set -e set -e
...@@ -12,10 +14,10 @@ if [ -f /tmp/neuron-docker-build-timestamp ]; then ...@@ -12,10 +14,10 @@ if [ -f /tmp/neuron-docker-build-timestamp ]; then
current_time=$(date +%s) current_time=$(date +%s)
if [ $((current_time - last_build)) -gt 86400 ]; then if [ $((current_time - last_build)) -gt 86400 ]; then
docker system prune -f docker system prune -f
echo $current_time > /tmp/neuron-docker-build-timestamp echo "$current_time" > /tmp/neuron-docker-build-timestamp
fi fi
else else
echo $(date +%s) > /tmp/neuron-docker-build-timestamp date "+%s" > /tmp/neuron-docker-build-timestamp
fi fi
docker build -t neuron -f Dockerfile.neuron . docker build -t neuron -f Dockerfile.neuron .
...@@ -34,7 +36,7 @@ wait_for_server_to_start() { ...@@ -34,7 +36,7 @@ wait_for_server_to_start() {
timeout=300 timeout=300
counter=0 counter=0
while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do while [ "$(curl -s -o /dev/null -w '%{http_code}' localhost:8000/health)" != "200" ]; do
sleep 1 sleep 1
counter=$((counter + 1)) counter=$((counter + 1))
if [ $counter -ge $timeout ]; then if [ $counter -ge $timeout ]; then
......
#!/bin/bash
# This script build the OpenVINO docker image and run the offline inference inside the container. # This script build the OpenVINO docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage. # It serves a sanity check for compilation and basic model usage.
set -ex set -ex
...@@ -11,4 +13,4 @@ trap remove_docker_container EXIT ...@@ -11,4 +13,4 @@ trap remove_docker_container EXIT
remove_docker_container remove_docker_container
# Run the image and launch offline inference # Run the image and launch offline inference
docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/vllm/examples/offline_inference.py docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference.py
#!/bin/bash
set -e set -e
# Build the docker image. # Build the docker image.
...@@ -12,4 +14,4 @@ remove_docker_container ...@@ -12,4 +14,4 @@ remove_docker_container
# For HF_TOKEN. # For HF_TOKEN.
source /etc/environment source /etc/environment
# Run a simple end-to-end example. # Run a simple end-to-end example.
docker run --privileged --net host --shm-size=16G -it -e HF_TOKEN=$HF_TOKEN --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py" docker run --privileged --net host --shm-size=16G -it -e "HF_TOKEN=$HF_TOKEN" --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && python3 -m pip install lm_eval[api]==0.4.4 && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py"
#!/bin/bash
# This script build the CPU docker image and run the offline inference inside the container. # This script build the CPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage. # It serves a sanity check for compilation and basic model usage.
set -ex set -ex
...@@ -10,5 +12,8 @@ remove_docker_container() { docker rm -f xpu-test || true; } ...@@ -10,5 +12,8 @@ remove_docker_container() { docker rm -f xpu-test || true; }
trap remove_docker_container EXIT trap remove_docker_container EXIT
remove_docker_container remove_docker_container
# Run the image and launch offline inference # Run the image and test offline inference/tensor parallel
docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path xpu-test python3 examples/offline_inference.py docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c '
python3 examples/offline_inference.py
python3 examples/offline_inference_cli.py -tp 2
'
...@@ -9,6 +9,7 @@ ...@@ -9,6 +9,7 @@
# label(str): the name of the test. emoji allowed. # label(str): the name of the test. emoji allowed.
# fast_check(bool): whether to run this on each commit on fastcheck pipeline. # fast_check(bool): whether to run this on each commit on fastcheck pipeline.
# fast_check_only(bool): run this test on fastcheck pipeline only # fast_check_only(bool): run this test on fastcheck pipeline only
# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run.
# command(str): the single command to run for tests. incompatible with commands. # command(str): the single command to run for tests. incompatible with commands.
# commands(list): the list of commands to run for test. incompatbile with command. # commands(list): the list of commands to run for test. incompatbile with command.
# mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd] # mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
...@@ -39,7 +40,7 @@ steps: ...@@ -39,7 +40,7 @@ steps:
# Check API reference (if it fails, you may have missing mock imports) # Check API reference (if it fails, you may have missing mock imports)
- grep \"sig sig-object py\" build/html/dev/sampling_params.html - grep \"sig sig-object py\" build/html/dev/sampling_params.html
- label: Async Engine, Inputs, Utils, Worker Test # 15min - label: Async Engine, Inputs, Utils, Worker Test # 24min
fast_check: true fast_check: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
...@@ -49,7 +50,9 @@ steps: ...@@ -49,7 +50,9 @@ steps:
- tests/multimodal - tests/multimodal
- tests/test_utils - tests/test_utils
- tests/worker - tests/worker
- tests/standalone_tests/lazy_torch_compile.py
commands: commands:
- python3 standalone_tests/lazy_torch_compile.py
- pytest -v -s mq_llm_engine # MQLLMEngine - pytest -v -s mq_llm_engine # MQLLMEngine
- pytest -v -s async_engine # AsyncLLMEngine - pytest -v -s async_engine # AsyncLLMEngine
- NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
...@@ -58,18 +61,33 @@ steps: ...@@ -58,18 +61,33 @@ steps:
- pytest -v -s test_utils.py # Utils - pytest -v -s test_utils.py # Utils
- pytest -v -s worker # Worker - pytest -v -s worker # Worker
- label: Python-only Installation Test
source_file_dependencies:
- tests/standalone_tests/python_only_compile.sh
- setup.py
commands:
- bash standalone_tests/python_only_compile.sh
- label: Basic Correctness Test # 30min - label: Basic Correctness Test # 30min
#mirror_hardwares: [amd] #mirror_hardwares: [amd]
fast_check: true fast_check: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/basic_correctness - tests/basic_correctness/test_basic_correctness
- tests/basic_correctness/test_cpu_offload
- tests/basic_correctness/test_preemption
commands: commands:
- pytest -v -s basic_correctness/test_basic_correctness.py - pytest -v -s basic_correctness/test_basic_correctness.py
- pytest -v -s basic_correctness/test_cpu_offload.py - pytest -v -s basic_correctness/test_cpu_offload.py
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
- label: Chunked Prefill Test
source_file_dependencies:
- vllm/
- tests/basic_correctness/test_chunked_prefill
commands:
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
- label: Core Test # 10min - label: Core Test # 10min
mirror_hardwares: [amd] mirror_hardwares: [amd]
...@@ -81,7 +99,7 @@ steps: ...@@ -81,7 +99,7 @@ steps:
commands: commands:
- pytest -v -s core - pytest -v -s core
- label: Entrypoints Test # 20min - label: Entrypoints Test # 40min
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
fast_check: true fast_check: true
mirror_hardwares: [amd] mirror_hardwares: [amd]
...@@ -89,13 +107,13 @@ steps: ...@@ -89,13 +107,13 @@ steps:
- vllm/ - vllm/
commands: commands:
- pip install -e ./plugins/vllm_add_dummy_model - pip install -e ./plugins/vllm_add_dummy_model
- pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@a4987bba6e9e9b3f22bd3a6c1ecf0abd04fd5622#egg=lm_eval[api]
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py
- pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process - pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
- pytest -v -s entrypoints/openai - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
- pytest -v -s entrypoints/test_chat_utils.py - pytest -v -s entrypoints/test_chat_utils.py
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
...@@ -108,7 +126,10 @@ steps: ...@@ -108,7 +126,10 @@ steps:
- vllm/core/ - vllm/core/
- tests/distributed - tests/distributed
- tests/spec_decode/e2e/test_integration_dist_tp4 - tests/spec_decode/e2e/test_integration_dist_tp4
- tests/compile
commands: commands:
- pytest -v -s distributed/test_utils.py
- pytest -v -s compile/test_basic_correctness.py
- pytest -v -s distributed/test_pynccl.py - pytest -v -s distributed/test_pynccl.py
- pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
...@@ -136,7 +157,9 @@ steps: ...@@ -136,7 +157,9 @@ steps:
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/test_regression - tests/test_regression
command: pytest -v -s test_regression.py commands:
- pip install modelscope
- pytest -v -s test_regression.py
working_dir: "/vllm-workspace/tests" # optional working_dir: "/vllm-workspace/tests" # optional
- label: Engine Test # 10min - label: Engine Test # 10min
...@@ -150,14 +173,22 @@ steps: ...@@ -150,14 +173,22 @@ steps:
# OOM in the CI unless we run this separately # OOM in the CI unless we run this separately
- pytest -v -s tokenization - pytest -v -s tokenization
- label: Examples Test # 12min - label: V1 Test
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/v1
commands:
- VLLM_USE_V1=1 pytest -v -s v1
- label: Examples Test # 25min
working_dir: "/vllm-workspace/examples" working_dir: "/vllm-workspace/examples"
#mirror_hardwares: [amd] #mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
- vllm/entrypoints - vllm/entrypoints
- examples/ - examples/
commands: commands:
- pip install awscli tensorizer # for llava example and tensorizer test - pip install tensorizer # for tensorizer test
- python3 offline_inference.py - python3 offline_inference.py
- python3 cpu_offload.py - python3 cpu_offload.py
- python3 offline_inference_chat.py - python3 offline_inference_chat.py
...@@ -167,16 +198,20 @@ steps: ...@@ -167,16 +198,20 @@ steps:
- python3 offline_inference_vision_language_multi_image.py - python3 offline_inference_vision_language_multi_image.py
- python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference_encoder_decoder.py - python3 offline_inference_encoder_decoder.py
- python3 offline_inference_classification.py
- python3 offline_inference_embedding.py
- python3 offline_inference_scoring.py
- python3 offline_profile.py --model facebook/opt-125m run_num_steps --num-steps 2
- label: Prefix Caching Test # 7min - label: Prefix Caching Test # 9min
#mirror_hardwares: [amd] mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/prefix_caching - tests/prefix_caching
commands: commands:
- pytest -v -s prefix_caching - pytest -v -s prefix_caching
- label: Samplers Test # 18min - label: Samplers Test # 36min
source_file_dependencies: source_file_dependencies:
- vllm/model_executor/layers - vllm/model_executor/layers
- vllm/sampling_metadata.py - vllm/sampling_metadata.py
...@@ -192,40 +227,41 @@ steps: ...@@ -192,40 +227,41 @@ steps:
- tests/test_logits_processor - tests/test_logits_processor
command: pytest -v -s test_logits_processor.py command: pytest -v -s test_logits_processor.py
- label: Speculative decoding tests # 22min - label: Speculative decoding tests # 30min
source_file_dependencies: source_file_dependencies:
- vllm/spec_decode - vllm/spec_decode
- tests/spec_decode - tests/spec_decode
commands: commands:
# See https://github.com/vllm-project/vllm/issues/5152
- export VLLM_ATTENTION_BACKEND=XFORMERS
- pytest -v -s spec_decode/e2e/test_multistep_correctness.py - pytest -v -s spec_decode/e2e/test_multistep_correctness.py
- pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py
- label: LoRA Test %N # 30min each - label: LoRA Test %N # 15min each
mirror_hardwares: [amd] mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
- vllm/lora - vllm/lora
- tests/lora - tests/lora
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
parallelism: 4 parallelism: 4
- label: "PyTorch Fullgraph Smoke Test" - label: "PyTorch Fullgraph Smoke Test" # 9min
fast_check: true fast_check: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/compile - tests/compile
commands: commands:
- pytest -v -s compile/test_full_graph_smoke.py - pytest -v -s compile/test_basic_correctness.py
# these tests need to be separated, cannot combine
- pytest -v -s compile/piecewise/test_simple.py
- pytest -v -s compile/piecewise/test_toy_llama.py
- label: "PyTorch Fullgraph Test" - label: "PyTorch Fullgraph Test" # 18min
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/compile - tests/compile
commands: commands:
- pytest -v -s compile/test_full_graph.py - pytest -v -s compile/test_full_graph.py
- label: Kernels Test %N # 30min each - label: Kernels Test %N # 1h each
mirror_hardwares: [amd] mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
- csrc/ - csrc/
...@@ -252,15 +288,14 @@ steps: ...@@ -252,15 +288,14 @@ steps:
source_file_dependencies: source_file_dependencies:
- benchmarks/ - benchmarks/
commands: commands:
- pip install aiohttp
- bash run-benchmarks.sh - bash run-benchmarks.sh
- label: Quantization Test # 15min - label: Quantization Test # 33min
source_file_dependencies: source_file_dependencies:
- csrc/ - csrc/
- vllm/model_executor/layers/quantization - vllm/model_executor/layers/quantization
- tests/quantization - tests/quantization
command: pytest -v -s quantization command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
- label: LM Eval Small Models # 53min - label: LM Eval Small Models # 53min
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
...@@ -268,7 +303,6 @@ steps: ...@@ -268,7 +303,6 @@ steps:
- csrc/ - csrc/
- vllm/model_executor/layers/quantization - vllm/model_executor/layers/quantization
commands: commands:
- pip install lm-eval
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- bash ./run-tests.sh -c configs/models-small.txt -t 1 - bash ./run-tests.sh -c configs/models-small.txt -t 1
...@@ -290,42 +324,91 @@ steps: ...@@ -290,42 +324,91 @@ steps:
##### models test ##### ##### models test #####
- label: Basic Models Test # 3min - label: Basic Models Test # 24min
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/models - tests/models
commands: commands:
- pip install -e ./plugins/vllm_add_dummy_model - pip install -e ./plugins/vllm_add_dummy_model
- pytest -v -s models/test_oot_registration.py # it needs a clean process - pytest -v -s models/test_oot_registration.py # it needs a clean process
- pytest -v -s models/*.py --ignore=models/test_oot_registration.py - pytest -v -s models/test_registry.py
- pytest -v -s models/test_initialization.py
- label: Decoder-only Language Models Test # 1h3min - label: Language Models Test (Standard) # 32min
#mirror_hardwares: [amd] #mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/models/decoder_only/language - tests/models/decoder_only/language
- tests/models/embedding/language
- tests/models/encoder_decoder/language
commands: commands:
- pytest -v -s models/decoder_only/language - pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
- pytest -v -s models/embedding/language -m core_model
- label: Decoder-only Multi-Modal Models Test # 56min - label: Language Models Test (Extended) # 1h10min
optional: true
source_file_dependencies:
- vllm/
- tests/models/decoder_only/language
- tests/models/embedding/language
- tests/models/encoder_decoder/language
commands:
- pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
- pytest -v -s models/embedding/language -m 'not core_model'
- label: Multi-Modal Models Test (Standard) # 28min
#mirror_hardwares: [amd] #mirror_hardwares: [amd]
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/models/decoder_only/audio_language - tests/models/decoder_only/audio_language
- tests/models/decoder_only/vision_language - tests/models/decoder_only/vision_language
- tests/models/embedding/vision_language
- tests/models/encoder_decoder/vision_language
commands: commands:
- pytest -v -s models/decoder_only/audio_language - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/decoder_only/vision_language - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
- pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
- pytest -v -s models/embedding/vision_language -m core_model
- pytest -v -s models/encoder_decoder/language -m core_model
- pytest -v -s models/encoder_decoder/vision_language -m core_model
- label: Other Models Test # 5min - label: Multi-Modal Models Test (Extended) 1 # 1h16m
#mirror_hardwares: [amd] optional: true
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
- tests/models/embedding/language - tests/models/decoder_only/audio_language
- tests/models/encoder_decoder/language - tests/models/decoder_only/vision_language
- tests/models/embedding/vision_language
- tests/models/encoder_decoder/vision_language
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
- pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model'
# HACK - run phi3v tests separately to sidestep this transformers bug
# https://github.com/huggingface/transformers/issues/34307
- pytest -v -s models/decoder_only/vision_language/test_phi3v.py
- pytest -v -s --ignore models/decoder_only/vision_language/test_models.py --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
- pytest -v -s models/embedding/vision_language -m 'not core_model'
- pytest -v -s models/encoder_decoder/language -m 'not core_model'
- pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'
- label: Multi-Modal Models Test (Extended) 2 # 38m
optional: true
source_file_dependencies:
- vllm/
- tests/models/decoder_only/vision_language
commands:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model'
# This test is used only in PR development phase to test individual models and should never run on main
- label: Custom Models Test
optional: true
commands: commands:
- pytest -v -s models/embedding/language - echo 'Testing custom models...'
- pytest -v -s models/encoder_decoder/language # PR authors can temporarily add commands below to test individual models
# e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
# *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*
##### 1 GPU test ##### ##### 1 GPU test #####
##### multi gpus test ##### ##### multi gpus test #####
...@@ -352,13 +435,13 @@ steps: ...@@ -352,13 +435,13 @@ steps:
- tests/distributed/ - tests/distributed/
commands: commands:
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed' - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed' - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
- label: Distributed Tests (2 GPUs) # 28min - label: Distributed Tests (2 GPUs) # 40min
#mirror_hardwares: [amd] #mirror_hardwares: [amd]
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 2 num_gpus: 2
...@@ -369,20 +452,25 @@ steps: ...@@ -369,20 +452,25 @@ steps:
- vllm/model_executor/models/ - vllm/model_executor/models/
- tests/distributed/ - tests/distributed/
- vllm/compilation - vllm/compilation
- vllm/worker/worker_base.py
- vllm/worker/worker.py
- vllm/worker/model_runner.py
commands: commands:
- pytest -v -s ./compile/test_full_graph_multi_gpu.py - pytest -v -s ./compile/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py - pytest -v -s ./compile/test_wrapper.py
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed' - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
# Avoid importing model tests that cause CUDA reinitialization error # Avoid importing model tests that cause CUDA reinitialization error
- pytest models/encoder_decoder/language/test_bart.py models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
- pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
- pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
- pip install -e ./plugins/vllm_add_dummy_model - pip install -e ./plugins/vllm_add_dummy_model
- pytest -v -s distributed/test_distributed_oot.py - pytest -v -s distributed/test_distributed_oot.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py
- label: Multi-step Tests (4 GPUs) # 21min - label: Multi-step Tests (4 GPUs) # 36min
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 4 num_gpus: 4
source_file_dependencies: source_file_dependencies:
...@@ -400,7 +488,7 @@ steps: ...@@ -400,7 +488,7 @@ steps:
- pytest -v -s multi_step/test_correctness_async_llm.py - pytest -v -s multi_step/test_correctness_async_llm.py
- pytest -v -s multi_step/test_correctness_llm.py - pytest -v -s multi_step/test_correctness_llm.py
- label: Pipeline Parallelism Test # 23min - label: Pipeline Parallelism Test # 45min
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 4 num_gpus: 4
source_file_dependencies: source_file_dependencies:
...@@ -413,20 +501,24 @@ steps: ...@@ -413,20 +501,24 @@ steps:
- pytest -v -s distributed/test_pp_cudagraph.py - pytest -v -s distributed/test_pp_cudagraph.py
- pytest -v -s distributed/test_pipeline_parallel.py - pytest -v -s distributed/test_pipeline_parallel.py
- label: LoRA Long Context (Distributed) # 11min - label: LoRA TP Test (Distributed)
# This test runs llama 13B, so it is required to run on 4 GPUs.
num_gpus: 4 num_gpus: 4
soft_fail: true
source_file_dependencies: source_file_dependencies:
- vllm/lora - vllm/lora
- tests/lora/test_long_context - tests/lora
commands: commands:
# FIXIT: find out which code initialize cuda before running the test # FIXIT: find out which code initialize cuda before running the test
# before the fix, we need to use spawn to test it # before the fix, we need to use spawn to test it
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
# This test runs llama 13B, so it is required to run on 4 GPUs.
- pytest -v -s -x lora/test_long_context.py - pytest -v -s -x lora/test_long_context.py
# There is some Tensor Parallelism related processing logic in LoRA that
# requires multi-GPU testing for validation.
- pytest -v -s -x lora/test_chatglm3_tp.py
- pytest -v -s -x lora/test_llama_tp.py
- label: Weight Loading Multiple GPU Test
- label: Weight Loading Multiple GPU Test # 33min
working_dir: "/vllm-workspace/tests" working_dir: "/vllm-workspace/tests"
num_gpus: 2 num_gpus: 2
source_file_dependencies: source_file_dependencies:
...@@ -452,6 +544,7 @@ steps: ...@@ -452,6 +544,7 @@ steps:
- label: Distributed Tests (A100) # optional - label: Distributed Tests (A100) # optional
gpu: a100 gpu: a100
optional: true
num_gpus: 4 num_gpus: 4
source_file_dependencies: source_file_dependencies:
- vllm/ - vllm/
...@@ -459,17 +552,18 @@ steps: ...@@ -459,17 +552,18 @@ steps:
# NOTE: don't test llama model here, it seems hf implementation is buggy # NOTE: don't test llama model here, it seems hf implementation is buggy
# see https://github.com/vllm-project/vllm/pull/5689 for details # see https://github.com/vllm-project/vllm/pull/5689 for details
- pytest -v -s distributed/test_custom_all_reduce.py - pytest -v -s distributed/test_custom_all_reduce.py
- TARGET_TEST_SUITE=A100 pytest -v -s distributed/test_basic_distributed_correctness.py - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
- pytest -v -s -x lora/test_mixtral.py - pytest -v -s -x lora/test_mixtral.py
- label: LM Eval Large Models # optional - label: LM Eval Large Models # optional
gpu: a100 gpu: a100
optional: true
num_gpus: 4 num_gpus: 4
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
source_file_dependencies: source_file_dependencies:
- csrc/ - csrc/
- vllm/model_executor/layers/quantization - vllm/model_executor/layers/quantization
commands: commands:
- pip install lm-eval
- export VLLM_WORKER_MULTIPROC_METHOD=spawn - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- bash ./run-tests.sh -c configs/models-large.txt -t 4 - bash ./run-tests.sh -c configs/models-large.txt -t 4
#!/usr/bin/env bash
set -ex
# Assume wheels are in artifacts/dist/*.whl
wheel_files=(artifacts/dist/*.whl)
# Check that exactly one wheel is found
if [[ ${#wheel_files[@]} -ne 1 ]]; then
echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}"
exit 1
fi
# Get the single wheel file
wheel="${wheel_files[0]}"
# Rename 'linux' to 'manylinux1' in the wheel filename
new_wheel="${wheel/linux/manylinux1}"
mv -- "$wheel" "$new_wheel"
wheel="$new_wheel"
# Extract the version from the wheel
version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
echo "Version: $version"
# If the version contains "dev", rename it to v1.0.0.dev for consistency
if [[ $version == *dev* ]]; then
suffix="${version##*.}"
if [[ $suffix == cu* ]]; then
new_version="1.0.0.dev+${suffix}"
else
new_version="1.0.0.dev"
fi
new_wheel="${wheel/$version/$new_version}"
mv -- "$wheel" "$new_wheel"
wheel="$new_wheel"
version="$new_version"
fi
# Upload the wheel to S3
aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
\ No newline at end of file
vllm/*.so
/.venv /.venv
/build /build
dist dist
vllm/*.so
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
.mypy_cache
# Distribution / packaging
.Python
/build/
cmake-build-*/
CMakeUserPresets.json
develop-eggs/
/dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment