Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
38d80967
Commit
38d80967
authored
Sep 12, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.10.2rc2' into v0.10.2rc2-ori
parents
33650733
880c741b
Changes
798
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1447 additions
and
194 deletions
+1447
-194
.buildkite/check-wheel-size.py
.buildkite/check-wheel-size.py
+4
-4
.buildkite/nightly-benchmarks/scripts/compare-json-results.py
...ldkite/nightly-benchmarks/scripts/compare-json-results.py
+1
-1
.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json
...kite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json
+414
-6
.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json
...kite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json
+621
-6
.buildkite/release-pipeline.yaml
.buildkite/release-pipeline.yaml
+32
-26
.buildkite/scripts/cleanup-nightly-builds.sh
.buildkite/scripts/cleanup-nightly-builds.sh
+97
-0
.buildkite/scripts/hardware_ci/run-neuron-test.sh
.buildkite/scripts/hardware_ci/run-neuron-test.sh
+0
-64
.buildkite/scripts/hardware_ci/run-xpu-test.sh
.buildkite/scripts/hardware_ci/run-xpu-test.sh
+6
-4
.buildkite/scripts/upload-wheels.sh
.buildkite/scripts/upload-wheels.sh
+12
-10
.buildkite/test-pipeline.yaml
.buildkite/test-pipeline.yaml
+164
-58
.github/.bc-linter.yml
.github/.bc-linter.yml
+24
-0
.github/CODEOWNERS
.github/CODEOWNERS
+25
-9
.github/mergify.yml
.github/mergify.yml
+14
-0
.github/workflows/add_label_automerge.yml
.github/workflows/add_label_automerge.yml
+1
-1
.github/workflows/bc-lint.yml
.github/workflows/bc-lint.yml
+27
-0
.github/workflows/cleanup_pr_body.yml
.github/workflows/cleanup_pr_body.yml
+1
-1
.github/workflows/issue_autolabel.yml
.github/workflows/issue_autolabel.yml
+1
-1
.github/workflows/pre-commit.yml
.github/workflows/pre-commit.yml
+1
-1
.github/workflows/reminder_comment.yml
.github/workflows/reminder_comment.yml
+1
-1
.github/workflows/stale.yml
.github/workflows/stale.yml
+1
-1
No files found.
.buildkite/check-wheel-size.py
View file @
38d80967
...
@@ -5,11 +5,11 @@ import os
...
@@ -5,11 +5,11 @@ import os
import
sys
import
sys
import
zipfile
import
zipfile
# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 4
0
0 MiB
# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 4
5
0 MiB
# Note that we have
4
00 MiB quota, please use it wisely.
# Note that we have
8
00 MiB quota, please use it wisely.
# See https://github.com/pypi/support/issues/
3792
.
# See https://github.com/pypi/support/issues/
6326
.
# Please also sync the value with the one in Dockerfile.
# Please also sync the value with the one in Dockerfile.
VLLM_MAX_SIZE_MB
=
int
(
os
.
environ
.
get
(
"VLLM_MAX_SIZE_MB"
,
4
0
0
))
VLLM_MAX_SIZE_MB
=
int
(
os
.
environ
.
get
(
"VLLM_MAX_SIZE_MB"
,
4
5
0
))
def
print_top_10_largest_files
(
zip_file
):
def
print_top_10_largest_files
(
zip_file
):
...
...
.buildkite/nightly-benchmarks/scripts/compare-json-results.py
View file @
38d80967
...
@@ -218,7 +218,7 @@ if __name__ == "__main__":
...
@@ -218,7 +218,7 @@ if __name__ == "__main__":
"--xaxis"
,
"--xaxis"
,
type
=
str
,
type
=
str
,
default
=
"# of max concurrency."
,
default
=
"# of max concurrency."
,
help
=
"column name to use as X Axis in comparis
i
on graph"
,
help
=
"column name to use as X Axis in comparison graph"
,
)
)
args
=
parser
.
parse_args
()
args
=
parser
.
parse_args
()
...
...
.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc2.json
View file @
38d80967
[
[
{
{
"test_name"
:
"serving_llama8B_tp1_sharegpt"
,
"test_name"
:
"serving_llama8B_
bf16_
tp1_sharegpt"
,
"qps_list"
:
[
"inf"
],
"qps_list"
:
[
"inf"
],
"max_concurrency_list"
:
[
12
,
16
,
24
,
32
,
64
,
128
,
200
],
"max_concurrency_list"
:
[
12
,
16
,
24
,
32
,
64
,
128
,
200
],
"server_environment_variables"
:
{
"server_environment_variables"
:
{
...
@@ -32,7 +32,7 @@
...
@@ -32,7 +32,7 @@
}
}
},
},
{
{
"test_name"
:
"serving_llama8B_tp2_sharegpt"
,
"test_name"
:
"serving_llama8B_
bf16_
tp2_sharegpt"
,
"qps_list"
:
[
"inf"
],
"qps_list"
:
[
"inf"
],
"max_concurrency_list"
:
[
12
,
16
,
24
,
32
,
64
,
128
,
200
],
"max_concurrency_list"
:
[
12
,
16
,
24
,
32
,
64
,
128
,
200
],
"server_environment_variables"
:
{
"server_environment_variables"
:
{
...
@@ -64,7 +64,7 @@
...
@@ -64,7 +64,7 @@
}
}
},
},
{
{
"test_name"
:
"serving_llama8B_tp4_sharegpt"
,
"test_name"
:
"serving_llama8B_
bf16_
tp4_sharegpt"
,
"qps_list"
:
[
"inf"
],
"qps_list"
:
[
"inf"
],
"max_concurrency_list"
:
[
12
,
16
,
24
,
32
,
64
,
128
,
200
],
"max_concurrency_list"
:
[
12
,
16
,
24
,
32
,
64
,
128
,
200
],
"server_environment_variables"
:
{
"server_environment_variables"
:
{
...
@@ -96,7 +96,7 @@
...
@@ -96,7 +96,7 @@
}
}
},
},
{
{
"test_name"
:
"serving_llama8B_tp1_random_128_128"
,
"test_name"
:
"serving_llama8B_
bf16_
tp1_random_128_128"
,
"qps_list"
:
[
"inf"
],
"qps_list"
:
[
"inf"
],
"max_concurrency_list"
:
[
12
,
16
,
24
,
32
,
64
,
128
,
200
,
1000
],
"max_concurrency_list"
:
[
12
,
16
,
24
,
32
,
64
,
128
,
200
,
1000
],
"server_environment_variables"
:
{
"server_environment_variables"
:
{
...
@@ -131,7 +131,7 @@
...
@@ -131,7 +131,7 @@
}
}
},
},
{
{
"test_name"
:
"serving_llama8B_tp2_random_128_128"
,
"test_name"
:
"serving_llama8B_
bf16_
tp2_random_128_128"
,
"qps_list"
:
[
"inf"
],
"qps_list"
:
[
"inf"
],
"max_concurrency_list"
:
[
12
,
16
,
24
,
32
,
64
,
128
,
200
,
1000
],
"max_concurrency_list"
:
[
12
,
16
,
24
,
32
,
64
,
128
,
200
,
1000
],
"server_environment_variables"
:
{
"server_environment_variables"
:
{
...
@@ -166,7 +166,7 @@
...
@@ -166,7 +166,7 @@
}
}
},
},
{
{
"test_name"
:
"serving_llama8B_tp4_random_128_128"
,
"test_name"
:
"serving_llama8B_
bf16_
tp4_random_128_128"
,
"qps_list"
:
[
"inf"
],
"qps_list"
:
[
"inf"
],
"max_concurrency_list"
:
[
12
,
16
,
24
,
32
,
64
,
128
,
200
,
1000
],
"max_concurrency_list"
:
[
12
,
16
,
24
,
32
,
64
,
128
,
200
,
1000
],
"server_environment_variables"
:
{
"server_environment_variables"
:
{
...
@@ -198,5 +198,413 @@
...
@@ -198,5 +198,413 @@
"random-output-len"
:
128
,
"random-output-len"
:
128
,
"num_prompts"
:
1000
"num_prompts"
:
1000
}
}
},
{
"test_name"
:
"serving_llama8B_int8_tp1_sharegpt"
,
"qps_list"
:
[
"inf"
],
"max_concurrency_list"
:
[
12
,
16
,
24
,
32
,
64
,
128
,
200
],
"server_environment_variables"
:
{
"VLLM_RPC_TIMEOUT"
:
100000
,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN"
:
1
,
"VLLM_ENGINE_ITERATION_TIMEOUT_S"
:
120
,
"VLLM_CPU_SGL_KERNEL"
:
1
,
"VLLM_CPU_KVCACHE_SPACE"
:
40
},
"server_parameters"
:
{
"model"
:
"RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
,
"tensor_parallel_size"
:
1
,
"dtype"
:
"bfloat16"
,
"distributed_executor_backend"
:
"mp"
,
"block_size"
:
128
,
"trust_remote_code"
:
""
,
"disable_log_stats"
:
""
,
"enforce_eager"
:
""
,
"max_num_batched_tokens"
:
2048
,
"max_num_seqs"
:
256
,
"load_format"
:
"dummy"
},
"client_parameters"
:
{
"model"
:
"RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
,
"backend"
:
"vllm"
,
"dataset_name"
:
"sharegpt"
,
"dataset_path"
:
"./ShareGPT_V3_unfiltered_cleaned_split.json"
,
"num_prompts"
:
200
}
},
{
"test_name"
:
"serving_llama8B_int8_tp2_sharegpt"
,
"qps_list"
:
[
"inf"
],
"max_concurrency_list"
:
[
12
,
16
,
24
,
32
,
64
,
128
,
200
],
"server_environment_variables"
:
{
"VLLM_RPC_TIMEOUT"
:
100000
,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN"
:
1
,
"VLLM_ENGINE_ITERATION_TIMEOUT_S"
:
120
,
"VLLM_CPU_SGL_KERNEL"
:
1
,
"VLLM_CPU_KVCACHE_SPACE"
:
40
},
"server_parameters"
:
{
"model"
:
"RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
,
"tensor_parallel_size"
:
2
,
"dtype"
:
"bfloat16"
,
"distributed_executor_backend"
:
"mp"
,
"block_size"
:
128
,
"trust_remote_code"
:
""
,
"disable_log_stats"
:
""
,
"enforce_eager"
:
""
,
"max_num_batched_tokens"
:
2048
,
"max_num_seqs"
:
256
,
"load_format"
:
"dummy"
},
"client_parameters"
:
{
"model"
:
"RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
,
"backend"
:
"vllm"
,
"dataset_name"
:
"sharegpt"
,
"dataset_path"
:
"./ShareGPT_V3_unfiltered_cleaned_split.json"
,
"num_prompts"
:
200
}
},
{
"test_name"
:
"serving_llama8B_int8_tp4_sharegpt"
,
"qps_list"
:
[
"inf"
],
"max_concurrency_list"
:
[
12
,
16
,
24
,
32
,
64
,
128
,
200
],
"server_environment_variables"
:
{
"VLLM_RPC_TIMEOUT"
:
100000
,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN"
:
1
,
"VLLM_ENGINE_ITERATION_TIMEOUT_S"
:
120
,
"VLLM_CPU_SGL_KERNEL"
:
1
,
"VLLM_CPU_KVCACHE_SPACE"
:
40
},
"server_parameters"
:
{
"model"
:
"RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
,
"tensor_parallel_size"
:
4
,
"dtype"
:
"bfloat16"
,
"distributed_executor_backend"
:
"mp"
,
"block_size"
:
128
,
"trust_remote_code"
:
""
,
"disable_log_stats"
:
""
,
"enforce_eager"
:
""
,
"max_num_batched_tokens"
:
2048
,
"max_num_seqs"
:
256
,
"load_format"
:
"dummy"
},
"client_parameters"
:
{
"model"
:
"RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
,
"backend"
:
"vllm"
,
"dataset_name"
:
"sharegpt"
,
"dataset_path"
:
"./ShareGPT_V3_unfiltered_cleaned_split.json"
,
"num_prompts"
:
200
}
},
{
"test_name"
:
"serving_llama8B_int8_tp1_random_128_128"
,
"qps_list"
:
[
"inf"
],
"max_concurrency_list"
:
[
12
,
16
,
24
,
32
,
64
,
128
,
200
,
1000
],
"server_environment_variables"
:
{
"VLLM_RPC_TIMEOUT"
:
100000
,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN"
:
1
,
"VLLM_ENGINE_ITERATION_TIMEOUT_S"
:
120
,
"VLLM_CPU_SGL_KERNEL"
:
1
,
"VLLM_CPU_KVCACHE_SPACE"
:
40
},
"server_parameters"
:
{
"model"
:
"RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
,
"tensor_parallel_size"
:
1
,
"dtype"
:
"bfloat16"
,
"distributed_executor_backend"
:
"mp"
,
"block_size"
:
128
,
"trust_remote_code"
:
""
,
"enable_chunked_prefill"
:
""
,
"disable_log_stats"
:
""
,
"enforce_eager"
:
""
,
"max_num_batched_tokens"
:
2048
,
"max_num_seqs"
:
256
,
"load_format"
:
"dummy"
},
"client_parameters"
:
{
"model"
:
"RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
,
"backend"
:
"vllm"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
,
"ignore-eos"
:
""
,
"num_prompts"
:
1000
}
},
{
"test_name"
:
"serving_llama8B_int8_tp2_random_128_128"
,
"qps_list"
:
[
"inf"
],
"max_concurrency_list"
:
[
12
,
16
,
24
,
32
,
64
,
128
,
200
,
1000
],
"server_environment_variables"
:
{
"VLLM_RPC_TIMEOUT"
:
100000
,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN"
:
1
,
"VLLM_ENGINE_ITERATION_TIMEOUT_S"
:
120
,
"VLLM_CPU_SGL_KERNEL"
:
1
,
"VLLM_CPU_KVCACHE_SPACE"
:
40
},
"server_parameters"
:
{
"model"
:
"RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
,
"tensor_parallel_size"
:
2
,
"dtype"
:
"bfloat16"
,
"distributed_executor_backend"
:
"mp"
,
"block_size"
:
128
,
"trust_remote_code"
:
""
,
"enable_chunked_prefill"
:
""
,
"disable_log_stats"
:
""
,
"enforce_eager"
:
""
,
"max_num_batched_tokens"
:
2048
,
"max_num_seqs"
:
256
,
"load_format"
:
"dummy"
},
"client_parameters"
:
{
"model"
:
"RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
,
"backend"
:
"vllm"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
,
"ignore-eos"
:
""
,
"num_prompts"
:
1000
}
},
{
"test_name"
:
"serving_llama8B_int8_tp4_random_128_128"
,
"qps_list"
:
[
"inf"
],
"max_concurrency_list"
:
[
12
,
16
,
24
,
32
,
64
,
128
,
200
,
1000
],
"server_environment_variables"
:
{
"VLLM_RPC_TIMEOUT"
:
100000
,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN"
:
1
,
"VLLM_ENGINE_ITERATION_TIMEOUT_S"
:
120
,
"VLLM_CPU_SGL_KERNEL"
:
1
,
"VLLM_CPU_KVCACHE_SPACE"
:
40
},
"server_parameters"
:
{
"model"
:
"RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
,
"tensor_parallel_size"
:
4
,
"dtype"
:
"bfloat16"
,
"distributed_executor_backend"
:
"mp"
,
"block_size"
:
128
,
"trust_remote_code"
:
""
,
"enable_chunked_prefill"
:
""
,
"disable_log_stats"
:
""
,
"enforce_eager"
:
""
,
"max_num_batched_tokens"
:
2048
,
"max_num_seqs"
:
256
,
"load_format"
:
"dummy"
},
"client_parameters"
:
{
"model"
:
"RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8"
,
"backend"
:
"vllm"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
,
"ignore-eos"
:
""
,
"num_prompts"
:
1000
}
},
{
"test_name"
:
"serving_llama8B_int4_tp1_sharegpt"
,
"qps_list"
:
[
"inf"
],
"max_concurrency_list"
:
[
12
,
16
,
24
,
32
,
64
,
128
,
200
],
"server_environment_variables"
:
{
"VLLM_RPC_TIMEOUT"
:
100000
,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN"
:
1
,
"VLLM_ENGINE_ITERATION_TIMEOUT_S"
:
120
,
"VLLM_CPU_SGL_KERNEL"
:
1
,
"VLLM_CPU_KVCACHE_SPACE"
:
40
},
"server_parameters"
:
{
"model"
:
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
,
"quantization"
:
"awq"
,
"tensor_parallel_size"
:
1
,
"dtype"
:
"bfloat16"
,
"distributed_executor_backend"
:
"mp"
,
"block_size"
:
128
,
"trust_remote_code"
:
""
,
"disable_log_stats"
:
""
,
"enforce_eager"
:
""
,
"max_num_batched_tokens"
:
2048
,
"max_num_seqs"
:
256
,
"load_format"
:
"dummy"
},
"client_parameters"
:
{
"model"
:
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
,
"backend"
:
"vllm"
,
"dataset_name"
:
"sharegpt"
,
"dataset_path"
:
"./ShareGPT_V3_unfiltered_cleaned_split.json"
,
"num_prompts"
:
200
}
},
{
"test_name"
:
"serving_llama8B_int4_tp2_sharegpt"
,
"qps_list"
:
[
"inf"
],
"max_concurrency_list"
:
[
12
,
16
,
24
,
32
,
64
,
128
,
200
],
"server_environment_variables"
:
{
"VLLM_RPC_TIMEOUT"
:
100000
,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN"
:
1
,
"VLLM_ENGINE_ITERATION_TIMEOUT_S"
:
120
,
"VLLM_CPU_SGL_KERNEL"
:
1
,
"VLLM_CPU_KVCACHE_SPACE"
:
40
},
"server_parameters"
:
{
"model"
:
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
,
"quantization"
:
"awq"
,
"tensor_parallel_size"
:
2
,
"dtype"
:
"bfloat16"
,
"distributed_executor_backend"
:
"mp"
,
"block_size"
:
128
,
"trust_remote_code"
:
""
,
"disable_log_stats"
:
""
,
"enforce_eager"
:
""
,
"max_num_batched_tokens"
:
2048
,
"max_num_seqs"
:
256
,
"load_format"
:
"dummy"
},
"client_parameters"
:
{
"model"
:
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
,
"backend"
:
"vllm"
,
"dataset_name"
:
"sharegpt"
,
"dataset_path"
:
"./ShareGPT_V3_unfiltered_cleaned_split.json"
,
"num_prompts"
:
200
}
},
{
"test_name"
:
"serving_llama8B_int4_tp4_sharegpt"
,
"qps_list"
:
[
"inf"
],
"max_concurrency_list"
:
[
12
,
16
,
24
,
32
,
64
,
128
,
200
],
"server_environment_variables"
:
{
"VLLM_RPC_TIMEOUT"
:
100000
,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN"
:
1
,
"VLLM_ENGINE_ITERATION_TIMEOUT_S"
:
120
,
"VLLM_CPU_SGL_KERNEL"
:
1
,
"VLLM_CPU_KVCACHE_SPACE"
:
40
},
"server_parameters"
:
{
"model"
:
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
,
"quantization"
:
"awq"
,
"tensor_parallel_size"
:
4
,
"dtype"
:
"bfloat16"
,
"distributed_executor_backend"
:
"mp"
,
"block_size"
:
128
,
"trust_remote_code"
:
""
,
"disable_log_stats"
:
""
,
"enforce_eager"
:
""
,
"max_num_batched_tokens"
:
2048
,
"max_num_seqs"
:
256
,
"load_format"
:
"dummy"
},
"client_parameters"
:
{
"model"
:
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
,
"backend"
:
"vllm"
,
"dataset_name"
:
"sharegpt"
,
"dataset_path"
:
"./ShareGPT_V3_unfiltered_cleaned_split.json"
,
"num_prompts"
:
200
}
},
{
"test_name"
:
"serving_llama8B_int4_tp1_random_128_128"
,
"qps_list"
:
[
"inf"
],
"max_concurrency_list"
:
[
12
,
16
,
24
,
32
,
64
,
128
,
200
,
1000
],
"server_environment_variables"
:
{
"VLLM_RPC_TIMEOUT"
:
100000
,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN"
:
1
,
"VLLM_ENGINE_ITERATION_TIMEOUT_S"
:
120
,
"VLLM_CPU_SGL_KERNEL"
:
1
,
"VLLM_CPU_KVCACHE_SPACE"
:
40
},
"server_parameters"
:
{
"model"
:
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
,
"quantization"
:
"awq"
,
"tensor_parallel_size"
:
1
,
"dtype"
:
"bfloat16"
,
"distributed_executor_backend"
:
"mp"
,
"block_size"
:
128
,
"trust_remote_code"
:
""
,
"enable_chunked_prefill"
:
""
,
"disable_log_stats"
:
""
,
"enforce_eager"
:
""
,
"max_num_batched_tokens"
:
2048
,
"max_num_seqs"
:
256
,
"load_format"
:
"dummy"
},
"client_parameters"
:
{
"model"
:
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
,
"backend"
:
"vllm"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
,
"ignore-eos"
:
""
,
"num_prompts"
:
1000
}
},
{
"test_name"
:
"serving_llama8B_int4_tp2_random_128_128"
,
"qps_list"
:
[
"inf"
],
"max_concurrency_list"
:
[
12
,
16
,
24
,
32
,
64
,
128
,
200
,
1000
],
"server_environment_variables"
:
{
"VLLM_RPC_TIMEOUT"
:
100000
,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN"
:
1
,
"VLLM_ENGINE_ITERATION_TIMEOUT_S"
:
120
,
"VLLM_CPU_SGL_KERNEL"
:
1
,
"VLLM_CPU_KVCACHE_SPACE"
:
40
},
"server_parameters"
:
{
"model"
:
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
,
"quantization"
:
"awq"
,
"tensor_parallel_size"
:
2
,
"dtype"
:
"bfloat16"
,
"distributed_executor_backend"
:
"mp"
,
"block_size"
:
128
,
"trust_remote_code"
:
""
,
"enable_chunked_prefill"
:
""
,
"disable_log_stats"
:
""
,
"enforce_eager"
:
""
,
"max_num_batched_tokens"
:
2048
,
"max_num_seqs"
:
256
,
"load_format"
:
"dummy"
},
"client_parameters"
:
{
"model"
:
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
,
"backend"
:
"vllm"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
,
"ignore-eos"
:
""
,
"num_prompts"
:
1000
}
},
{
"test_name"
:
"serving_llama8B_int4_tp4_random_128_128"
,
"qps_list"
:
[
"inf"
],
"max_concurrency_list"
:
[
12
,
16
,
24
,
32
,
64
,
128
,
200
,
1000
],
"server_environment_variables"
:
{
"VLLM_RPC_TIMEOUT"
:
100000
,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN"
:
1
,
"VLLM_ENGINE_ITERATION_TIMEOUT_S"
:
120
,
"VLLM_CPU_SGL_KERNEL"
:
1
,
"VLLM_CPU_KVCACHE_SPACE"
:
40
},
"server_parameters"
:
{
"model"
:
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
,
"quantization"
:
"awq"
,
"tensor_parallel_size"
:
4
,
"dtype"
:
"bfloat16"
,
"distributed_executor_backend"
:
"mp"
,
"block_size"
:
128
,
"trust_remote_code"
:
""
,
"enable_chunked_prefill"
:
""
,
"disable_log_stats"
:
""
,
"enforce_eager"
:
""
,
"max_num_batched_tokens"
:
2048
,
"max_num_seqs"
:
256
,
"load_format"
:
"dummy"
},
"client_parameters"
:
{
"model"
:
"hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4"
,
"backend"
:
"vllm"
,
"dataset_name"
:
"random"
,
"random-input-len"
:
128
,
"random-output-len"
:
128
,
"ignore-eos"
:
""
,
"num_prompts"
:
1000
}
}
}
]
]
.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json
View file @
38d80967
This diff is collapsed.
Click to expand it.
.buildkite/release-pipeline.yaml
View file @
38d80967
steps
:
steps
:
# aarch64 + CUDA builds
# aarch64 + CUDA builds
. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
-
label
:
"
Build
arm64
wheel
-
CUDA
12.
8
"
-
label
:
"
Build
arm64
wheel
-
CUDA
12.
9
"
id
:
build-wheel-arm64-cuda-12-
8
id
:
build-wheel-arm64-cuda-12-
9
agents
:
agents
:
queue
:
arm64_cpu_queue_postmerge
queue
:
arm64_cpu_queue_postmerge
commands
:
commands
:
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
USE_SCCACHE=1
--build-arg
GIT_REPO_CHECK=1
--build-arg
CUDA_VERSION=12.
8
.1
--build-arg
torch_cuda_arch_list='8.7
9.0
10.0+PTX
12.0'
--tag
vllm-ci:build-image
--target
build
--progress
plain
-f
docker/Dockerfile
."
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
USE_SCCACHE=1
--build-arg
GIT_REPO_CHECK=1
--build-arg
CUDA_VERSION=12.
9
.1
--build-arg
torch_cuda_arch_list='8.7
9.0
10.0+PTX
12.0'
--tag
vllm-ci:build-image
--target
build
--progress
plain
-f
docker/Dockerfile
."
-
"
mkdir
artifacts"
-
"
mkdir
artifacts"
-
"
docker
run
--rm
-v
$(pwd)/artifacts:/artifacts_host
vllm-ci:build-image
bash
-c
'cp
-r
dist
/artifacts_host
&&
chmod
-R
a+rw
/artifacts_host'"
-
"
docker
run
--rm
-v
$(pwd)/artifacts:/artifacts_host
vllm-ci:build-image
bash
-c
'cp
-r
dist
/artifacts_host
&&
chmod
-R
a+rw
/artifacts_host'"
-
"
bash
.buildkite/scripts/upload-wheels.sh"
-
"
bash
.buildkite/scripts/upload-wheels.sh"
env
:
env
:
DOCKER_BUILDKIT
:
"
1"
DOCKER_BUILDKIT
:
"
1"
# x86 + CUDA builds
-
block
:
"
Build
CUDA
12.8
wheel"
key
:
block-build-cu128-wheel
-
label
:
"
Build
wheel
-
CUDA
12.8"
-
label
:
"
Build
wheel
-
CUDA
12.8"
depends_on
:
block-build-cu128-wheel
id
:
build-wheel-cuda-12-8
id
:
build-wheel-cuda-12-8
agents
:
agents
:
queue
:
cpu_queue_postmerge
queue
:
cpu_queue_postmerge
...
@@ -44,18 +47,14 @@ steps:
...
@@ -44,18 +47,14 @@ steps:
env
:
env
:
DOCKER_BUILDKIT
:
"
1"
DOCKER_BUILDKIT
:
"
1"
# Note(simon): We can always build CUDA 11.8 wheel to ensure the build is working.
# x86 + CUDA builds
# However, this block can be uncommented to save some compute hours.
-
label
:
"
Build
wheel
-
CUDA
12.9"
# - block: "Build CUDA 11.8 wheel"
depends_on
:
~
# key: block-build-cu118-wheel
id
:
build-wheel-cuda-12-9
-
label
:
"
Build
wheel
-
CUDA
11.8"
# depends_on: block-build-cu118-wheel
id
:
build-wheel-cuda-11-8
agents
:
agents
:
queue
:
cpu_queue_postmerge
queue
:
cpu_queue_postmerge
commands
:
commands
:
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
USE_SCCACHE=1
--build-arg
GIT_REPO_CHECK=1
--build-arg
CUDA_VERSION=1
1.8.0
--build-arg
torch_cuda_arch_list='7.0
7.5
8.0
8.9
9.0+PTX'
--tag
vllm-ci:build-image
--target
build
--progress
plain
-f
docker/Dockerfile
."
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
USE_SCCACHE=1
--build-arg
GIT_REPO_CHECK=1
--build-arg
CUDA_VERSION=1
2.9.1
--build-arg
torch_cuda_arch_list='7.0
7.5
8.0
8.9
9.0+PTX'
--tag
vllm-ci:build-image
--target
build
--progress
plain
-f
docker/Dockerfile
."
-
"
mkdir
artifacts"
-
"
mkdir
artifacts"
-
"
docker
run
--rm
-v
$(pwd)/artifacts:/artifacts_host
vllm-ci:build-image
bash
-c
'cp
-r
dist
/artifacts_host
&&
chmod
-R
a+rw
/artifacts_host'"
-
"
docker
run
--rm
-v
$(pwd)/artifacts:/artifacts_host
vllm-ci:build-image
bash
-c
'cp
-r
dist
/artifacts_host
&&
chmod
-R
a+rw
/artifacts_host'"
-
"
bash
.buildkite/scripts/upload-wheels.sh"
-
"
bash
.buildkite/scripts/upload-wheels.sh"
...
@@ -75,6 +74,7 @@ steps:
...
@@ -75,6 +74,7 @@ steps:
-
"
docker
tag
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname
-m)
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
-
"
docker
tag
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname
-m)
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
-
"
docker
push
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
-
"
docker
push
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
# PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
-
label
:
"
Build
release
image
(arm64)"
-
label
:
"
Build
release
image
(arm64)"
depends_on
:
~
depends_on
:
~
id
:
build-release-image-arm64
id
:
build-release-image-arm64
...
@@ -82,7 +82,7 @@ steps:
...
@@ -82,7 +82,7 @@ steps:
queue
:
arm64_cpu_queue_postmerge
queue
:
arm64_cpu_queue_postmerge
commands
:
commands
:
-
"
aws
ecr-public
get-login-password
--region
us-east-1
|
docker
login
--username
AWS
--password-stdin
public.ecr.aws/q9t5s3a7"
-
"
aws
ecr-public
get-login-password
--region
us-east-1
|
docker
login
--username
AWS
--password-stdin
public.ecr.aws/q9t5s3a7"
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
USE_SCCACHE=1
--build-arg
GIT_REPO_CHECK=1
--build-arg
CUDA_VERSION=12.
8
.1
--build-arg
torch_cuda_arch_list='8.7
9.0
10.0+PTX
12.0'
--build-arg
INSTALL_KV_CONNECTORS=true
--tag
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname
-m)
--target
vllm-openai
--progress
plain
-f
docker/Dockerfile
."
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
USE_SCCACHE=1
--build-arg
GIT_REPO_CHECK=1
--build-arg
CUDA_VERSION=12.
9
.1
--build-arg
torch_cuda_arch_list='8.7
9.0
10.0+PTX
12.0'
--build-arg
INSTALL_KV_CONNECTORS=true
--tag
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname
-m)
--target
vllm-openai
--progress
plain
-f
docker/Dockerfile
."
-
"
docker
push
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname
-m)"
-
"
docker
push
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname
-m)"
# Add job to create multi-arch manifest
# Add job to create multi-arch manifest
...
@@ -103,7 +103,7 @@ steps:
...
@@ -103,7 +103,7 @@ steps:
-
create-multi-arch-manifest
-
create-multi-arch-manifest
-
build-wheel-cuda-12-8
-
build-wheel-cuda-12-8
-
build-wheel-cuda-12-6
-
build-wheel-cuda-12-6
-
build-wheel-cuda-1
1-8
-
build-wheel-cuda-1
2-9
id
:
annotate-release-workflow
id
:
annotate-release-workflow
agents
:
agents
:
queue
:
cpu_queue_postmerge
queue
:
cpu_queue_postmerge
...
@@ -150,18 +150,24 @@ steps:
...
@@ -150,18 +150,24 @@ steps:
env
:
env
:
DOCKER_BUILDKIT
:
"
1"
DOCKER_BUILDKIT
:
"
1"
-
block
:
"
Build
Neuron
release
image"
-
label
:
"
Build
and
publish
nightly
multi-arch
image
to
DockerHub"
key
:
block-neuron-release-image-build
depends_on
:
depends_on
:
~
-
create-multi-arch-manifest
if
:
build.env("NIGHTLY") == "1"
-
label
:
"
Build
and
publish
Neuron
release
image"
depends_on
:
block-neuron-release-image-build
agents
:
agents
:
queue
:
neuron-
postmerge
queue
:
cpu_queue_
postmerge
commands
:
commands
:
-
"
aws
ecr-public
get-login-password
--region
us-east-1
|
docker
login
--username
AWS
--password-stdin
public.ecr.aws/q9t5s3a7"
-
"
aws
ecr-public
get-login-password
--region
us-east-1
|
docker
login
--username
AWS
--password-stdin
public.ecr.aws/q9t5s3a7"
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
GIT_REPO_CHECK=1
--tag
public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent
meta-data
get
release-version)
--tag
public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest
--progress
plain
-f
docker/Dockerfile.neuron
."
-
"
docker
pull
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
-
"
docker
push
public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest"
-
"
docker
tag
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT
vllm/vllm-openai:nightly"
-
"
docker
push
public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent
meta-data
get
release-version)"
-
"
docker
tag
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT
vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
-
"
docker
push
vllm/vllm-openai:nightly"
-
"
docker
push
vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
# Clean up old nightly builds (keep only last 14)
-
"
bash
.buildkite/scripts/cleanup-nightly-builds.sh"
plugins
:
-
docker-login#v3.0.0
:
username
:
vllmbot
password-env
:
DOCKERHUB_TOKEN
env
:
env
:
DOCKER_BUILDKIT
:
"
1"
DOCKER_BUILDKIT
:
"
1"
.buildkite/scripts/cleanup-nightly-builds.sh
0 → 100755
View file @
38d80967
#!/bin/bash
set
-ex
# Clean up old nightly builds from DockerHub, keeping only the last 14 builds
# This script uses DockerHub API to list and delete old tags with "nightly-" prefix
# DockerHub API endpoint for vllm/vllm-openai repository
REPO_API_URL
=
"https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"
# Get DockerHub token from environment
if
[
-z
"
$DOCKERHUB_TOKEN
"
]
;
then
echo
"Error: DOCKERHUB_TOKEN environment variable is not set"
exit
1
fi
# Function to get all tags from DockerHub
get_all_tags
()
{
local
page
=
1
local
all_tags
=
""
while
true
;
do
local
response
=
$(
curl
-s
-H
"Authorization: Bearer
$DOCKERHUB_TOKEN
"
\
"
$REPO_API_URL
?page=
$page
&page_size=100"
)
# Get both last_updated timestamp and tag name, separated by |
local
tags
=
$(
echo
"
$response
"
| jq
-r
'.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"'
)
if
[
-z
"
$tags
"
]
;
then
break
fi
all_tags
=
"
$all_tags$tags
"
$'
\n
'
page
=
$((
page
+
1
))
done
# Sort by timestamp (newest first) and extract just the tag names
echo
"
$all_tags
"
|
sort
-r
|
cut
-d
'|'
-f2
}
delete_tag
()
{
local
tag_name
=
"
$1
"
echo
"Deleting tag:
$tag_name
"
local
delete_url
=
"https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/
$tag_name
"
local
response
=
$(
curl
-s
-X
DELETE
-H
"Authorization: Bearer
$DOCKERHUB_TOKEN
"
"
$delete_url
"
)
if
echo
"
$response
"
| jq
-e
'.detail'
>
/dev/null 2>&1
;
then
echo
"Warning: Failed to delete tag
$tag_name
:
$(
echo
"
$response
"
| jq
-r
'.detail'
)
"
else
echo
"Successfully deleted tag:
$tag_name
"
fi
}
# Get all nightly- prefixed tags, sorted by last_updated timestamp (newest first)
echo
"Fetching all tags from DockerHub..."
all_tags
=
$(
get_all_tags
)
if
[
-z
"
$all_tags
"
]
;
then
echo
"No tags found to clean up"
exit
0
fi
# Count total tags
total_tags
=
$(
echo
"
$all_tags
"
|
wc
-l
)
echo
"Found
$total_tags
tags"
# Keep only the last 14 builds (including the current one)
tags_to_keep
=
14
tags_to_delete
=
$((
total_tags
-
tags_to_keep
))
if
[
$tags_to_delete
-le
0
]
;
then
echo
"No tags need to be deleted (only
$total_tags
tags found, keeping
$tags_to_keep
)"
exit
0
fi
echo
"Will delete
$tags_to_delete
old tags, keeping the newest
$tags_to_keep
"
# Get tags to delete (skip the first $tags_to_keep tags)
tags_to_delete_list
=
$(
echo
"
$all_tags
"
|
tail
-n
+
$((
tags_to_keep
+
1
))
)
if
[
-z
"
$tags_to_delete_list
"
]
;
then
echo
"No tags to delete"
exit
0
fi
# Delete old tags
echo
"Deleting old tags..."
while
IFS
=
read
-r
tag
;
do
if
[
-n
"
$tag
"
]
;
then
delete_tag
"
$tag
"
# Add a small delay to avoid rate limiting
sleep
1
fi
done
<<<
"
$tags_to_delete_list
"
echo
"Cleanup completed successfully"
.buildkite/scripts/hardware_ci/run-neuron-test.sh
deleted
100644 → 0
View file @
33650733
#!/bin/bash
# This script build the Neuron docker image and run the API server inside the container.
# It serves a sanity check for compilation and basic model usage.
set
-e
set
-v
image_name
=
"neuron/vllm-ci"
container_name
=
"neuron_
$(
tr
-dc
A-Za-z0-9 < /dev/urandom |
head
-c
10
;
echo
)
"
HF_CACHE
=
"
$(
realpath
~
)
/huggingface"
mkdir
-p
"
${
HF_CACHE
}
"
HF_MOUNT
=
"/root/.cache/huggingface"
HF_TOKEN
=
$(
aws secretsmanager get-secret-value
--secret-id
"ci/vllm-neuron/hf-token"
--region
us-west-2
--query
'SecretString'
--output
text | jq
-r
.VLLM_NEURON_CI_HF_TOKEN
)
NEURON_COMPILE_CACHE_URL
=
"
$(
realpath
~
)
/neuron_compile_cache"
mkdir
-p
"
${
NEURON_COMPILE_CACHE_URL
}
"
NEURON_COMPILE_CACHE_MOUNT
=
"/root/.cache/neuron_compile_cache"
# Try building the docker image
aws ecr-public get-login-password
--region
us-east-1 | docker login
--username
AWS
--password-stdin
public.ecr.aws
# prune old image and containers to save disk space, and only once a day
# by using a timestamp file in tmp.
if
[
-f
/tmp/neuron-docker-build-timestamp
]
;
then
last_build
=
$(
cat
/tmp/neuron-docker-build-timestamp
)
current_time
=
$(
date
+%s
)
if
[
$((
current_time
-
last_build
))
-gt
86400
]
;
then
# Remove dangling images (those that are not tagged and not used by any container)
docker image prune
-f
# Remove unused volumes / force the system prune for old images as well.
docker volume prune
-f
&&
docker system prune
-f
echo
"
$current_time
"
>
/tmp/neuron-docker-build-timestamp
fi
else
date
"+%s"
>
/tmp/neuron-docker-build-timestamp
fi
docker build
-t
"
${
image_name
}
"
-f
docker/Dockerfile.neuron
.
# Setup cleanup
remove_docker_container
()
{
docker image
rm
-f
"
${
image_name
}
"
||
true
;
}
trap
remove_docker_container EXIT
# Run the image
docker run
--rm
-it
--device
=
/dev/neuron0
--network
bridge
\
-v
"
${
HF_CACHE
}
:
${
HF_MOUNT
}
"
\
-e
"HF_HOME=
${
HF_MOUNT
}
"
\
-e
"HF_TOKEN=
${
HF_TOKEN
}
"
\
-v
"
${
NEURON_COMPILE_CACHE_URL
}
:
${
NEURON_COMPILE_CACHE_MOUNT
}
"
\
-e
"NEURON_COMPILE_CACHE_URL=
${
NEURON_COMPILE_CACHE_MOUNT
}
"
\
--name
"
${
container_name
}
"
\
${
image_name
}
\
/bin/bash
-c
"
set -e; # Exit on first error
python3 /workspace/vllm/examples/offline_inference/neuron.py;
python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys;
for f in /workspace/vllm/tests/neuron/2_core/*.py; do
echo
\"
Running test file:
\$
f
\"
;
python3 -m pytest
\$
f -v --capture=tee-sys;
done
"
\ No newline at end of file
.buildkite/scripts/hardware_ci/run-xpu-test.sh
View file @
38d80967
...
@@ -30,10 +30,12 @@ docker run \
...
@@ -30,10 +30,12 @@ docker run \
bash
-c
'
bash
-c
'
set -e
set -e
echo $ZE_AFFINITY_MASK
echo $ZE_AFFINITY_MASK
VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
pip install tblib==3.1.0
VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
VLLM_ATTENTION_BACKEND=TRITON_ATTN_VLLM_V1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
cd tests
cd tests
pytest -v -s v1/core
pytest -v -s v1/core
pytest -v -s v1/engine
pytest -v -s v1/engine
...
...
.buildkite/scripts/upload-wheels.sh
View file @
38d80967
...
@@ -58,14 +58,15 @@ python3 .buildkite/generate_index.py --wheel "$normal_wheel"
...
@@ -58,14 +58,15 @@ python3 .buildkite/generate_index.py --wheel "$normal_wheel"
aws s3
cp
"
$wheel
"
"s3://vllm-wheels/
$BUILDKITE_COMMIT
/"
aws s3
cp
"
$wheel
"
"s3://vllm-wheels/
$BUILDKITE_COMMIT
/"
aws s3
cp
"
$normal_wheel
"
"s3://vllm-wheels/
$BUILDKITE_COMMIT
/"
aws s3
cp
"
$normal_wheel
"
"s3://vllm-wheels/
$BUILDKITE_COMMIT
/"
if
[[
$normal_wheel
==
*
"cu118"
*
]]
;
then
if
[[
$normal_wheel
==
*
"cu126"
*
]]
;
then
# if $normal_wheel matches cu118, do not upload the index.html
echo
"Skipping index files for cu118 wheels"
elif
[[
$normal_wheel
==
*
"cu126"
*
]]
;
then
# if $normal_wheel matches cu126, do not upload the index.html
# if $normal_wheel matches cu126, do not upload the index.html
echo
"Skipping index files for cu126 wheels"
echo
"Skipping index files for cu126 wheels"
elif
[[
$normal_wheel
==
*
"cu128"
*
]]
;
then
# if $normal_wheel matches cu128, do not upload the index.html
echo
"Skipping index files for cu128 wheels"
else
else
# only upload index.html for cu128 wheels (default wheels)
# only upload index.html for cu129 wheels (default wheels) as it
# is available on both x86 and arm64
aws s3
cp
index.html
"s3://vllm-wheels/
$BUILDKITE_COMMIT
/vllm/index.html"
aws s3
cp
index.html
"s3://vllm-wheels/
$BUILDKITE_COMMIT
/vllm/index.html"
aws s3
cp
"s3://vllm-wheels/nightly/index.html"
"s3://vllm-wheels/
$BUILDKITE_COMMIT
/index.html"
aws s3
cp
"s3://vllm-wheels/nightly/index.html"
"s3://vllm-wheels/
$BUILDKITE_COMMIT
/index.html"
fi
fi
...
@@ -74,14 +75,15 @@ fi
...
@@ -74,14 +75,15 @@ fi
aws s3
cp
"
$wheel
"
"s3://vllm-wheels/nightly/"
aws s3
cp
"
$wheel
"
"s3://vllm-wheels/nightly/"
aws s3
cp
"
$normal_wheel
"
"s3://vllm-wheels/nightly/"
aws s3
cp
"
$normal_wheel
"
"s3://vllm-wheels/nightly/"
if
[[
$normal_wheel
==
*
"cu118"
*
]]
;
then
if
[[
$normal_wheel
==
*
"cu126"
*
]]
;
then
# if $normal_wheel matches cu118, do not upload the index.html
echo
"Skipping index files for cu118 wheels"
elif
[[
$normal_wheel
==
*
"cu126"
*
]]
;
then
# if $normal_wheel matches cu126, do not upload the index.html
# if $normal_wheel matches cu126, do not upload the index.html
echo
"Skipping index files for cu126 wheels"
echo
"Skipping index files for cu126 wheels"
elif
[[
$normal_wheel
==
*
"cu128"
*
]]
;
then
# if $normal_wheel matches cu128, do not upload the index.html
echo
"Skipping index files for cu128 wheels"
else
else
# only upload index.html for cu128 wheels (default wheels)
# only upload index.html for cu129 wheels (default wheels) as it
# is available on both x86 and arm64
aws s3
cp
index.html
"s3://vllm-wheels/nightly/vllm/index.html"
aws s3
cp
index.html
"s3://vllm-wheels/nightly/vllm/index.html"
fi
fi
...
...
.buildkite/test-pipeline.yaml
View file @
38d80967
This diff is collapsed.
Click to expand it.
.github/.bc-linter.yml
0 → 100644
View file @
38d80967
# doc: https://github.com/pytorch/test-infra/blob/main/tools/stronghold/docs/bc_linter_config.md
version
:
1
paths
:
# We temporarily disable globally, and will only enable with `annotations.include`
# include:
# - "vllm/v1/attention/*.py"
# - "vllm/v1/core/*.py"
exclude
:
-
"
**/*.py"
scan
:
functions
:
true
# check free functions and methods
classes
:
true
# check classes/dataclasses
public_only
:
true
# ignore names starting with "_" at any level
annotations
:
include
:
# decorators that force‑include a symbol
-
name
:
"
bc_linter_include"
# matched by simple name or dotted suffix
propagate_to_members
:
false
# for classes, include methods/inner classes
exclude
:
# decorators that force‑exclude a symbol
-
name
:
"
bc_linter_skip"
# matched by simple name or dotted suffix
propagate_to_members
:
true
# for classes, exclude methods/inner classes
excluded_violations
:
[]
# e.g. ["ParameterRenamed", "FieldTypeChanged"]
.github/CODEOWNERS
View file @
38d80967
...
@@ -5,18 +5,21 @@
...
@@ -5,18 +5,21 @@
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
@22quinn
/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
@22quinn
/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
@NickLucche
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
/vllm/model_executor/layers/mamba @tdoublep
/vllm/model_executor/layers/mamba @tdoublep
/vllm/multimodal @DarkLight1337 @ywang96
/vllm/model_executor/model_loader @22quinn
/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
/vllm/v1/sample @22quinn @houseroad
/vllm/vllm_flash_attn @LucasWilkinson
/vllm/vllm_flash_attn @LucasWilkinson
/vllm/lora @jeejeelee
/vllm/lora @jeejeelee
/vllm/reasoning @aarnphm
/vllm/reasoning @aarnphm
@chaunceyjiang
/vllm/entrypoints @aarnphm
/vllm/entrypoints @aarnphm
@chaunceyjiang
/vllm/compilation @zou3519 @youkaichao @ProExpertProg
/vllm/compilation @zou3519 @youkaichao @ProExpertProg
/vllm/distributed/kv_transfer @NickLucche
CMakeLists.txt @tlrmchlsmth @LucasWilkinson
CMakeLists.txt @tlrmchlsmth @LucasWilkinson
# Any change to the VllmConfig can have a large user-facing impact,
# Any change to the VllmConfig can have a large user-facing impact,
...
@@ -25,8 +28,11 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
...
@@ -25,8 +28,11 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
# vLLM V1
# vLLM V1
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
/vllm/v1/structured_output @mgoin @russellb @aarnphm
/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
/vllm/v1/spec_decode @benchislett @luccafong
/vllm/v1/attention/backends/triton_attn.py @tdoublep
/vllm/v1/attention/backends/triton_attn.py @tdoublep
/vllm/v1/core @heheda12345
/vllm/v1/kv_cache_interface.py @heheda12345
# Test ownership
# Test ownership
/.buildkite/lm-eval-harness @mgoin @simon-mo
/.buildkite/lm-eval-harness @mgoin @simon-mo
...
@@ -34,18 +40,20 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
...
@@ -34,18 +40,20 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/tests/distributed/test_multi_node_assignment.py @youkaichao
/tests/distributed/test_multi_node_assignment.py @youkaichao
/tests/distributed/test_pipeline_parallel.py @youkaichao
/tests/distributed/test_pipeline_parallel.py @youkaichao
/tests/distributed/test_same_node.py @youkaichao
/tests/distributed/test_same_node.py @youkaichao
/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm
/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm
@NickLucche
/tests/kernels @tlrmchlsmth @WoosukKwon @yewentao256
/tests/kernels @tlrmchlsmth @WoosukKwon @yewentao256
/tests/models @DarkLight1337 @ywang96
/tests/models @DarkLight1337 @ywang96
/tests/multimodal @DarkLight1337 @ywang96
/tests/multimodal @DarkLight1337 @ywang96
@NickLucche
/tests/prefix_caching @comaniac @KuntaiDu
/tests/prefix_caching @comaniac @KuntaiDu
/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
/tests/test_inputs.py @DarkLight1337 @ywang96
/tests/test_inputs.py @DarkLight1337 @ywang96
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
/tests/v1/structured_output @mgoin @russellb @aarnphm
/tests/v1/structured_output @mgoin @russellb @aarnphm
/tests/v1/core @heheda12345
/tests/weight_loading @mgoin @youkaichao @yewentao256
/tests/weight_loading @mgoin @youkaichao @yewentao256
/tests/lora @jeejeelee
/tests/lora @jeejeelee
/tests/models/language/generation/test_hybrid.py @tdoublep
/tests/models/language/generation/test_hybrid.py @tdoublep
/tests/v1/kv_connector/nixl_integration @NickLucche
# Docs
# Docs
/docs @hmellor
/docs @hmellor
...
@@ -67,6 +75,9 @@ mkdocs.yaml @hmellor
...
@@ -67,6 +75,9 @@ mkdocs.yaml @hmellor
/vllm/attention/backends/dual_chunk_flash_attn.py @sighingnow
/vllm/attention/backends/dual_chunk_flash_attn.py @sighingnow
/vllm/model_executor/models/qwen* @sighingnow
/vllm/model_executor/models/qwen* @sighingnow
# MTP-specific files
/vllm/model_executor/models/deepseek_mtp.py @luccafong
# Mistral-specific files
# Mistral-specific files
/vllm/model_executor/models/mistral*.py @patrickvonplaten
/vllm/model_executor/models/mistral*.py @patrickvonplaten
/vllm/model_executor/models/mixtral*.py @patrickvonplaten
/vllm/model_executor/models/mixtral*.py @patrickvonplaten
...
@@ -86,3 +97,8 @@ mkdocs.yaml @hmellor
...
@@ -86,3 +97,8 @@ mkdocs.yaml @hmellor
/vllm/attention/ops/rocm*.py @gshtras
/vllm/attention/ops/rocm*.py @gshtras
/vllm/model_executor/layers/fused_moe/rocm*.py @gshtras
/vllm/model_executor/layers/fused_moe/rocm*.py @gshtras
# TPU
/vllm/v1/worker/tpu* @NickLucche
/vllm/platforms/tpu.py @NickLucche
/vllm/v1/sample/tpu @NickLucche
/vllm/tests/v1/tpu @NickLucche
\ No newline at end of file
.github/mergify.yml
View file @
38d80967
...
@@ -273,6 +273,20 @@ pull_request_rules:
...
@@ -273,6 +273,20 @@ pull_request_rules:
users
:
users
:
-
"
sangstar"
-
"
sangstar"
-
name
:
assign reviewer for modelopt changes
conditions
:
-
or
:
-
files~=^vllm/model_executor/layers/quantization/modelopt\.py$
-
files~=^vllm/model_executor/layers/quantization/__init__\.py$
-
files~=^tests/models/quantization/test_modelopt\.py$
-
files~=^tests/quantization/test_modelopt\.py$
-
files~=^tests/models/quantization/test_nvfp4\.py$
-
files~=^docs/features/quantization/modelopt\.md$
actions
:
assign
:
users
:
-
"
Edwardf0t1"
-
name
:
remove 'needs-rebase' label when conflict is resolved
-
name
:
remove 'needs-rebase' label when conflict is resolved
conditions
:
conditions
:
-
-conflict
-
-conflict
...
...
.github/workflows/add_label_automerge.yml
View file @
38d80967
...
@@ -10,7 +10,7 @@ jobs:
...
@@ -10,7 +10,7 @@ jobs:
runs-on
:
ubuntu-latest
runs-on
:
ubuntu-latest
steps
:
steps
:
-
name
:
Add label
-
name
:
Add label
uses
:
actions/github-script@
60a0d83039c74a4aee543508d2ffcb1c3799cdea
# v
7
.0.
1
uses
:
actions/github-script@
ed597411d8f924073f98dfc5c65a23a2325f34cd
# v
8
.0.
0
with
:
with
:
script
:
|
script
:
|
github.rest.issues.addLabels({
github.rest.issues.addLabels({
...
...
.github/workflows/bc-lint.yml
0 → 100644
View file @
38d80967
name
:
BC Lint
on
:
pull_request
:
types
:
-
opened
-
synchronize
-
reopened
jobs
:
bc_lint
:
if
:
github.repository_owner == 'vllm-project'
runs-on
:
ubuntu-latest
steps
:
-
name
:
Run BC Lint Action
uses
:
pytorch/test-infra/.github/actions/bc-lint@main
with
:
repo
:
${{ github.event.pull_request.head.repo.full_name }}
base_sha
:
${{ github.event.pull_request.base.sha }}
head_sha
:
${{ github.event.pull_request.head.sha }}
suppression
:
${{ contains(github.event.pull_request.labels.*.name, 'suppress-bc-linter') }}
docs_link
:
'
https://github.com/pytorch/test-infra/wiki/BC-Linter'
config_dir
:
.github
concurrency
:
group
:
${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}
cancel-in-progress
:
true
.github/workflows/cleanup_pr_body.yml
View file @
38d80967
...
@@ -16,7 +16,7 @@ jobs:
...
@@ -16,7 +16,7 @@ jobs:
uses
:
actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
# v4.2.2
uses
:
actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
# v4.2.2
-
name
:
Set up Python
-
name
:
Set up Python
uses
:
actions/setup-python@
42375524e23c412d93fb67b49958b491fce71c38
# v
5.4
.0
uses
:
actions/setup-python@
e797f83bcb11b83ae66e0230d6156d7c80228e7c
# v
6.0
.0
with
:
with
:
python-version
:
'
3.12'
python-version
:
'
3.12'
...
...
.github/workflows/issue_autolabel.yml
View file @
38d80967
...
@@ -13,7 +13,7 @@ jobs:
...
@@ -13,7 +13,7 @@ jobs:
runs-on
:
ubuntu-latest
runs-on
:
ubuntu-latest
steps
:
steps
:
-
name
:
Label issues based on keywords
-
name
:
Label issues based on keywords
uses
:
actions/github-script@
60a0d83039c74a4aee543508d2ffcb1c3799cdea
# v
7
.0.
1
uses
:
actions/github-script@
ed597411d8f924073f98dfc5c65a23a2325f34cd
# v
8
.0.
0
with
:
with
:
script
:
|
script
:
|
// Configuration: Add new labels and keywords here
// Configuration: Add new labels and keywords here
...
...
.github/workflows/pre-commit.yml
View file @
38d80967
...
@@ -17,7 +17,7 @@ jobs:
...
@@ -17,7 +17,7 @@ jobs:
runs-on
:
ubuntu-latest
runs-on
:
ubuntu-latest
steps
:
steps
:
-
uses
:
actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
# v4.2.2
-
uses
:
actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683
# v4.2.2
-
uses
:
actions/setup-python@
42375524e23c412d93fb67b49958b491fce71c38
# v
5.4
.0
-
uses
:
actions/setup-python@
e797f83bcb11b83ae66e0230d6156d7c80228e7c
# v
6.0
.0
with
:
with
:
python-version
:
"
3.12"
python-version
:
"
3.12"
-
run
:
echo "::add-matcher::.github/workflows/matchers/actionlint.json"
-
run
:
echo "::add-matcher::.github/workflows/matchers/actionlint.json"
...
...
.github/workflows/reminder_comment.yml
View file @
38d80967
...
@@ -9,7 +9,7 @@ jobs:
...
@@ -9,7 +9,7 @@ jobs:
runs-on
:
ubuntu-latest
runs-on
:
ubuntu-latest
steps
:
steps
:
-
name
:
Remind to run full CI on PR
-
name
:
Remind to run full CI on PR
uses
:
actions/github-script@
60a0d83039c74a4aee543508d2ffcb1c3799cdea
# v
7
.0.
1
uses
:
actions/github-script@
ed597411d8f924073f98dfc5c65a23a2325f34cd
# v
8
.0.
0
with
:
with
:
script
:
|
script
:
|
try {
try {
...
...
.github/workflows/stale.yml
View file @
38d80967
...
@@ -13,7 +13,7 @@ jobs:
...
@@ -13,7 +13,7 @@ jobs:
actions
:
write
actions
:
write
runs-on
:
ubuntu-latest
runs-on
:
ubuntu-latest
steps
:
steps
:
-
uses
:
actions/stale@
5bef64f19d7facfb25b37b414482c7164d639639
# v9.1
.0
-
uses
:
actions/stale@
3a9db7e6a41a89f618792c92c0e97cc736e1b13f
# v10.0
.0
with
:
with
:
# Increasing this value ensures that changes to this workflow
# Increasing this value ensures that changes to this workflow
# propagate to all issues and PRs in days rather than months
# propagate to all issues and PRs in days rather than months
...
...
Prev
1
2
3
4
5
…
40
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment