Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
61b1a9d8
Commit
61b1a9d8
authored
Mar 31, 2026
by
zhuwenwen
Browse files
Merge tag 'v0.18.1' into v0.18.1-ori
parents
0da93439
a26e8dc7
Changes
17
Hide whitespace changes
Inline
Side-by-side
Showing
17 changed files
with
363 additions
and
39 deletions
+363
-39
.buildkite/release-pipeline.yaml
.buildkite/release-pipeline.yaml
+120
-25
.buildkite/scripts/push-release-builds.sh
.buildkite/scripts/push-release-builds.sh
+113
-0
docker/Dockerfile
docker/Dockerfile
+26
-2
docker/docker-bake.hcl
docker/docker-bake.hcl
+30
-0
docker/versions.json
docker/versions.json
+3
-0
tests/evals/gsm8k/configs/Qwen3.5-35B-A3B-DEP2.yaml
tests/evals/gsm8k/configs/Qwen3.5-35B-A3B-DEP2.yaml
+2
-1
tests/evals/gsm8k/configs/Qwen3.5-35B-A3B-FP8-DEP2.yaml
tests/evals/gsm8k/configs/Qwen3.5-35B-A3B-FP8-DEP2.yaml
+2
-1
tests/evals/gsm8k/configs/Qwen3.5-397B-A17B-NVFP4-DEP2.yaml
tests/evals/gsm8k/configs/Qwen3.5-397B-A17B-NVFP4-DEP2.yaml
+9
-0
tests/evals/gsm8k/configs/models-qwen35-blackwell.txt
tests/evals/gsm8k/configs/models-qwen35-blackwell.txt
+2
-0
tests/evals/gsm8k/test_gsm8k_correctness.py
tests/evals/gsm8k/test_gsm8k_correctness.py
+4
-6
tests/models/registry.py
tests/models/registry.py
+1
-0
vllm/config/attention.py
vllm/config/attention.py
+1
-1
vllm/config/vllm.py
vllm/config/vllm.py
+19
-0
vllm/model_executor/layers/quantization/fp8.py
vllm/model_executor/layers/quantization/fp8.py
+7
-2
vllm/model_executor/layers/quantization/input_quant_fp8.py
vllm/model_executor/layers/quantization/input_quant_fp8.py
+1
-0
vllm/model_executor/layers/quantization/utils/fp8_utils.py
vllm/model_executor/layers/quantization/utils/fp8_utils.py
+5
-1
vllm/utils/deep_gemm.py
vllm/utils/deep_gemm.py
+18
-0
No files found.
.buildkite/release-pipeline.yaml
View file @
61b1a9d8
...
@@ -12,7 +12,7 @@ steps:
...
@@ -12,7 +12,7 @@ steps:
depends_on
:
~
depends_on
:
~
id
:
build-wheel-arm64-cuda-12-9
id
:
build-wheel-arm64-cuda-12-9
agents
:
agents
:
queue
:
arm64_cpu_queue_
postmerg
e
queue
:
arm64_cpu_queue_
releas
e
commands
:
commands
:
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
...
@@ -27,7 +27,7 @@ steps:
...
@@ -27,7 +27,7 @@ steps:
depends_on
:
~
depends_on
:
~
id
:
build-wheel-arm64-cuda-13-0
id
:
build-wheel-arm64-cuda-13-0
agents
:
agents
:
queue
:
arm64_cpu_queue_
postmerg
e
queue
:
arm64_cpu_queue_
releas
e
commands
:
commands
:
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
...
@@ -42,7 +42,7 @@ steps:
...
@@ -42,7 +42,7 @@ steps:
depends_on
:
~
depends_on
:
~
id
:
build-wheel-arm64-cpu
id
:
build-wheel-arm64-cpu
agents
:
agents
:
queue
:
arm64_cpu_queue_
postmerg
e
queue
:
arm64_cpu_queue_
releas
e
commands
:
commands
:
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
GIT_REPO_CHECK=1
--build-arg
VLLM_BUILD_ACL=ON
--tag
vllm-ci:build-image
--target
vllm-build
--progress
plain
-f
docker/Dockerfile.cpu
."
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
GIT_REPO_CHECK=1
--build-arg
VLLM_BUILD_ACL=ON
--tag
vllm-ci:build-image
--target
vllm-build
--progress
plain
-f
docker/Dockerfile.cpu
."
-
"
mkdir
artifacts"
-
"
mkdir
artifacts"
...
@@ -55,7 +55,7 @@ steps:
...
@@ -55,7 +55,7 @@ steps:
depends_on
:
~
depends_on
:
~
id
:
build-wheel-x86-cuda-12-9
id
:
build-wheel-x86-cuda-12-9
agents
:
agents
:
queue
:
cpu_queue_
postmerg
e
queue
:
cpu_queue_
releas
e
commands
:
commands
:
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
USE_SCCACHE=1
--build-arg
GIT_REPO_CHECK=1
--build-arg
CUDA_VERSION=12.9.1
--tag
vllm-ci:build-image
--target
build
--progress
plain
-f
docker/Dockerfile
."
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
USE_SCCACHE=1
--build-arg
GIT_REPO_CHECK=1
--build-arg
CUDA_VERSION=12.9.1
--tag
vllm-ci:build-image
--target
build
--progress
plain
-f
docker/Dockerfile
."
-
"
mkdir
artifacts"
-
"
mkdir
artifacts"
...
@@ -68,7 +68,7 @@ steps:
...
@@ -68,7 +68,7 @@ steps:
depends_on
:
~
depends_on
:
~
id
:
build-wheel-x86-cuda-13-0
id
:
build-wheel-x86-cuda-13-0
agents
:
agents
:
queue
:
cpu_queue_
postmerg
e
queue
:
cpu_queue_
releas
e
commands
:
commands
:
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
USE_SCCACHE=1
--build-arg
GIT_REPO_CHECK=1
--build-arg
CUDA_VERSION=13.0.1
--build-arg
BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04
--tag
vllm-ci:build-image
--target
build
--progress
plain
-f
docker/Dockerfile
."
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
USE_SCCACHE=1
--build-arg
GIT_REPO_CHECK=1
--build-arg
CUDA_VERSION=13.0.1
--build-arg
BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04
--tag
vllm-ci:build-image
--target
build
--progress
plain
-f
docker/Dockerfile
."
-
"
mkdir
artifacts"
-
"
mkdir
artifacts"
...
@@ -81,7 +81,7 @@ steps:
...
@@ -81,7 +81,7 @@ steps:
depends_on
:
~
depends_on
:
~
id
:
build-wheel-x86-cpu
id
:
build-wheel-x86-cpu
agents
:
agents
:
queue
:
cpu_queue_
postmerg
e
queue
:
cpu_queue_
releas
e
commands
:
commands
:
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
GIT_REPO_CHECK=1
--build-arg
VLLM_CPU_X86=true
--tag
vllm-ci:build-image
--target
vllm-build
--progress
plain
-f
docker/Dockerfile.cpu
."
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
GIT_REPO_CHECK=1
--build-arg
VLLM_CPU_X86=true
--tag
vllm-ci:build-image
--target
vllm-build
--progress
plain
-f
docker/Dockerfile.cpu
."
-
"
mkdir
artifacts"
-
"
mkdir
artifacts"
...
@@ -97,7 +97,7 @@ steps:
...
@@ -97,7 +97,7 @@ steps:
depends_on
:
~
depends_on
:
~
id
:
build-release-image-x86
id
:
build-release-image-x86
agents
:
agents
:
queue
:
cpu_queue_
postmerg
e
queue
:
cpu_queue_
releas
e
commands
:
commands
:
-
"
aws
ecr-public
get-login-password
--region
us-east-1
|
docker
login
--username
AWS
--password-stdin
public.ecr.aws/q9t5s3a7"
-
"
aws
ecr-public
get-login-password
--region
us-east-1
|
docker
login
--username
AWS
--password-stdin
public.ecr.aws/q9t5s3a7"
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
USE_SCCACHE=1
--build-arg
GIT_REPO_CHECK=1
--build-arg
CUDA_VERSION=12.9.1
--build-arg
FLASHINFER_AOT_COMPILE=true
--build-arg
INSTALL_KV_CONNECTORS=true
--tag
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname
-m)
--target
vllm-openai
--progress
plain
-f
docker/Dockerfile
."
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
USE_SCCACHE=1
--build-arg
GIT_REPO_CHECK=1
--build-arg
CUDA_VERSION=12.9.1
--build-arg
FLASHINFER_AOT_COMPILE=true
--build-arg
INSTALL_KV_CONNECTORS=true
--tag
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname
-m)
--target
vllm-openai
--progress
plain
-f
docker/Dockerfile
."
...
@@ -110,7 +110,7 @@ steps:
...
@@ -110,7 +110,7 @@ steps:
depends_on
:
~
depends_on
:
~
id
:
build-release-image-arm64
id
:
build-release-image-arm64
agents
:
agents
:
queue
:
arm64_cpu_queue_
postmerg
e
queue
:
arm64_cpu_queue_
releas
e
commands
:
commands
:
-
"
aws
ecr-public
get-login-password
--region
us-east-1
|
docker
login
--username
AWS
--password-stdin
public.ecr.aws/q9t5s3a7"
-
"
aws
ecr-public
get-login-password
--region
us-east-1
|
docker
login
--username
AWS
--password-stdin
public.ecr.aws/q9t5s3a7"
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
USE_SCCACHE=1
--build-arg
GIT_REPO_CHECK=1
--build-arg
CUDA_VERSION=12.9.1
--build-arg
FLASHINFER_AOT_COMPILE=true
--build-arg
torch_cuda_arch_list='8.7
8.9
9.0
10.0+PTX
12.0'
--build-arg
INSTALL_KV_CONNECTORS=true
--tag
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname
-m)
--target
vllm-openai
--progress
plain
-f
docker/Dockerfile
."
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
USE_SCCACHE=1
--build-arg
GIT_REPO_CHECK=1
--build-arg
CUDA_VERSION=12.9.1
--build-arg
FLASHINFER_AOT_COMPILE=true
--build-arg
torch_cuda_arch_list='8.7
8.9
9.0
10.0+PTX
12.0'
--build-arg
INSTALL_KV_CONNECTORS=true
--tag
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname
-m)
--target
vllm-openai
--progress
plain
-f
docker/Dockerfile
."
...
@@ -120,7 +120,7 @@ steps:
...
@@ -120,7 +120,7 @@ steps:
depends_on
:
~
depends_on
:
~
id
:
build-release-image-x86-cuda-13-0
id
:
build-release-image-x86-cuda-13-0
agents
:
agents
:
queue
:
cpu_queue_
postmerg
e
queue
:
cpu_queue_
releas
e
commands
:
commands
:
-
"
aws
ecr-public
get-login-password
--region
us-east-1
|
docker
login
--username
AWS
--password-stdin
public.ecr.aws/q9t5s3a7"
-
"
aws
ecr-public
get-login-password
--region
us-east-1
|
docker
login
--username
AWS
--password-stdin
public.ecr.aws/q9t5s3a7"
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
USE_SCCACHE=1
--build-arg
GIT_REPO_CHECK=1
--build-arg
CUDA_VERSION=13.0.1
--build-arg
INSTALL_KV_CONNECTORS=true
--build-arg
BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04
--tag
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname
-m)-cu130
--target
vllm-openai
--progress
plain
-f
docker/Dockerfile
."
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
USE_SCCACHE=1
--build-arg
GIT_REPO_CHECK=1
--build-arg
CUDA_VERSION=13.0.1
--build-arg
INSTALL_KV_CONNECTORS=true
--build-arg
BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04
--tag
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname
-m)-cu130
--target
vllm-openai
--progress
plain
-f
docker/Dockerfile
."
...
@@ -133,13 +133,57 @@ steps:
...
@@ -133,13 +133,57 @@ steps:
depends_on
:
~
depends_on
:
~
id
:
build-release-image-arm64-cuda-13-0
id
:
build-release-image-arm64-cuda-13-0
agents
:
agents
:
queue
:
arm64_cpu_queue_
postmerg
e
queue
:
arm64_cpu_queue_
releas
e
commands
:
commands
:
-
"
aws
ecr-public
get-login-password
--region
us-east-1
|
docker
login
--username
AWS
--password-stdin
public.ecr.aws/q9t5s3a7"
-
"
aws
ecr-public
get-login-password
--region
us-east-1
|
docker
login
--username
AWS
--password-stdin
public.ecr.aws/q9t5s3a7"
# compute capability 12.0 for RTX-50 series / RTX PRO 6000 Blackwell, 12.1 for DGX Spark
# compute capability 12.0 for RTX-50 series / RTX PRO 6000 Blackwell, 12.1 for DGX Spark
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
USE_SCCACHE=1
--build-arg
GIT_REPO_CHECK=1
--build-arg
CUDA_VERSION=13.0.1
--build-arg
torch_cuda_arch_list='8.7
8.9
9.0
10.0+PTX
12.0
12.1'
--build-arg
INSTALL_KV_CONNECTORS=true
--build-arg
BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04
--tag
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname
-m)-cu130
--target
vllm-openai
--progress
plain
-f
docker/Dockerfile
."
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
USE_SCCACHE=1
--build-arg
GIT_REPO_CHECK=1
--build-arg
CUDA_VERSION=13.0.1
--build-arg
torch_cuda_arch_list='8.7
8.9
9.0
10.0+PTX
12.0
12.1'
--build-arg
INSTALL_KV_CONNECTORS=true
--build-arg
BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04
--tag
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname
-m)-cu130
--target
vllm-openai
--progress
plain
-f
docker/Dockerfile
."
-
"
docker
push
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname
-m)-cu130"
-
"
docker
push
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname
-m)-cu130"
-
label
:
"
Build
release
image
-
x86_64
-
CUDA
12.9
-
Ubuntu
24.04"
depends_on
:
~
id
:
build-release-image-x86-ubuntu2404
agents
:
queue
:
cpu_queue_release
commands
:
-
"
aws
ecr-public
get-login-password
--region
us-east-1
|
docker
login
--username
AWS
--password-stdin
public.ecr.aws/q9t5s3a7"
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
USE_SCCACHE=1
--build-arg
GIT_REPO_CHECK=1
--build-arg
CUDA_VERSION=12.9.1
--build-arg
UBUNTU_VERSION=24.04
--build-arg
GDRCOPY_OS_VERSION=Ubuntu24_04
--build-arg
FLASHINFER_AOT_COMPILE=true
--build-arg
torch_cuda_arch_list='8.7
8.9
9.0
10.0+PTX
12.0'
--build-arg
INSTALL_KV_CONNECTORS=true
--tag
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname
-m)-ubuntu2404
--target
vllm-openai
--progress
plain
-f
docker/Dockerfile
."
-
"
docker
push
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname
-m)-ubuntu2404"
-
"
docker
tag
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname
-m)-ubuntu2404
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-ubuntu2404"
-
"
docker
push
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-ubuntu2404"
-
label
:
"
Build
release
image
-
aarch64
-
CUDA
12.9
-
Ubuntu
24.04"
depends_on
:
~
id
:
build-release-image-arm64-ubuntu2404
agents
:
queue
:
arm64_cpu_queue_release
commands
:
-
"
aws
ecr-public
get-login-password
--region
us-east-1
|
docker
login
--username
AWS
--password-stdin
public.ecr.aws/q9t5s3a7"
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
USE_SCCACHE=1
--build-arg
GIT_REPO_CHECK=1
--build-arg
CUDA_VERSION=12.9.1
--build-arg
UBUNTU_VERSION=24.04
--build-arg
GDRCOPY_OS_VERSION=Ubuntu24_04
--build-arg
FLASHINFER_AOT_COMPILE=true
--build-arg
torch_cuda_arch_list='8.7
8.9
9.0
10.0+PTX
12.0'
--build-arg
INSTALL_KV_CONNECTORS=true
--tag
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname
-m)-ubuntu2404
--target
vllm-openai
--progress
plain
-f
docker/Dockerfile
."
-
"
docker
push
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname
-m)-ubuntu2404"
-
label
:
"
Build
release
image
-
x86_64
-
CUDA
13.0
-
Ubuntu
24.04"
depends_on
:
~
id
:
build-release-image-x86-cuda-13-0-ubuntu2404
agents
:
queue
:
cpu_queue_release
commands
:
-
"
aws
ecr-public
get-login-password
--region
us-east-1
|
docker
login
--username
AWS
--password-stdin
public.ecr.aws/q9t5s3a7"
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
USE_SCCACHE=1
--build-arg
GIT_REPO_CHECK=1
--build-arg
CUDA_VERSION=13.0.1
--build-arg
UBUNTU_VERSION=24.04
--build-arg
GDRCOPY_OS_VERSION=Ubuntu24_04
--build-arg
FLASHINFER_AOT_COMPILE=true
--build-arg
torch_cuda_arch_list='8.7
8.9
9.0
10.0+PTX
12.0
12.1'
--build-arg
INSTALL_KV_CONNECTORS=true
--build-arg
BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu24.04
--tag
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname
-m)-cu130-ubuntu2404
--target
vllm-openai
--progress
plain
-f
docker/Dockerfile
."
-
"
docker
push
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname
-m)-cu130-ubuntu2404"
-
"
docker
tag
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname
-m)-cu130-ubuntu2404
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130-ubuntu2404"
-
"
docker
push
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130-ubuntu2404"
-
label
:
"
Build
release
image
-
aarch64
-
CUDA
13.0
-
Ubuntu
24.04"
depends_on
:
~
id
:
build-release-image-arm64-cuda-13-0-ubuntu2404
agents
:
queue
:
arm64_cpu_queue_release
commands
:
-
"
aws
ecr-public
get-login-password
--region
us-east-1
|
docker
login
--username
AWS
--password-stdin
public.ecr.aws/q9t5s3a7"
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
USE_SCCACHE=1
--build-arg
GIT_REPO_CHECK=1
--build-arg
CUDA_VERSION=13.0.1
--build-arg
UBUNTU_VERSION=24.04
--build-arg
GDRCOPY_OS_VERSION=Ubuntu24_04
--build-arg
FLASHINFER_AOT_COMPILE=true
--build-arg
torch_cuda_arch_list='8.7
8.9
9.0
10.0+PTX
12.0
12.1'
--build-arg
INSTALL_KV_CONNECTORS=true
--build-arg
BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu24.04
--tag
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname
-m)-cu130-ubuntu2404
--target
vllm-openai
--progress
plain
-f
docker/Dockerfile
."
-
"
docker
push
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname
-m)-cu130-ubuntu2404"
-
block
:
"
Build
release
image
for
x86_64
CPU"
-
block
:
"
Build
release
image
for
x86_64
CPU"
key
:
block-cpu-release-image-build
key
:
block-cpu-release-image-build
depends_on
:
~
depends_on
:
~
...
@@ -148,8 +192,9 @@ steps:
...
@@ -148,8 +192,9 @@ steps:
depends_on
:
depends_on
:
-
block-cpu-release-image-build
-
block-cpu-release-image-build
-
input-release-version
-
input-release-version
id
:
build-release-image-x86-cpu
agents
:
agents
:
queue
:
cpu_queue_
postmerg
e
queue
:
cpu_queue_
releas
e
commands
:
commands
:
-
"
aws
ecr-public
get-login-password
--region
us-east-1
|
docker
login
--username
AWS
--password-stdin
public.ecr.aws/q9t5s3a7"
-
"
aws
ecr-public
get-login-password
--region
us-east-1
|
docker
login
--username
AWS
--password-stdin
public.ecr.aws/q9t5s3a7"
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
GIT_REPO_CHECK=1
--build-arg
VLLM_CPU_X86=true
--tag
public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent
meta-data
get
release-version)
--tag
public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest
--progress
plain
--target
vllm-openai
-f
docker/Dockerfile.cpu
."
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
GIT_REPO_CHECK=1
--build-arg
VLLM_CPU_X86=true
--tag
public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent
meta-data
get
release-version)
--tag
public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest
--progress
plain
--target
vllm-openai
-f
docker/Dockerfile.cpu
."
...
@@ -163,11 +208,12 @@ steps:
...
@@ -163,11 +208,12 @@ steps:
depends_on
:
~
depends_on
:
~
-
label
:
"
Build
release
image
-
arm64
-
CPU"
-
label
:
"
Build
release
image
-
arm64
-
CPU"
depends_on
:
depends_on
:
-
block-arm64-cpu-release-image-build
-
block-arm64-cpu-release-image-build
-
input-release-version
-
input-release-version
id
:
build-release-image-arm64-cpu
agents
:
agents
:
queue
:
arm64_cpu_queue_
postmerg
e
queue
:
arm64_cpu_queue_
releas
e
commands
:
commands
:
-
"
aws
ecr-public
get-login-password
--region
us-east-1
|
docker
login
--username
AWS
--password-stdin
public.ecr.aws/q9t5s3a7"
-
"
aws
ecr-public
get-login-password
--region
us-east-1
|
docker
login
--username
AWS
--password-stdin
public.ecr.aws/q9t5s3a7"
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
GIT_REPO_CHECK=1
--tag
public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent
meta-data
get
release-version)
--tag
public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest
--progress
plain
--target
vllm-openai
-f
docker/Dockerfile.cpu
."
-
"
DOCKER_BUILDKIT=1
docker
build
--build-arg
max_jobs=16
--build-arg
GIT_REPO_CHECK=1
--tag
public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent
meta-data
get
release-version)
--tag
public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest
--progress
plain
--target
vllm-openai
-f
docker/Dockerfile.cpu
."
...
@@ -185,7 +231,7 @@ steps:
...
@@ -185,7 +231,7 @@ steps:
-
build-release-image-arm64
-
build-release-image-arm64
id
:
create-multi-arch-manifest
id
:
create-multi-arch-manifest
agents
:
agents
:
queue
:
small_cpu_queue_
postmerg
e
queue
:
small_cpu_queue_
releas
e
commands
:
commands
:
-
"
aws
ecr-public
get-login-password
--region
us-east-1
|
docker
login
--username
AWS
--password-stdin
public.ecr.aws/q9t5s3a7"
-
"
aws
ecr-public
get-login-password
--region
us-east-1
|
docker
login
--username
AWS
--password-stdin
public.ecr.aws/q9t5s3a7"
-
"
docker
manifest
create
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64
--amend"
-
"
docker
manifest
create
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64
--amend"
...
@@ -196,7 +242,7 @@ steps:
...
@@ -196,7 +242,7 @@ steps:
-
create-multi-arch-manifest
-
create-multi-arch-manifest
id
:
annotate-release-workflow
id
:
annotate-release-workflow
agents
:
agents
:
queue
:
small_cpu_queue_
postmerg
e
queue
:
small_cpu_queue_
releas
e
commands
:
commands
:
-
"
bash
.buildkite/scripts/annotate-release.sh"
-
"
bash
.buildkite/scripts/annotate-release.sh"
...
@@ -206,18 +252,67 @@ steps:
...
@@ -206,18 +252,67 @@ steps:
-
build-release-image-arm64-cuda-13-0
-
build-release-image-arm64-cuda-13-0
id
:
create-multi-arch-manifest-cuda-13-0
id
:
create-multi-arch-manifest-cuda-13-0
agents
:
agents
:
queue
:
small_cpu_queue_
postmerg
e
queue
:
small_cpu_queue_
releas
e
commands
:
commands
:
-
"
aws
ecr-public
get-login-password
--region
us-east-1
|
docker
login
--username
AWS
--password-stdin
public.ecr.aws/q9t5s3a7"
-
"
aws
ecr-public
get-login-password
--region
us-east-1
|
docker
login
--username
AWS
--password-stdin
public.ecr.aws/q9t5s3a7"
-
"
docker
manifest
create
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu130
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu130
--amend"
-
"
docker
manifest
create
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu130
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu130
--amend"
-
"
docker
manifest
push
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
-
"
docker
manifest
push
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130"
-
label
:
"
Create
multi-arch
manifest
-
CUDA
12.9
-
Ubuntu
24.04"
depends_on
:
-
build-release-image-x86-ubuntu2404
-
build-release-image-arm64-ubuntu2404
id
:
create-multi-arch-manifest-ubuntu2404
agents
:
queue
:
small_cpu_queue_release
commands
:
-
"
aws
ecr-public
get-login-password
--region
us-east-1
|
docker
login
--username
AWS
--password-stdin
public.ecr.aws/q9t5s3a7"
-
"
docker
manifest
create
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-ubuntu2404
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-ubuntu2404
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-ubuntu2404
--amend"
-
"
docker
manifest
push
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-ubuntu2404"
-
label
:
"
Create
multi-arch
manifest
-
CUDA
13.0
-
Ubuntu
24.04"
depends_on
:
-
build-release-image-x86-cuda-13-0-ubuntu2404
-
build-release-image-arm64-cuda-13-0-ubuntu2404
id
:
create-multi-arch-manifest-cuda-13-0-ubuntu2404
agents
:
queue
:
small_cpu_queue_release
commands
:
-
"
aws
ecr-public
get-login-password
--region
us-east-1
|
docker
login
--username
AWS
--password-stdin
public.ecr.aws/q9t5s3a7"
-
"
docker
manifest
create
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130-ubuntu2404
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64-cu130-ubuntu2404
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64-cu130-ubuntu2404
--amend"
-
"
docker
manifest
push
public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-cu130-ubuntu2404"
-
block
:
"
Confirm
publishing
release
images
to
DockerHub"
key
:
block-publish-release-images-dockerhub
depends_on
:
-
create-multi-arch-manifest
-
create-multi-arch-manifest-cuda-13-0
-
build-release-image-x86-cpu
-
build-release-image-arm64-cpu
-
build-rocm-release-image
-
label
:
"
Publish
release
images
to
DockerHub"
key
:
publish-release-images-dockerhub
depends_on
:
-
block-publish-release-images-dockerhub
agents
:
queue
:
small_cpu_queue_release
commands
:
-
"
bash
.buildkite/scripts/push-release-builds.sh"
plugins
:
-
docker-login#v3.0.0
:
username
:
vllmbot
password-env
:
DOCKERHUB_TOKEN
env
:
DOCKER_BUILDKIT
:
"
1"
DOCKERHUB_USERNAME
:
"
vllmbot"
-
label
:
"
Publish
nightly
multi-arch
image
to
DockerHub"
-
label
:
"
Publish
nightly
multi-arch
image
to
DockerHub"
depends_on
:
depends_on
:
-
create-multi-arch-manifest
-
create-multi-arch-manifest
if
:
build.env("NIGHTLY") == "1"
if
:
build.env("NIGHTLY") == "1"
agents
:
agents
:
queue
:
small_cpu_queue_
postmerg
e
queue
:
small_cpu_queue_
releas
e
commands
:
commands
:
-
"
bash
.buildkite/scripts/push-nightly-builds.sh"
-
"
bash
.buildkite/scripts/push-nightly-builds.sh"
# Clean up old nightly builds (keep only last 14)
# Clean up old nightly builds (keep only last 14)
...
@@ -235,7 +330,7 @@ steps:
...
@@ -235,7 +330,7 @@ steps:
-
create-multi-arch-manifest-cuda-13-0
-
create-multi-arch-manifest-cuda-13-0
if
:
build.env("NIGHTLY") == "1"
if
:
build.env("NIGHTLY") == "1"
agents
:
agents
:
queue
:
small_cpu_queue_
postmerg
e
queue
:
small_cpu_queue_
releas
e
commands
:
commands
:
-
"
bash
.buildkite/scripts/push-nightly-builds.sh
cu130"
-
"
bash
.buildkite/scripts/push-nightly-builds.sh
cu130"
# Clean up old nightly builds (keep only last 14)
# Clean up old nightly builds (keep only last 14)
...
@@ -262,7 +357,7 @@ steps:
...
@@ -262,7 +357,7 @@ steps:
-
block-upload-release-wheels
-
block-upload-release-wheels
id
:
upload-release-wheels
id
:
upload-release-wheels
agents
:
agents
:
queue
:
small_cpu_queue_
postmerg
e
queue
:
small_cpu_queue_
releas
e
commands
:
commands
:
-
"
bash
.buildkite/scripts/upload-release-wheels-pypi.sh"
-
"
bash
.buildkite/scripts/upload-release-wheels-pypi.sh"
...
@@ -344,7 +439,7 @@ steps:
...
@@ -344,7 +439,7 @@ steps:
-
step
:
input-rocm-config
-
step
:
input-rocm-config
allow_failure
:
true
# Allow failure so non-UI builds can proceed (input step is skipped)
allow_failure
:
true
# Allow failure so non-UI builds can proceed (input step is skipped)
agents
:
agents
:
queue
:
cpu_queue_
postmerg
e
queue
:
cpu_queue_
releas
e
commands
:
commands
:
# Set configuration and check cache
# Set configuration and check cache
-
|
-
|
...
@@ -486,7 +581,7 @@ steps:
...
@@ -486,7 +581,7 @@ steps:
-
step
:
build-rocm-base-wheels
-
step
:
build-rocm-base-wheels
allow_failure
:
false
allow_failure
:
false
agents
:
agents
:
queue
:
cpu_queue_
postmerg
e
queue
:
cpu_queue_
releas
e
timeout_in_minutes
:
180
timeout_in_minutes
:
180
commands
:
commands
:
# Download artifacts and prepare Docker image
# Download artifacts and prepare Docker image
...
@@ -596,7 +691,7 @@ steps:
...
@@ -596,7 +691,7 @@ steps:
-
step
:
build-rocm-vllm-wheel
-
step
:
build-rocm-vllm-wheel
allow_failure
:
false
allow_failure
:
false
agents
:
agents
:
queue
:
cpu_queue_
postmerg
e
queue
:
cpu_queue_
releas
e
timeout_in_minutes
:
60
timeout_in_minutes
:
60
commands
:
commands
:
# Download all wheel artifacts and run upload
# Download all wheel artifacts and run upload
...
@@ -645,7 +740,7 @@ steps:
...
@@ -645,7 +740,7 @@ steps:
-
step
:
input-release-version
-
step
:
input-release-version
allow_failure
:
true
allow_failure
:
true
agents
:
agents
:
queue
:
small_
cpu_queue_
postmerg
e
queue
:
cpu_queue_
releas
e
commands
:
commands
:
-
"
bash
.buildkite/scripts/annotate-rocm-release.sh"
-
"
bash
.buildkite/scripts/annotate-rocm-release.sh"
env
:
env
:
...
@@ -662,7 +757,7 @@ steps:
...
@@ -662,7 +757,7 @@ steps:
depends_on
:
block-generate-root-index-rocm-wheels
depends_on
:
block-generate-root-index-rocm-wheels
id
:
generate-root-index-rocm-wheels
id
:
generate-root-index-rocm-wheels
agents
:
agents
:
queue
:
cpu_queue_
postmerg
e
queue
:
cpu_queue_
releas
e
commands
:
commands
:
-
"
bash
tools/vllm-rocm/generate-rocm-wheels-root-index.sh"
-
"
bash
tools/vllm-rocm/generate-rocm-wheels-root-index.sh"
env
:
env
:
...
@@ -676,7 +771,7 @@ steps:
...
@@ -676,7 +771,7 @@ steps:
-
step
:
build-rocm-base-wheels
-
step
:
build-rocm-base-wheels
allow_failure
:
false
allow_failure
:
false
agents
:
agents
:
queue
:
cpu_queue_
postmerg
e
queue
:
cpu_queue_
releas
e
timeout_in_minutes
:
60
timeout_in_minutes
:
60
commands
:
commands
:
-
|
-
|
...
...
.buildkite/scripts/push-release-builds.sh
0 → 100755
View file @
61b1a9d8
#!/bin/bash
#
# Publish vLLM release images from the staging ECR repositories to DockerHub.
#
# For the commit named by BUILDKITE_COMMIT (which must carry an exact git tag)
# this script pulls the per-architecture images from the staging registry,
# re-tags them as "latest*" and "v<version>*" under the vllm/* repositories,
# pushes them, and -- for the CUDA and CPU families -- assembles and pushes
# multi-arch manifest lists.
#
# Required environment:
#   BUILDKITE_COMMIT  commit whose staged images are being published.
#
# NOTE(review): presumably the caller has already authenticated docker for
# pushing to the vllm/* repositories -- confirm against the pipeline step.
set -euo pipefail

# Fail early with a readable message instead of a bare "unbound variable"
# error from `set -u` further down.
: "${BUILDKITE_COMMIT:?BUILDKITE_COMMIT must be set to the commit being released}"

ECR_REGISTRY="public.ecr.aws/q9t5s3a7"

# Tag the source image ($1) under every target name ($2...), then push each target.
retag_and_push() {
    local source_image="$1"
    shift
    local target
    for target in "$@"; do
        docker tag "${source_image}" "${target}"
    done
    for target in "$@"; do
        docker push "${target}"
    done
}

# Drop any stale local manifest list ($1), rebuild it from the images ($2...), and push it.
publish_manifest() {
    local manifest="$1"
    shift
    # Removal is best-effort: the manifest list may simply not exist locally.
    docker manifest rm "${manifest}" || true
    docker manifest create "${manifest}" "$@"
    docker manifest push "${manifest}"
}

# Ensure git tags are up-to-date (Buildkite's default fetch doesn't always include tags)
echo "Fetching latest tags from origin..."
git fetch --tags --force origin

# Derive release version from the git tag on the current commit.
# The pipeline must be triggered on a tagged commit (e.g. v0.18.1).
# `|| true` keeps `set -e` from aborting here so the check below can report it.
RELEASE_VERSION=$(git describe --exact-match --tags "${BUILDKITE_COMMIT}" 2>/dev/null || true)

if [ -z "${RELEASE_VERSION}" ]; then
    # Diagnostics belong on stderr so they are not mistaken for normal output.
    echo "[FATAL] Commit ${BUILDKITE_COMMIT} has no exact git tag. " \
         "Release images must be published from a tagged commit." >&2
    exit 1
fi

# Strip leading 'v' for use in Docker tags (e.g. v0.18.1 -> 0.18.1)
PURE_VERSION="${RELEASE_VERSION#v}"

echo "========================================"
echo "Publishing release images"
echo " Commit: ${BUILDKITE_COMMIT}"
echo " Release version: ${RELEASE_VERSION}"
echo "========================================"

set -x

# ---- CUDA (default, CUDA 12.9) ----
CUDA_SRC="${ECR_REGISTRY}/vllm-release-repo:${BUILDKITE_COMMIT}"

docker pull "${CUDA_SRC}-x86_64"
docker pull "${CUDA_SRC}-aarch64"

retag_and_push "${CUDA_SRC}-x86_64" \
    "vllm/vllm-openai:latest-x86_64" \
    "vllm/vllm-openai:v${PURE_VERSION}-x86_64"
retag_and_push "${CUDA_SRC}-aarch64" \
    "vllm/vllm-openai:latest-aarch64" \
    "vllm/vllm-openai:v${PURE_VERSION}-aarch64"

publish_manifest "vllm/vllm-openai:latest" \
    "vllm/vllm-openai:latest-x86_64" \
    "vllm/vllm-openai:latest-aarch64"
publish_manifest "vllm/vllm-openai:v${PURE_VERSION}" \
    "vllm/vllm-openai:v${PURE_VERSION}-x86_64" \
    "vllm/vllm-openai:v${PURE_VERSION}-aarch64"

# ---- CUDA 13.0 ----
docker pull "${CUDA_SRC}-x86_64-cu130"
docker pull "${CUDA_SRC}-aarch64-cu130"

retag_and_push "${CUDA_SRC}-x86_64-cu130" \
    "vllm/vllm-openai:latest-x86_64-cu130" \
    "vllm/vllm-openai:v${PURE_VERSION}-x86_64-cu130"
retag_and_push "${CUDA_SRC}-aarch64-cu130" \
    "vllm/vllm-openai:latest-aarch64-cu130" \
    "vllm/vllm-openai:v${PURE_VERSION}-aarch64-cu130"

publish_manifest "vllm/vllm-openai:latest-cu130" \
    "vllm/vllm-openai:latest-x86_64-cu130" \
    "vllm/vllm-openai:latest-aarch64-cu130"
publish_manifest "vllm/vllm-openai:v${PURE_VERSION}-cu130" \
    "vllm/vllm-openai:v${PURE_VERSION}-x86_64-cu130" \
    "vllm/vllm-openai:v${PURE_VERSION}-aarch64-cu130"

# ---- ROCm ----
# ROCm images are published as plain tags; no multi-arch manifest is built.
docker pull "${CUDA_SRC}-rocm"
docker pull "${CUDA_SRC}-rocm-base"

retag_and_push "${CUDA_SRC}-rocm" \
    "vllm/vllm-openai-rocm:latest" \
    "vllm/vllm-openai-rocm:v${PURE_VERSION}"
retag_and_push "${CUDA_SRC}-rocm-base" \
    "vllm/vllm-openai-rocm:latest-base" \
    "vllm/vllm-openai-rocm:v${PURE_VERSION}-base"

# ---- CPU ----
# CPU images in ECR are tagged with the full version including 'v' (e.g. v0.18.1),
# matching the value from the Buildkite release-version metadata input.
CPU_X86_SRC="${ECR_REGISTRY}/vllm-cpu-release-repo:${RELEASE_VERSION}"
CPU_ARM64_SRC="${ECR_REGISTRY}/vllm-arm64-cpu-release-repo:${RELEASE_VERSION}"

docker pull "${CPU_X86_SRC}"
docker pull "${CPU_ARM64_SRC}"

retag_and_push "${CPU_X86_SRC}" \
    "vllm/vllm-openai-cpu:latest-x86_64" \
    "vllm/vllm-openai-cpu:v${PURE_VERSION}-x86_64"
retag_and_push "${CPU_ARM64_SRC}" \
    "vllm/vllm-openai-cpu:latest-arm64" \
    "vllm/vllm-openai-cpu:v${PURE_VERSION}-arm64"

publish_manifest "vllm/vllm-openai-cpu:latest" \
    "vllm/vllm-openai-cpu:latest-x86_64" \
    "vllm/vllm-openai-cpu:latest-arm64"
publish_manifest "vllm/vllm-openai-cpu:v${PURE_VERSION}" \
    "vllm/vllm-openai-cpu:v${PURE_VERSION}-x86_64" \
    "vllm/vllm-openai-cpu:v${PURE_VERSION}-arm64"

echo "========================================"
echo "Successfully published release images for ${RELEASE_VERSION}"
echo "========================================"
docker/Dockerfile
View file @
61b1a9d8
...
@@ -24,6 +24,7 @@
...
@@ -24,6 +24,7 @@
ARG
CUDA_VERSION=12.9.1
ARG
CUDA_VERSION=12.9.1
ARG
PYTHON_VERSION=3.12
ARG
PYTHON_VERSION=3.12
ARG
UBUNTU_VERSION=22.04
# By parameterizing the base images, we allow third-party to use their own
# By parameterizing the base images, we allow third-party to use their own
# base images. One use case is hermetic builds with base images stored in
# base images. One use case is hermetic builds with base images stored in
...
@@ -38,7 +39,7 @@ ARG PYTHON_VERSION=3.12
...
@@ -38,7 +39,7 @@ ARG PYTHON_VERSION=3.12
# version are not backwards compatible with OSes that use an earlier version.
# version are not backwards compatible with OSes that use an earlier version.
ARG
BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
ARG
BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
# Using cuda base image with minimal dependencies necessary for JIT compilation (FlashInfer, DeepGEMM, EP kernels)
# Using cuda base image with minimal dependencies necessary for JIT compilation (FlashInfer, DeepGEMM, EP kernels)
ARG
FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-base-ubuntu
22.04
ARG
FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-base-ubuntu
${UBUNTU_VERSION}
# By parameterizing the Deadsnakes repository URL, we allow third-party to use
# By parameterizing the Deadsnakes repository URL, we allow third-party to use
# their own mirror. When doing so, we don't benefit from the transparent
# their own mirror. When doing so, we don't benefit from the transparent
...
@@ -111,6 +112,10 @@ RUN apt-get update -y \
...
@@ -111,6 +112,10 @@ RUN apt-get update -y \
gcc-10 \
gcc-10 \
g++-10 \
g++-10 \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10 \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10 \
# Install python dev headers if available (needed for cmake FindPython on Ubuntu 24.04
# which ships cmake 3.28 and requires Development.SABIModule; silently skipped on
# Ubuntu 20.04/22.04 where python3.x-dev is not available without a PPA)
&& (apt-get install -y --no-install-recommends python${PYTHON_VERSION}-dev 2>/dev/null || true) \
&& rm -rf /var/lib/apt/lists/* \
&& rm -rf /var/lib/apt/lists/* \
&& curl -LsSf https://astral.sh/uv/install.sh | sh \
&& curl -LsSf https://astral.sh/uv/install.sh | sh \
&& $HOME/.local/bin/uv venv /opt/venv --python ${PYTHON_VERSION} \
&& $HOME/.local/bin/uv venv /opt/venv --python ${PYTHON_VERSION} \
...
@@ -507,7 +512,6 @@ RUN apt-get update -y \
...
@@ -507,7 +512,6 @@ RUN apt-get update -y \
software-properties-common
\
software-properties-common
\
curl
\
curl
\
sudo
\
sudo
\
python3-pip
\
ffmpeg
\
ffmpeg
\
libsm6
\
libsm6
\
libxext6
\
libxext6
\
...
@@ -535,6 +539,7 @@ RUN apt-get update -y \
...
@@ -535,6 +539,7 @@ RUN apt-get update -y \
&&
update-alternatives
--install
/usr/bin/python3 python3 /usr/bin/python
${
PYTHON_VERSION
}
1
\
&&
update-alternatives
--install
/usr/bin/python3 python3 /usr/bin/python
${
PYTHON_VERSION
}
1
\
&&
update-alternatives
--set
python3 /usr/bin/python
${
PYTHON_VERSION
}
\
&&
update-alternatives
--set
python3 /usr/bin/python
${
PYTHON_VERSION
}
\
&&
ln
-sf
/usr/bin/python
${
PYTHON_VERSION
}
-config
/usr/bin/python3-config
\
&&
ln
-sf
/usr/bin/python
${
PYTHON_VERSION
}
-config
/usr/bin/python3-config
\
&&
rm
-f
/usr/lib/python
${
PYTHON_VERSION
}
/EXTERNALLY-MANAGED
\
&&
curl
-sS
${
GET_PIP_URL
}
| python
${
PYTHON_VERSION
}
\
&&
curl
-sS
${
GET_PIP_URL
}
| python
${
PYTHON_VERSION
}
\
&&
python3
--version
&&
python3
-m
pip
--version
&&
python3
--version
&&
python3
-m
pip
--version
...
@@ -593,6 +598,25 @@ RUN --mount=type=cache,target=/root/.cache/uv \
...
@@ -593,6 +598,25 @@ RUN --mount=type=cache,target=/root/.cache/uv \
--extra-index-url
https://flashinfer.ai/whl/cu
$(
echo
$CUDA_VERSION
|
cut
-d
.
-f1
,2 |
tr
-d
'.'
)
\
--extra-index-url
https://flashinfer.ai/whl/cu
$(
echo
$CUDA_VERSION
|
cut
-d
.
-f1
,2 |
tr
-d
'.'
)
\
&&
flashinfer show-config
&&
flashinfer show-config
# Pre-download FlashInfer TRTLLM BMM headers for air-gapped environments.
# At runtime, MoE JIT compilation downloads these from edge.urm.nvidia.com
# which fails without internet. This step caches them at build time.
RUN
python3
<<
'
PYEOF
'
from
flashinfer.jit import env as jit_env
from
flashinfer.jit.cubin_loader import download_trtllm_headers, get_cubin
from
flashinfer.artifacts import ArtifactPath, CheckSumHash
download_trtllm_headers(
'bmm',
jit_env.FLASHINFER_CUBIN_DIR / 'flashinfer' / 'trtllm' / 'batched_gemm' / 'trtllmGen_bmm_export',
f'{ArtifactPath.TRTLLM_GEN_BMM}/include/trtllmGen_bmm_export',
ArtifactPath.TRTLLM_GEN_BMM,
get_cubin(f'{ArtifactPath.TRTLLM_GEN_BMM}/checksums.txt', CheckSumHash.TRTLLM_GEN_BMM),
)
print('FlashInfer TRTLLM BMM headers downloaded successfully')
PYEOF
# ============================================================
# ============================================================
# OPENAI API SERVER DEPENDENCIES
# OPENAI API SERVER DEPENDENCIES
# Pre-install these to avoid reinstalling on every vLLM wheel rebuild
# Pre-install these to avoid reinstalling on every vLLM wheel rebuild
...
...
docker/docker-bake.hcl
View file @
61b1a9d8
...
@@ -33,6 +33,10 @@ group "default" {
...
@@ -33,6 +33,10 @@ group "default" {
targets = ["openai"]
targets = ["openai"]
}
}
group "all" {
targets = ["openai", "openai-ubuntu2404"]
}
# Base targets
# Base targets
target "_common" {
target "_common" {
...
@@ -74,3 +78,29 @@ target "openai" {
...
@@ -74,3 +78,29 @@ target "openai" {
tags = ["vllm:openai"]
tags = ["vllm:openai"]
output = ["type=docker"]
output = ["type=docker"]
}
}
# Ubuntu 24.04 targets
target "test-ubuntu2404" {
inherits = ["_common", "_labels"]
target = "test"
tags = ["vllm:test-ubuntu24.04"]
args = {
UBUNTU_VERSION = "24.04"
GDRCOPY_OS_VERSION = "Ubuntu24_04"
FLASHINFER_AOT_COMPILE = "true"
}
output = ["type=docker"]
}
target "openai-ubuntu2404" {
inherits = ["_common", "_labels"]
target = "vllm-openai"
tags = ["vllm:openai-ubuntu24.04"]
args = {
UBUNTU_VERSION = "24.04"
GDRCOPY_OS_VERSION = "Ubuntu24_04"
FLASHINFER_AOT_COMPILE = "true"
}
output = ["type=docker"]
}
docker/versions.json
View file @
61b1a9d8
...
@@ -7,6 +7,9 @@
...
@@ -7,6 +7,9 @@
"PYTHON_VERSION"
:
{
"PYTHON_VERSION"
:
{
"default"
:
"3.12"
"default"
:
"3.12"
},
},
"UBUNTU_VERSION"
:
{
"default"
:
"22.04"
},
"BUILD_BASE_IMAGE"
:
{
"BUILD_BASE_IMAGE"
:
{
"default"
:
"nvidia/cuda:12.9.1-devel-ubuntu20.04"
"default"
:
"nvidia/cuda:12.9.1-devel-ubuntu20.04"
},
},
...
...
tests/evals/gsm8k/configs/Qwen3.5-35B-A3B-DEP2.yaml
View file @
61b1a9d8
model_name
:
"
Qwen/Qwen3.5-35B-A3B"
model_name
:
"
Qwen/Qwen3.5-35B-A3B"
accuracy_threshold
:
0.86
accuracy_threshold
:
0.84
tolerance
:
0.03
num_questions
:
1319
num_questions
:
1319
num_fewshot
:
5
num_fewshot
:
5
server_args
:
>-
server_args
:
>-
...
...
tests/evals/gsm8k/configs/Qwen3.5-35B-A3B-FP8-DEP2.yaml
View file @
61b1a9d8
model_name
:
"
Qwen/Qwen3.5-35B-A3B-FP8"
model_name
:
"
Qwen/Qwen3.5-35B-A3B-FP8"
accuracy_threshold
:
0.86
accuracy_threshold
:
0.79
tolerance
:
0.03
num_questions
:
1319
num_questions
:
1319
num_fewshot
:
5
num_fewshot
:
5
server_args
:
>-
server_args
:
>-
...
...
tests/evals/gsm8k/configs/Qwen3.5-397B-A17B-NVFP4-DEP2.yaml
0 → 100644
View file @
61b1a9d8
model_name
:
"
nvidia/Qwen3.5-397B-A17B-NVFP4"
accuracy_threshold
:
0.88
tolerance
:
0.03
num_questions
:
1319
num_fewshot
:
5
server_args
:
>-
--max-model-len 4096
--data-parallel-size 2
--enable-expert-parallel
tests/evals/gsm8k/configs/models-qwen35-blackwell.txt
View file @
61b1a9d8
Qwen3.5-35B-A3B-DEP2.yaml
Qwen3.5-35B-A3B-DEP2.yaml
Qwen3.5-35B-A3B-FP8-DEP2.yaml
Qwen3.5-397B-A17B-NVFP4-DEP2.yaml
tests/evals/gsm8k/test_gsm8k_correctness.py
View file @
61b1a9d8
...
@@ -19,8 +19,6 @@ from vllm.platforms import current_platform
...
@@ -19,8 +19,6 @@ from vllm.platforms import current_platform
from
.gsm8k_eval
import
evaluate_gsm8k
from
.gsm8k_eval
import
evaluate_gsm8k
TOL
=
0.08
# Absolute tolerance for accuracy comparison
def
run_gsm8k_eval
(
eval_config
:
dict
,
server_url
:
str
)
->
dict
:
def
run_gsm8k_eval
(
eval_config
:
dict
,
server_url
:
str
)
->
dict
:
"""Run GSM8K evaluation using our isolated script."""
"""Run GSM8K evaluation using our isolated script."""
...
@@ -109,20 +107,20 @@ def test_gsm8k_correctness(config_filename):
...
@@ -109,20 +107,20 @@ def test_gsm8k_correctness(config_filename):
measured_metric
=
results
[
"accuracy"
]
measured_metric
=
results
[
"accuracy"
]
expected_metric
=
eval_config
[
"accuracy_threshold"
]
expected_metric
=
eval_config
[
"accuracy_threshold"
]
tol
=
eval_config
.
get
(
"tolerance"
,
0.08
)
print
(
f
"GSM8K Results for
{
eval_config
[
'model_name'
]
}
:"
)
print
(
f
"GSM8K Results for
{
eval_config
[
'model_name'
]
}
:"
)
print
(
f
" Measured metric:
{
measured_metric
:.
4
f
}
"
)
print
(
f
" Measured metric:
{
measured_metric
:.
4
f
}
"
)
print
(
f
" Expected metric:
{
expected_metric
:.
4
f
}
"
)
print
(
f
" Expected metric:
{
expected_metric
:.
4
f
}
"
)
print
(
f
" Tolerance:
{
TOL
:.
4
f
}
"
)
print
(
f
" Tolerance:
{
tol
:.
4
f
}
"
)
print
(
f
" Questions:
{
results
[
'num_questions'
]
}
"
)
print
(
f
" Questions:
{
results
[
'num_questions'
]
}
"
)
print
(
f
" Invalid rate:
{
results
[
'invalid_rate'
]:.
3
f
}
"
)
print
(
f
" Invalid rate:
{
results
[
'invalid_rate'
]:.
3
f
}
"
)
print
(
f
" Latency:
{
results
[
'latency'
]:.
1
f
}
s"
)
print
(
f
" Latency:
{
results
[
'latency'
]:.
1
f
}
s"
)
print
(
f
" QPS:
{
results
[
'questions_per_second'
]:.
1
f
}
"
)
print
(
f
" QPS:
{
results
[
'questions_per_second'
]:.
1
f
}
"
)
# Verify metric is within tolerance
assert
measured_metric
>=
expected_metric
-
tol
,
(
assert
measured_metric
>=
expected_metric
-
TOL
,
(
f
"GSM8K metric too low:
{
measured_metric
:.
4
f
}
< "
f
"GSM8K metric too low:
{
measured_metric
:.
4
f
}
< "
f
"
{
expected_metric
:.
4
f
}
-
{
TOL
:.
4
f
}
=
{
expected_metric
-
TOL
:.
4
f
}
"
f
"
{
expected_metric
:.
4
f
}
-
{
tol
:.
4
f
}
=
{
expected_metric
-
tol
:.
4
f
}
"
)
)
print
(
f
"✅ GSM8K test passed for
{
eval_config
[
'model_name'
]
}
"
)
print
(
f
"✅ GSM8K test passed for
{
eval_config
[
'model_name'
]
}
"
)
tests/models/registry.py
View file @
61b1a9d8
...
@@ -791,6 +791,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
...
@@ -791,6 +791,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"Ernie4_5_VLMoeForConditionalGeneration"
:
_HfExamplesInfo
(
"Ernie4_5_VLMoeForConditionalGeneration"
:
_HfExamplesInfo
(
"baidu/ERNIE-4.5-VL-28B-A3B-PT"
,
"baidu/ERNIE-4.5-VL-28B-A3B-PT"
,
trust_remote_code
=
True
,
trust_remote_code
=
True
,
revision
=
"refs/pr/17"
,
),
),
"FireRedASR2ForConditionalGeneration"
:
_HfExamplesInfo
(
"FireRedASR2ForConditionalGeneration"
:
_HfExamplesInfo
(
"allendou/FireRedASR2-LLM-vllm"
,
"allendou/FireRedASR2-LLM-vllm"
,
...
...
vllm/config/attention.py
View file @
61b1a9d8
...
@@ -30,7 +30,7 @@ class AttentionConfig:
...
@@ -30,7 +30,7 @@ class AttentionConfig:
use_cudnn_prefill
:
bool
=
False
use_cudnn_prefill
:
bool
=
False
"""Whether to use cudnn prefill."""
"""Whether to use cudnn prefill."""
use_trtllm_ragged_deepseek_prefill
:
bool
=
Fals
e
use_trtllm_ragged_deepseek_prefill
:
bool
=
Tru
e
"""Whether to use TRTLLM ragged deepseek prefill."""
"""Whether to use TRTLLM ragged deepseek prefill."""
use_trtllm_attention
:
bool
|
None
=
None
use_trtllm_attention
:
bool
|
None
=
None
...
...
vllm/config/vllm.py
View file @
61b1a9d8
...
@@ -682,6 +682,25 @@ class VllmConfig:
...
@@ -682,6 +682,25 @@ class VllmConfig:
self
.
model_config
,
self
.
load_config
self
.
model_config
,
self
.
load_config
)
)
if
(
self
.
quant_config
is
not
None
and
self
.
model_config
is
not
None
and
hasattr
(
self
.
quant_config
,
"use_deep_gemm"
)
and
self
.
quant_config
.
use_deep_gemm
is
None
):
from
vllm.utils.deep_gemm
import
should_auto_disable_deep_gemm
model_type
=
getattr
(
self
.
model_config
.
hf_text_config
,
"model_type"
,
None
)
if
should_auto_disable_deep_gemm
(
model_type
):
self
.
quant_config
.
use_deep_gemm
=
False
logger
.
warning_once
(
"Auto-disabled DeepGemm for model_type=%s on Blackwell. "
"DeepGemm E8M0 scale format causes accuracy degradation "
"for this architecture. Falling back to CUTLASS. "
"To disable DeepGemm globally, set VLLM_USE_DEEP_GEMM=0."
,
model_type
,
)
from
vllm.v1.executor.abstract
import
Executor
from
vllm.v1.executor.abstract
import
Executor
executor_backend
=
self
.
parallel_config
.
distributed_executor_backend
executor_backend
=
self
.
parallel_config
.
distributed_executor_backend
...
...
vllm/model_executor/layers/quantization/fp8.py
View file @
61b1a9d8
...
@@ -135,6 +135,7 @@ class Fp8Config(QuantizationConfig):
...
@@ -135,6 +135,7 @@ class Fp8Config(QuantizationConfig):
f
"
{
activation_scheme
}
activation scheme."
f
"
{
activation_scheme
}
activation scheme."
)
)
self
.
weight_block_size
=
weight_block_size
self
.
weight_block_size
=
weight_block_size
self
.
use_deep_gemm
:
bool
|
None
=
None
@
classmethod
@
classmethod
def
get_name
(
cls
)
->
QuantizationMethods
:
def
get_name
(
cls
)
->
QuantizationMethods
:
...
@@ -291,7 +292,10 @@ class Fp8LinearMethod(LinearMethodBase):
...
@@ -291,7 +292,10 @@ class Fp8LinearMethod(LinearMethodBase):
self
.
use_marlin
=
False
self
.
use_marlin
=
False
self
.
use_aiter_and_is_supported
=
rocm_aiter_ops
.
is_linear_fp8_enabled
()
self
.
use_aiter_and_is_supported
=
rocm_aiter_ops
.
is_linear_fp8_enabled
()
self
.
use_deep_gemm
=
is_deep_gemm_supported
()
if
self
.
quant_config
.
use_deep_gemm
is
not
None
:
self
.
use_deep_gemm
=
self
.
quant_config
.
use_deep_gemm
else
:
self
.
use_deep_gemm
=
is_deep_gemm_supported
()
self
.
weight_block_size
=
self
.
quant_config
.
weight_block_size
self
.
weight_block_size
=
self
.
quant_config
.
weight_block_size
self
.
block_quant
=
self
.
weight_block_size
is
not
None
self
.
block_quant
=
self
.
weight_block_size
is
not
None
...
@@ -305,6 +309,7 @@ class Fp8LinearMethod(LinearMethodBase):
...
@@ -305,6 +309,7 @@ class Fp8LinearMethod(LinearMethodBase):
act_quant_group_shape
=
GroupShape
(
1
,
self
.
weight_block_size
[
0
]),
act_quant_group_shape
=
GroupShape
(
1
,
self
.
weight_block_size
[
0
]),
cutlass_block_fp8_supported
=
self
.
cutlass_block_fp8_supported
,
cutlass_block_fp8_supported
=
self
.
cutlass_block_fp8_supported
,
use_aiter_and_is_supported
=
self
.
use_aiter_and_is_supported
,
use_aiter_and_is_supported
=
self
.
use_aiter_and_is_supported
,
use_deep_gemm
=
self
.
use_deep_gemm
,
)
)
else
:
else
:
# Use per-token quantization for better perf if dynamic and cutlass
# Use per-token quantization for better perf if dynamic and cutlass
...
@@ -440,7 +445,7 @@ class Fp8LinearMethod(LinearMethodBase):
...
@@ -440,7 +445,7 @@ class Fp8LinearMethod(LinearMethodBase):
del
layer
.
input_scale
del
layer
.
input_scale
return
return
if
self
.
block_quant
:
if
self
.
block_quant
and
self
.
use_deep_gemm
:
maybe_post_process_fp8_weight_block
(
layer
)
maybe_post_process_fp8_weight_block
(
layer
)
def
apply
(
def
apply
(
...
...
vllm/model_executor/layers/quantization/input_quant_fp8.py
View file @
61b1a9d8
...
@@ -91,6 +91,7 @@ class QuantFP8(CustomOp):
...
@@ -91,6 +91,7 @@ class QuantFP8(CustomOp):
if
(
if
(
self
.
is_group_quant
self
.
is_group_quant
and
self
.
use_ue8m0
and
self
.
use_deep_gemm_supported
and
self
.
use_deep_gemm_supported
and
(
DeepGemmQuantScaleFMT
.
from_oracle
()
==
DeepGemmQuantScaleFMT
.
UE8M0
)
and
(
DeepGemmQuantScaleFMT
.
from_oracle
()
==
DeepGemmQuantScaleFMT
.
UE8M0
)
):
):
...
...
vllm/model_executor/layers/quantization/utils/fp8_utils.py
View file @
61b1a9d8
...
@@ -356,10 +356,14 @@ class W8A8BlockFp8LinearOp:
...
@@ -356,10 +356,14 @@ class W8A8BlockFp8LinearOp:
act_quant_group_shape
:
GroupShape
,
act_quant_group_shape
:
GroupShape
,
cutlass_block_fp8_supported
:
bool
=
CUTLASS_BLOCK_FP8_SUPPORTED
,
cutlass_block_fp8_supported
:
bool
=
CUTLASS_BLOCK_FP8_SUPPORTED
,
use_aiter_and_is_supported
:
bool
=
False
,
use_aiter_and_is_supported
:
bool
=
False
,
use_deep_gemm
:
bool
|
None
=
None
,
):
):
self
.
weight_group_shape
=
weight_group_shape
self
.
weight_group_shape
=
weight_group_shape
self
.
act_quant_group_shape
=
act_quant_group_shape
self
.
act_quant_group_shape
=
act_quant_group_shape
self
.
is_deep_gemm_supported
=
is_deep_gemm_supported
()
if
use_deep_gemm
is
not
None
:
self
.
is_deep_gemm_supported
=
use_deep_gemm
else
:
self
.
is_deep_gemm_supported
=
is_deep_gemm_supported
()
self
.
is_hopper
=
current_platform
.
is_device_capability
(
90
)
self
.
is_hopper
=
current_platform
.
is_device_capability
(
90
)
self
.
use_deep_gemm_e8m0
=
is_deep_gemm_e8m0_used
()
self
.
use_deep_gemm_e8m0
=
is_deep_gemm_e8m0_used
()
self
.
is_flashinfer_supported
=
is_flashinfer_fp8_blockscale_gemm_supported
()
self
.
is_flashinfer_supported
=
is_flashinfer_fp8_blockscale_gemm_supported
()
...
...
vllm/utils/deep_gemm.py
View file @
61b1a9d8
...
@@ -23,6 +23,24 @@ from vllm.platforms import current_platform
...
@@ -23,6 +23,24 @@ from vllm.platforms import current_platform
from
vllm.utils.import_utils
import
has_deep_gemm
from
vllm.utils.import_utils
import
has_deep_gemm
from
vllm.utils.math_utils
import
cdiv
from
vllm.utils.math_utils
import
cdiv
_DEEPGEMM_BLACKWELL_EXCLUDED_MODEL_TYPES
:
set
[
str
]
=
{
"qwen3_5_text"
,
"qwen3_5_moe_text"
,
}
def
should_auto_disable_deep_gemm
(
model_type
:
str
|
None
)
->
bool
:
"""Check if DeepGemm should be auto-disabled for this model on Blackwell.
Returns True if the model is known to have accuracy degradation with
DeepGemm's E8M0 scale format on Blackwell GPUs (SM100+).
"""
if
model_type
is
None
:
return
False
if
not
current_platform
.
is_device_capability_family
(
100
):
return
False
return
model_type
in
_DEEPGEMM_BLACKWELL_EXCLUDED_MODEL_TYPES
class
DeepGemmQuantScaleFMT
(
Enum
):
class
DeepGemmQuantScaleFMT
(
Enum
):
# Float32 scales in Float32 tensor
# Float32 scales in Float32 tensor
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment