"vllm/model_executor/models/mistral.py" did not exist on "707ec647bb3a5018e5d8cdded409d6244bbb9ea5"
Unverified commit 3ed5231c, authored by Shengqi Chen, committed by GitHub
Browse files

[Build] Switch default CUDA to 13.0, update CUDA architecture lists, clean up...


[Build] Switch default CUDA to 13.0, update CUDA architecture lists, clean up stale build-args (#39878)
Signed-off-by: Shengqi Chen <harry-chen@outlook.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
parent 9c2492e5
This diff is collapsed.
......@@ -13,12 +13,12 @@ ROCM_BASE_CACHE_KEY=$(.buildkite/scripts/cache-rocm-base-wheels.sh key)
buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
To download the wheel (by commit):
\`\`\`
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux_2_31_x86_64.whl .
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux_2_31_aarch64.whl .
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux_2_35_x86_64.whl .
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux_2_35_aarch64.whl .
(Optional) For CUDA 13.0:
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux_2_35_x86_64.whl .
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu130-cp38-abi3-manylinux_2_35_aarch64.whl .
(Optional) For CUDA 12.9:
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux_2_31_x86_64.whl .
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux_2_31_aarch64.whl .
(Optional) For CPU:
aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cpu-cp38-abi3-manylinux_2_35_x86_64.whl .
......@@ -33,8 +33,8 @@ To download and upload the image:
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu129
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu129
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${ROCM_BASE_CACHE_KEY}-rocm-base
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
docker pull public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION}
......@@ -50,11 +50,11 @@ docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
docker push vllm/vllm-openai:latest-x86_64
docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130 vllm/vllm-openai:x86_64-cu130
docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:latest-x86_64-cu130
docker tag vllm/vllm-openai:x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130
docker push vllm/vllm-openai:latest-x86_64-cu130
docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu129 vllm/vllm-openai:x86_64-cu129
docker tag vllm/vllm-openai:x86_64-cu129 vllm/vllm-openai:latest-x86_64-cu129
docker tag vllm/vllm-openai:x86_64-cu129 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu129
docker push vllm/vllm-openai:latest-x86_64-cu129
docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu129
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
......@@ -62,11 +62,11 @@ docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker push vllm/vllm-openai:latest-aarch64
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130 vllm/vllm-openai:aarch64-cu130
docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:latest-aarch64-cu130
docker tag vllm/vllm-openai:aarch64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
docker push vllm/vllm-openai:latest-aarch64-cu130
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu129 vllm/vllm-openai:aarch64-cu129
docker tag vllm/vllm-openai:aarch64-cu129 vllm/vllm-openai:latest-aarch64-cu129
docker tag vllm/vllm-openai:aarch64-cu129 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu129
docker push vllm/vllm-openai:latest-aarch64-cu129
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu129
## ROCm
......@@ -104,11 +104,11 @@ docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${
docker manifest push vllm/vllm-openai:latest
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
docker manifest rm vllm/vllm-openai:latest-cu130
docker manifest create vllm/vllm-openai:latest-cu130 vllm/vllm-openai:latest-x86_64-cu130 vllm/vllm-openai:latest-aarch64-cu130
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION}-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu130 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu130
docker manifest push vllm/vllm-openai:latest-cu130
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}-cu130
docker manifest rm vllm/vllm-openai:latest-cu129
docker manifest create vllm/vllm-openai:latest-cu129 vllm/vllm-openai:latest-x86_64-cu129 vllm/vllm-openai:latest-aarch64-cu129
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION}-cu129 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64-cu129 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64-cu129
docker manifest push vllm/vllm-openai:latest-cu129
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}-cu129
docker manifest rm vllm/vllm-openai-cpu:latest || true
docker manifest create vllm/vllm-openai-cpu:latest vllm/vllm-openai-cpu:latest-x86_64 vllm/vllm-openai-cpu:latest-arm64
......
......@@ -29,7 +29,7 @@ if python3 -c "import torch; assert torch.version.hip" 2>/dev/null; then
TORCH_INDEX_URL=""
fi
else
TORCH_INDEX_URL="https://download.pytorch.org/whl/cu129"
TORCH_INDEX_URL="https://download.pytorch.org/whl/cu130"
fi
echo ">>> Using PyTorch index: ${TORCH_INDEX_URL:-PyPI default}"
......
......@@ -9,7 +9,7 @@ set -ex
BUCKET="vllm-wheels"
INDICES_OUTPUT_DIR="indices"
DEFAULT_VARIANT_ALIAS="cu129" # align with vLLM_MAIN_CUDA_VERSION in vllm/envs.py
DEFAULT_VARIANT_ALIAS="cu130" # align with vLLM_MAIN_CUDA_VERSION in vllm/envs.py
PYTHON="${PYTHON_PROG:-python3}" # try to read from env var, otherwise use python3
SUBPATH=$BUILDKITE_COMMIT
S3_COMMIT_PREFIX="s3://$BUCKET/$SUBPATH/"
......
......@@ -14,7 +14,7 @@ $python_executable -m pip install -r requirements/build/cuda.txt -r requirements
# Limit the number of parallel jobs to avoid OOM
export MAX_JOBS=1
# Make sure release wheels are built for the following architectures
export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
export TORCH_CUDA_ARCH_LIST="7.5 8.0 8.6 8.9 9.0 10.0 12.0+PTX"
bash tools/check_repo.sh
......
......@@ -94,12 +94,15 @@ find_package(Torch REQUIRED)
# This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined
if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
set(CUDA_SUPPORTED_ARCHS "7.5;8.0;8.6;8.7;8.9;9.0;10.0;11.0;12.0;12.1")
# starting from CUDA 12.9 and Blackwell (10.0), we use family-specific targets (10.0f, 12.0f, etc)
# to support the whole generation without specifying all sub-architectures
# see: https://developer.nvidia.com/blog/nvidia-blackwell-and-nvidia-cuda-12-9-introduce-family-specific-architecture-features/
set(CUDA_SUPPORTED_ARCHS "7.5;8.0;8.6;8.7;8.9;9.0;10.0;11.0;12.0")
elseif(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0;12.1")
set(CUDA_SUPPORTED_ARCHS "7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;10.3;12.0;12.1")
else()
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0")
set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.7;8.9;9.0")
endif()
#
......
......@@ -188,7 +188,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
# Explicitly set the list to avoid issues with torch 2.2
# See https://github.com/pytorch/pytorch/pull/123243
# From versions.json: .torch.cuda_arch_list
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0 12.0'
ARG torch_cuda_arch_list='7.5 8.0 8.6 8.9 9.0 10.0 12.0+PTX'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
#################### BUILD BASE IMAGE ####################
......@@ -765,7 +765,7 @@ ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
ENV UV_HTTP_TIMEOUT=500
# install kv_connectors if requested
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0 12.0'
ARG torch_cuda_arch_list='7.5 8.0 8.6 8.9 9.0 10.0 12.0+PTX'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=requirements/kv_connectors.txt,target=/tmp/kv_connectors.txt,ro \
......
......@@ -88,7 +88,6 @@ target "test-ubuntu2404" {
args = {
UBUNTU_VERSION = "24.04"
GDRCOPY_OS_VERSION = "Ubuntu24_04"
FLASHINFER_AOT_COMPILE = "true"
}
output = ["type=docker"]
}
......@@ -100,7 +99,6 @@ target "openai-ubuntu2404" {
args = {
UBUNTU_VERSION = "24.04"
GDRCOPY_OS_VERSION = "Ubuntu24_04"
FLASHINFER_AOT_COMPILE = "true"
}
output = ["type=docker"]
}
......@@ -32,7 +32,7 @@
"default": "false"
},
"TORCH_CUDA_ARCH_LIST": {
"default": "7.0 7.5 8.0 8.9 9.0 10.0 12.0"
"default": "7.5 8.0 8.6 8.9 9.0 10.0 12.0+PTX"
},
"MAX_JOBS": {
"default": "2"
......
......@@ -79,7 +79,7 @@ if TYPE_CHECKING:
VLLM_MEDIA_CONNECTOR: str = "http"
VLLM_MM_HASHER_ALGORITHM: str = "blake3"
VLLM_TARGET_DEVICE: str = "cuda"
VLLM_MAIN_CUDA_VERSION: str = "12.9"
VLLM_MAIN_CUDA_VERSION: str = "13.0"
VLLM_FLOAT32_MATMUL_PRECISION: Literal["highest", "high", "medium"] = "highest"
VLLM_BATCH_INVARIANT: bool = False
MAX_JOBS: str | None = None
......@@ -493,7 +493,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_TARGET_DEVICE": lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda").lower(),
# Main CUDA version of vLLM. This follows PyTorch but can be overridden.
"VLLM_MAIN_CUDA_VERSION": lambda: (
os.getenv("VLLM_MAIN_CUDA_VERSION", "").lower() or "12.9"
os.getenv("VLLM_MAIN_CUDA_VERSION", "").lower() or "13.0"
),
# Controls PyTorch float32 matmul precision mode within vLLM workers.
# Valid options mirror torch.set_float32_matmul_precision
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment