Unverified Commit 4a2235d1 authored by Dmitry Tokarev's avatar Dmitry Tokarev Committed by GitHub
Browse files

feat: vllm sglang cuda13 main (#5218)


Signed-off-by: Anant Sharma <anants@nvidia.com>
Signed-off-by: Dmitry Tokarev <dtokarev@nvidia.com>
Co-authored-by: Anant Sharma <anants@nvidia.com>
Co-authored-by: Dillon Cullinan <dcullinan@nvidia.com>
parent 9bb7f101
......@@ -42,7 +42,7 @@ inputs:
required: false
cuda_version:
description: 'Optional override for CUDA_VERSION build-arg'
required: false
required: true
torch_backend:
description: 'Optional override for TORCH_BACKEND build-arg (e.g., cu129)'
required: false
......@@ -86,6 +86,7 @@ runs:
GITHUB_RUN_ID: ${{ github.run_id }}
GITHUB_JOB: ${{ github.job }}
GITHUB_REF_NAME: ${{ github.ref_name }}
CUDA_VERSION: ${{ inputs.cuda_version }}
run: |
set -x
# Determine image tag
......@@ -95,6 +96,8 @@ runs:
IMAGE_TAG="${{ inputs.framework }}:latest"
fi
CUDA_VERSION_MAJOR=${CUDA_VERSION%%.*}
BUILD_START_TIME=$(date -u +%Y-%m-%dT%H:%M:%SZ)
echo "BUILD_START_TIME=${BUILD_START_TIME}" >> $GITHUB_ENV
......@@ -110,10 +113,10 @@ runs:
# Set base cache args and set --cache-to if this is a main commit
EXTRA_ARGS=""
EXTRA_ARGS="--cache-to type=inline "
EXTRA_ARGS+="--cache-from type=registry,ref=${ECR_HOSTNAME}/ai-dynamo/dynamo:${{ inputs.framework }}-${PLATFORM##*/}-cache "
EXTRA_ARGS+="--cache-from type=registry,ref=${ECR_HOSTNAME}/ai-dynamo/dynamo:main-${{ inputs.framework }}-${PLATFORM##*/} "
EXTRA_ARGS+="--cache-from type=registry,ref=${ECR_HOSTNAME}/ai-dynamo/dynamo:${{ inputs.framework }}-cuda${CUDA_VERSION_MAJOR}-${PLATFORM##*/}-cache "
EXTRA_ARGS+="--cache-from type=registry,ref=${ECR_HOSTNAME}/ai-dynamo/dynamo:main-${{ inputs.framework }}-cuda${CUDA_VERSION_MAJOR}-${PLATFORM##*/} "
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
EXTRA_ARGS+="--cache-to type=registry,ref=${ECR_HOSTNAME}/ai-dynamo/dynamo:${{ inputs.framework }}-${PLATFORM##*/}-cache,mode=max "
EXTRA_ARGS+="--cache-to type=registry,ref=${ECR_HOSTNAME}/ai-dynamo/dynamo:${{ inputs.framework }}-cuda${CUDA_VERSION_MAJOR}-${PLATFORM##*/}-cache,mode=max "
fi
echo "$EXTRA_ARGS"
......
......@@ -122,7 +122,7 @@ runs:
echo "🔍 Mypy type checking enabled"
MYPY_FLAG="--mypy"
fi
PYTEST_CMD="pytest -v --tb=short --basetemp=/tmp -o cache_dir=/tmp/.pytest_cache --junitxml=/workspace/test-results/${{ env.PYTEST_XML_FILE }} --durations=10 ${MYPY_FLAG} -m \"${{ inputs.pytest_marks }}\""
PYTEST_CMD="pytest --continue-on-collection-errors -v --tb=short --basetemp=/tmp -o cache_dir=/tmp/.pytest_cache --junitxml=/workspace/test-results/${{ env.PYTEST_XML_FILE }} --durations=10 ${MYPY_FLAG} -m \"${{ inputs.pytest_marks }}\""
# Detect GPU availability and conditionally add GPU flags
GPU_FLAGS=""
......
......@@ -49,7 +49,7 @@ jobs:
backend-status-check:
runs-on: ubuntu-latest
needs: [changed-files, vllm, sglang, trtllm, operator]
needs: [changed-files, vllm, sglang, trtllm, operator] # THIS list determines blocking jobs
if: always()
steps:
- name: "Check all dependent jobs"
......@@ -132,6 +132,73 @@ jobs:
azure_push: 'true'
azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
vllm-cuda-13:
  # Builds, pushes and tests the vLLM runtime container against CUDA 13.0
  # on both amd64 and arm64. Runs only when core or vllm paths changed.
  name: vllm-cuda-13 (${{ matrix.platform.arch }})
  needs: changed-files
  if: needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.vllm == 'true'
  runs-on: ${{ matrix.platform.runner }}
  strategy:
    # Let the other architecture finish even if one of them fails.
    fail-fast: false
    matrix:
      platform:
        - arch: amd64
          runner: gpu-l40-amd64
        - arch: arm64
          runner: cpu-arm-r8g-4xlarge
  steps:
    - name: Output Node Name
      shell: bash
      run: |
        echo ${K8S_NODE_NAME}

    - name: Checkout code
      uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0
      with:
        lfs: true

    - name: Docker Login
      uses: ./.github/actions/docker-login
      with:
        ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
        aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
        aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
        azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
        azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
        azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}

    - name: Build Container
      id: build-image
      uses: ./.github/actions/docker-build
      with:
        framework: 'vllm'
        target: 'runtime'
        platform: 'linux/${{ matrix.platform.arch }}'
        # CUDA 13 specific image tags and toolchain selection.
        base_image_tag: '25.11-cuda13.0-devel-ubuntu24.04'
        runtime_image_tag: '13.0.2-runtime-ubuntu24.04'
        cuda_version: '13.0'
        torch_backend: 'cu130'
        ci_token: ${{ secrets.CI_TOKEN }}
        aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
        sccache_s3_bucket: ${{ secrets.SCCACHE_S3_BUCKET }}
        aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
        aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
        aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}

    - name: Docker Tag and Push
      uses: ./.github/actions/docker-tag-push
      with:
        local_image: ${{ steps.build-image.outputs.image_tag }}
        push_tags: ai-dynamo/dynamo:${{ github.sha }}-vllm-cuda13-${{ matrix.platform.arch }}
        # Evaluates to the main-* tag on the main branch, otherwise to an empty string.
        conditional_tag: ${{ github.ref_name == 'main' && format('ai-dynamo/dynamo:main-vllm-cuda13-{0}', matrix.platform.arch) || '' }}
        aws_push: 'true'
        azure_push: 'true'
        aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
        aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
        azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}

    - name: Run tests
      uses: ./.github/actions/pytest
      with:
        image_tag: ${{ steps.build-image.outputs.image_tag }}
        # The ARM runners have no GPUs, so ARM is limited to the gpu_0 (CPU-only) tests;
        # amd64 also runs the gpu_1 tests.
        pytest_marks: ${{ matrix.platform.arch == 'amd64' && 'pre_merge and vllm and (gpu_0 or gpu_1)' || 'pre_merge and vllm and gpu_0' }}
        framework: 'vllm'
        test_type: 'pre_merge'
        platform_arch: ${{ matrix.platform.arch }}
vllm:
needs: changed-files
if: needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.vllm == 'true'
......@@ -170,7 +237,7 @@ jobs:
platform: 'linux/${{ matrix.platform.arch }}'
base_image_tag: ${{ matrix.platform.arch == 'arm64' && '25.06-cuda12.9-devel-ubuntu24.04' || '' }}
runtime_image_tag: ${{ matrix.platform.arch == 'arm64' && '12.9.0-runtime-ubuntu24.04' || '' }}
cuda_version: ${{ matrix.platform.arch == 'arm64' && '12.9' || '' }}
cuda_version: '12.9'
torch_backend: ${{ matrix.platform.arch == 'arm64' && 'cu129' || '' }}
ci_token: ${{ secrets.CI_TOKEN }}
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
......@@ -182,25 +249,90 @@ jobs:
uses: ./.github/actions/docker-tag-push
with:
local_image: ${{ steps.build-image.outputs.image_tag }}
push_tags: ai-dynamo/dynamo:${{ github.sha }}-vllm-${{ matrix.platform.arch }}
conditional_tag: ${{ github.ref_name == 'main' && format('ai-dynamo/dynamo:main-vllm-{0}', matrix.platform.arch) || '' }}
push_tags: |
ai-dynamo/dynamo:${{ github.sha }}-vllm-${{ matrix.platform.arch }}
ai-dynamo/dynamo:${{ github.sha }}-vllm-cuda12-${{ matrix.platform.arch }}
conditional_tag: ${{ github.ref_name == 'main' && format('ai-dynamo/dynamo:main-vllm-cuda12-{0}', matrix.platform.arch) || '' }}
aws_push: 'true'
azure_push: 'true'
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
- name: Run tests
if: ${{ matrix.platform.arch != 'arm64' }}
uses: ./.github/actions/pytest
with:
image_tag: ${{ steps.build-image.outputs.image_tag }}
pytest_marks: "pre_merge and vllm and (gpu_0 or gpu_1)"
# GH ARM runners have no GPUs, run CPU tests only on ARM
pytest_marks: ${{ matrix.platform.arch == 'amd64' && 'pre_merge and vllm and (gpu_0 or gpu_1)' || 'pre_merge and vllm and gpu_0' }}
framework: "vllm"
test_type: "pre_merge"
platform_arch: ${{ matrix.platform.arch }}
enable_mypy: 'true'
hf_token: ${{ secrets.HF_TOKEN }}
sglang-cuda-13:
  # Builds, pushes and tests the SGLang runtime container against CUDA 13.0
  # on both amd64 and arm64. Runs only when core or sglang paths changed.
  name: sglang-cuda-13 (${{ matrix.platform.arch }})
  needs: changed-files
  if: needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.sglang == 'true'
  runs-on: ${{ matrix.platform.runner }}
  strategy:
    # Let the other architecture finish even if one of them fails.
    fail-fast: false
    matrix:
      platform:
        - arch: amd64
          runner: gpu-l40-amd64
        - arch: arm64
          runner: cpu-arm-r8g-4xlarge
  steps:
    - name: Output Node Name
      shell: bash
      run: |
        echo ${K8S_NODE_NAME}

    - name: Checkout repository
      uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0

    - name: Docker Login
      uses: ./.github/actions/docker-login
      with:
        ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
        aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
        aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
        azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
        azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
        azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}

    - name: Build Container
      id: build-image
      uses: ./.github/actions/docker-build
      with:
        framework: 'sglang'
        target: 'runtime'
        platform: 'linux/${{ matrix.platform.arch }}'
        # Only the CUDA version and torch backend are passed here;
        # no base/runtime image tag overrides are supplied by this job.
        cuda_version: '13.0'
        torch_backend: 'cu130'
        ci_token: ${{ secrets.CI_TOKEN }}
        aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
        sccache_s3_bucket: ${{ secrets.SCCACHE_S3_BUCKET }}
        aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
        aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
        aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}

    - name: Docker Tag and Push
      uses: ./.github/actions/docker-tag-push
      with:
        local_image: ${{ steps.build-image.outputs.image_tag }}
        push_tags: ai-dynamo/dynamo:${{ github.sha }}-sglang-cuda13-${{ matrix.platform.arch }}
        # Evaluates to the main-* tag on the main branch, otherwise to an empty string.
        conditional_tag: ${{ github.ref_name == 'main' && format('ai-dynamo/dynamo:main-sglang-cuda13-{0}', matrix.platform.arch) || '' }}
        aws_push: 'true'
        azure_push: 'true'
        aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
        aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
        azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}

    - name: Run tests
      uses: ./.github/actions/pytest
      with:
        image_tag: ${{ steps.build-image.outputs.image_tag }}
        # The ARM runners have no GPUs, so ARM containers are tested on CPU only (gpu_0);
        # amd64 also runs the gpu_1 tests.
        pytest_marks: ${{ matrix.platform.arch == 'amd64' && 'pre_merge and sglang and (gpu_0 or gpu_1)' || 'pre_merge and sglang and gpu_0' }}
        framework: 'sglang'
        test_type: 'pre_merge'
        platform_arch: ${{ matrix.platform.arch }}
sglang:
needs: changed-files
if: needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.sglang == 'true'
......@@ -235,6 +367,7 @@ jobs:
uses: ./.github/actions/docker-build
with:
framework: sglang
cuda_version: '12.9'
target: runtime
platform: 'linux/${{ matrix.platform.arch }}'
ci_token: ${{ secrets.CI_TOKEN }}
......@@ -247,19 +380,21 @@ jobs:
uses: ./.github/actions/docker-tag-push
with:
local_image: ${{ steps.build-image.outputs.image_tag }}
push_tags: ai-dynamo/dynamo:${{ github.sha }}-sglang-${{ matrix.platform.arch }}
conditional_tag: ${{ github.ref_name == 'main' && format('ai-dynamo/dynamo:main-sglang-{0}', matrix.platform.arch) || '' }}
push_tags: |
ai-dynamo/dynamo:${{ github.sha }}-sglang-${{ matrix.platform.arch }}
ai-dynamo/dynamo:${{ github.sha }}-sglang-cuda12-${{ matrix.platform.arch }}
conditional_tag: ${{ github.ref_name == 'main' && format('ai-dynamo/dynamo:main-sglang-cuda12-{0}', matrix.platform.arch) || '' }}
aws_push: 'true'
azure_push: 'true'
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
- name: Run tests
if: ${{ matrix.platform.arch != 'arm64' }}
uses: ./.github/actions/pytest
with:
image_tag: ${{ steps.build-image.outputs.image_tag }}
pytest_marks: "pre_merge and sglang and (gpu_0 or gpu_1)"
# GH arm runners have no GPUs, testing arm containers on cpu only
pytest_marks: ${{ matrix.platform.arch == 'amd64' && 'pre_merge and sglang and (gpu_0 or gpu_1)' || 'pre_merge and sglang and gpu_0' }}
framework: "sglang"
test_type: "pre_merge"
platform_arch: ${{ matrix.platform.arch }}
......@@ -300,6 +435,7 @@ jobs:
uses: ./.github/actions/docker-build
with:
framework: trtllm
cuda_version: '13.0'
target: runtime
platform: 'linux/${{ matrix.platform.arch }}'
ci_token: ${{ secrets.CI_TOKEN }}
......@@ -312,19 +448,25 @@ jobs:
uses: ./.github/actions/docker-tag-push
with:
local_image: ${{ steps.build-image.outputs.image_tag }}
push_tags: ai-dynamo/dynamo:${{ github.sha }}-trtllm-${{ matrix.platform.arch }}
conditional_tag: ${{ github.ref_name == 'main' && format('ai-dynamo/dynamo:main-trtllm-{0}', matrix.platform.arch) || '' }}
push_tags: |
ai-dynamo/dynamo:${{ github.sha }}-trtllm-${{ matrix.platform.arch }}
ai-dynamo/dynamo:${{ github.sha }}-trtllm-cuda13-${{ matrix.platform.arch }}
conditional_tag: ${{ github.ref_name == 'main' && format('ai-dynamo/dynamo:main-trtllm-cuda13-{0}', matrix.platform.arch) || '' }}
aws_push: 'true'
azure_push: 'true'
aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
- name: Run tests
# TODO: enable testing on ARM when these tests stop failing on collection on machines without GPUs:
# components/src/dynamo/trtllm/tests/test_trtllm_autodeploy.py
# components/src/dynamo/trtllm/tests/test_trtllm_unit.py
if: ${{ matrix.platform.arch != 'arm64' }}
uses: ./.github/actions/pytest
with:
image_tag: ${{ steps.build-image.outputs.image_tag }}
pytest_marks: "pre_merge and trtllm and (gpu_0 or gpu_1)"
# GH arm runners have no GPUs, testing arm containers on cpu only
pytest_marks: ${{ matrix.platform.arch == 'amd64' && 'pre_merge and trtllm and (gpu_0 or gpu_1)' || 'pre_merge and trtllm and gpu_0' }}
framework: "trtllm"
test_type: "pre_merge"
platform_arch: ${{ matrix.platform.arch }}
......
......@@ -22,7 +22,7 @@
# DEVELOPMENT PATHS THAT MUST BE GROUP-WRITABLE (for virtualenv containers):
# /workspace - Users create/modify project files
# /home/dynamo - Users create config/cache files
# /home/dynamo/.local - SGLang uses $HOME/.local/lib/python3.10/site-packages for pip install
# /home/dynamo/.local - SGLang uses $HOME/.local/lib/python3.12/site-packages for pip install
#
# HOW TO ACHIEVE GROUP-WRITABLE PERMISSIONS:
# 1. SHELL + /etc/profile.d - Login shell sources umask 002 globally for all RUN commands (775/664)
......@@ -499,12 +499,15 @@ RUN --mount=type=bind,source=.,target=/mnt/local_src \
pip install --break-system-packages --no-cache . && \
# pip/uv bypasses umask when creating .egg-info files, but chmod -R is fast here (small directory)
chmod -R g+w /workspace/benchmarks && \
# Install NVIDIA packages that are needed for DeepEP to work properly
# This is done in the upstream runtime image too, but we overrode these packages earlier
pip install --no-cache-dir --break-system-packages --force-reinstall --no-deps \
nvidia-nccl-cu12==2.28.3 \
nvidia-cudnn-cu12==9.16.0.29 \
nvidia-cutlass-dsl==4.3.0
CUDA_MAJOR=$(nvcc --version | egrep -o 'cuda_[0-9]+' | cut -d_ -f2) && \
if [ "$CUDA_MAJOR" = "12" ]; then \
# Install NVIDIA packages that are needed for DeepEP to work properly
# This is done in the upstream runtime image too, but these packages are overridden in earlier commands
pip install --no-cache-dir --break-system-packages --force-reinstall --no-deps \
nvidia-nccl-cu12==2.28.3 \
nvidia-cudnn-cu12==9.16.0.29 \
nvidia-cutlass-dsl==4.3.0; \
fi
# Copy tests, deploy and components for CI with correct ownership
# Pattern: COPY --chmod=775 <path>; chmod g+w <path> done later as root because COPY --chmod only affects <path>/*, not <path>
......
......@@ -118,8 +118,10 @@ NONE_BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04"
SGLANG_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
SGLANG_BASE_IMAGE_TAG="25.06-cuda12.9-devel-ubuntu24.04"
SGLANG_BASE_IMAGE_TAG_CU13="25.11-cuda13.0-devel-ubuntu24.04"
SGLANG_CUDA_VERSION="12.9.1"
SGLANG_PYTHON_VERSION="3.10"
SGLANG_CUDA_VERSION_CU13="13.0.1"
SGLANG_RUNTIME_IMAGE_TAG_CU13="v0.5.7-cu130-runtime"
# GAIE (Gateway API Inference Extension) configuration for frontend (required for EPP binary for frontend image)
GAIE_REPO_URL="https://github.com/kubernetes-sigs/gateway-api-inference-extension.git"
......@@ -420,6 +422,17 @@ get_options() {
echo "INFO: Overriding base image tag for vLLM with CUDA 13: $BASE_IMAGE_TAG AND RUNTIME_IMAGE_TAG: $RUNTIME_IMAGE_TAG"
fi
if [[ $FRAMEWORK == "SGLANG" ]] && [[ $CUDA_VERSION == "13."* ]]; then
BASE_IMAGE_TAG=$SGLANG_BASE_IMAGE_TAG_CU13
BUILD_ARGS+=" --build-arg BASE_IMAGE_TAG=${SGLANG_BASE_IMAGE_TAG_CU13} "
SGLANG_CUDA_VERSION="${SGLANG_CUDA_VERSION_CU13}"
RUNTIME_IMAGE_TAG="${SGLANG_RUNTIME_IMAGE_TAG_CU13}"
BUILD_ARGS+=" --build-arg RUNTIME_IMAGE_TAG=${RUNTIME_IMAGE_TAG} "
echo "INFO: Overriding base image tag for SGLang with CUDA 13: $BASE_IMAGE_TAG AND RUNTIME_IMAGE_TAG: $RUNTIME_IMAGE_TAG"
fi
if [ -z "$BASE_IMAGE" ]; then
error "ERROR: Framework $FRAMEWORK without BASE_IMAGE"
fi
......@@ -923,11 +936,11 @@ fi
if [[ $FRAMEWORK == "SGLANG" ]]; then
echo "Customizing Python, CUDA, and framework images for sglang images"
BUILD_ARGS+=" --build-arg PYTHON_VERSION=${SGLANG_PYTHON_VERSION}"
BUILD_ARGS+=" --build-arg CUDA_VERSION=${SGLANG_CUDA_VERSION}"
else
BUILD_ARGS+=" --build-arg PYTHON_VERSION=${PYTHON_VERSION}"
fi
BUILD_ARGS+=" --build-arg PYTHON_VERSION=${PYTHON_VERSION}"
# Add sccache build arguments
if [ "$USE_SCCACHE" = true ]; then
BUILD_ARGS+=" --build-arg USE_SCCACHE=true"
......
......@@ -92,6 +92,13 @@ elif [ "$ARCH" = "aarch64" ]; then
ARCH="arm64"
fi
# Set alternative CPU architecture naming
if [ "$ARCH" = "amd64" ]; then
ALT_ARCH="x86_64"
elif [ "$ARCH" = "arm64" ]; then
ALT_ARCH="aarch64"
fi
export MAX_JOBS=$MAX_JOBS
export CUDA_HOME=/usr/local/cuda
......@@ -128,8 +135,6 @@ cd $INSTALLATION_DIR
git clone https://github.com/vllm-project/vllm.git vllm
cd vllm
git checkout $VLLM_REF
# TODO: remove this cherry-pick when vllm is upgraded to > 0.12.0 (when the fix is shipped)
git cherry-pick --no-commit 799804d140fc99ce3964648ba91aaa810cf28fef # nvshmem fix for CUDA 13.0
echo "✓ vLLM repository cloned"
......@@ -140,24 +145,15 @@ if [[ "$CUDA_VERSION_MAJOR" == "12" ]]; then
uv pip install flashinfer-cubin==$FLASHINF_REF
uv pip install flashinfer-jit-cache==$FLASHINF_REF --extra-index-url https://flashinfer.ai/whl/${TORCH_BACKEND}
elif [[ "$CUDA_VERSION_MAJOR" == "13" ]]; then
if [ "$ARCH" = "amd64" ]; then
echo "Installing vLLM $VLLM_REF from GitHub since CUDA 13 x86_64 wheel is only present on GitHub..."
uv pip install \
--index-strategy=unsafe-best-match \
--extra-index-url https://download.pytorch.org/whl/${TORCH_BACKEND} \
https://github.com/vllm-project/vllm/releases/download/v${VLLM_VER}/vllm-${VLLM_VER}+${TORCH_BACKEND}-cp38-abi3-manylinux_2_31_x86_64.whl[flashinfer,runai] \
--torch-backend=${TORCH_BACKEND}
uv pip install flashinfer-cubin==$FLASHINF_REF
uv pip install flashinfer-jit-cache==$FLASHINF_REF --extra-index-url https://flashinfer.ai/whl/${TORCH_BACKEND}
echo "✓ vLLM installation completed"
else
echo "⚠ Skipping LMCache on ARM64 (compatibility issues, missing aarch64 wheels)"
echo "Building vLLM from source for ${ARCH} architecture..."
echo "Try to install specific PyTorch and other dependencies first"
uv pip install --index-strategy=unsafe-best-match --index https://download.pytorch.org/whl/ -r requirements/cuda.txt
uv pip install setuptools_scm # required to build vLLM from source
MAX_JOBS=${MAX_JOBS} uv pip install -v --no-build-isolation .
fi
echo "⚠ Skipping LMCache on CUDA 13 env since LMCache doesn't support CUDA 13 "
echo "Installing vLLM $VLLM_REF from GitHub since CUDA 13 x86_64 wheel is only present on GitHub..."
uv pip install \
--index-strategy=unsafe-best-match \
--extra-index-url https://download.pytorch.org/whl/${TORCH_BACKEND} \
https://github.com/vllm-project/vllm/releases/download/v${VLLM_VER}/vllm-${VLLM_VER}+${TORCH_BACKEND}-cp38-abi3-manylinux_2_35_${ALT_ARCH}.whl[flashinfer,runai] \
--torch-backend=${TORCH_BACKEND}
uv pip install flashinfer-cubin==$FLASHINF_REF
uv pip install flashinfer-jit-cache==$FLASHINF_REF --extra-index-url https://flashinfer.ai/whl/${TORCH_BACKEND}
else
echo "❌ Unsupported CUDA version for vLLM installation: ${CUDA_VERSION}"
exit 1
......
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Integration test to check CUDA major version consistency across various packages."""
import re
import subprocess
import pytest
# Module-level marker list: pytest applies every mark below to each test in this
# module. All three framework marks (sglang, trtllm, vllm) are listed so the
# check is selected when testing any of the framework containers.
pytestmark = [
pytest.mark.gpu_0,
pytest.mark.integration,
pytest.mark.parallel,
pytest.mark.post_merge,
pytest.mark.pre_merge,
pytest.mark.sglang,
pytest.mark.trtllm,
pytest.mark.vllm,
]
# Lower-case package-name prefixes that keep_pip_line() drops from the pip
# signal. Kept as a module constant so the ignore policy is easy to edit later.
IGNORE_PIP_PREFIXES = ("cupy", "nixl")
def sh(cmd: str) -> str:
    """
    Execute ``cmd`` through a bash login shell and return its trimmed stdout.

    stderr is sent to /dev/null by appending a redirect to the command, which
    keeps noisy tools (pip warnings, etc.) out of the result. The exit status
    is not checked, so a failing command simply yields whatever it printed
    (possibly an empty string).
    """
    completed = subprocess.run(
        ["bash", "-lc", f"{cmd} 2>/dev/null"],
        check=False,
        text=True,
        stdout=subprocess.PIPE,
    )
    captured = completed.stdout
    if not captured:
        return ""
    return captured.strip()
def major_from_text(text: str) -> int | None:
"""Extract CUDA major (12 or 13) from arbitrary text; otherwise None."""
if not text:
return None
# fmt: off
pats = [
r"\bCUDA_VERSION=(1[23])\.", # CUDA_VERSION=13.0.2
r"\bNV_CUDA_.*?_VERSION=(1[23])\.", # NV_CUDA_CUDART_VERSION=13.0...
r"\+cuda(1[23])\.", # ...+cuda13.0
r"\bcuda\s*>=\s*(1[23])\.", # cuda>=13.0 ...
r"\brelease\s+(1[23])\.", # nvcc: release 13.0
r"-(1[23])-\d\b", # dpkg: ...-13-0
r"\bcuda(1[23])x\b", # cupy-cuda12x (from name)
r"[-+]cu(1[23])", # -cu13 or +cu13 in name
]
# fmt: on
for pat in pats:
m = re.search(pat, text, flags=re.IGNORECASE)
if m:
maj = int(m.group(1))
if maj in (12, 13):
return maj
return None
def pip_cuda_major_from_line(line: str) -> int | None:
    """
    Infer the CUDA major for one pip freeze entry, e.g. 'torch==2.9.0+cu130'.

    Delegates to major_from_text(); yields 12, 13, or None when the line
    carries no recognisable CUDA marker.
    """
    inferred_major = major_from_text(line)
    return inferred_major
def keep_pip_line(line: str) -> bool:
    """
    Decide whether a pip freeze line takes part in the pip signal.

    The text before the first '==' is taken as the package name, stripped and
    lower-cased; lines whose name starts with any IGNORE_PIP_PREFIXES entry
    are rejected.
    """
    package, _, _ = line.partition("==")
    normalized = package.strip().lower()
    if normalized.startswith(IGNORE_PIP_PREFIXES):
        return False
    return True
@pytest.mark.cuda
def test_cuda_major_consistency() -> None:
    """
    Gather CUDA major versions (12/13) from a fixed set of probes and require
    that they all agree.

    Each probe is a shell command run through sh(). Non-pip probes contribute
    at most one major (first pattern match over the whole output); the pip
    probe contributes one major per retained line. The test is skipped when
    nothing is detected, and on failure the assertion message carries a
    per-probe report of the output that was inspected.
    """
    probes = [
        ("env:CUDA_VERSION", "env | grep -i '^CUDA_VERSION='"),
        ("env:NV_CUDA_CUDART_VERSION", "env | grep -i '^NV_CUDA_CUDART_VERSION='"),
        ("env:NV_CUDA_LIB_VERSION", "env | grep -i '^NV_CUDA_LIB_VERSION='"),
        ("env:NV_LIBNCCL_PACKAGE", "env | grep -i '^NV_LIBNCCL_PACKAGE='"),
        ("env:NVIDIA_REQUIRE_CUDA", "env | grep -i '^NVIDIA_REQUIRE_CUDA='"),
        ("nvcc", "nvcc --version | grep -i 'release' || nvcc --version"),
        ("dpkg:cuda-*", "dpkg -l | grep -E '^(ii|hi)\\s+cuda-.*-(12|13)-'"),
        (
            "dpkg:libcublas/libnccl",
            "dpkg -l | grep -E '^(ii|hi)\\s+lib(cublas|nccl).*-(12|13)-'",
        ),
        # pip probe: grab a targeted package list; majors are inferred line by
        # line after dropping the ignored prefixes.
        (
            "pip:selected",
            "python -m pip list --format=freeze | grep -Ei '(cuda|cudnn|nccl|nvshmem|\\+cu(12|13)[0-9]{2}|-cu(12|13)|^(torch|torchaudio|torchvision)==)'",
        ),
    ]

    rows: list[tuple[str, int | None, list[str]]] = []
    # Every major seen across all probes, in probe order (pip: one per line).
    detected: list[int] = []

    for label, cmd in probes:
        out = sh(cmd)
        lines = [s for s in (raw.strip() for raw in out.splitlines()) if s]

        if label.startswith("pip:"):
            lines = [ln for ln in lines if keep_pip_line(ln)]
            per_line = [pip_cuda_major_from_line(ln) for ln in lines]
            found = [m for m in per_line if m is not None]
            detected.extend(found)
            distinct = set(found)
            # Row-level major stays None when pip lines are ambiguous/mixed;
            # the mix still surfaces through `detected` below.
            maj = next(iter(distinct)) if len(distinct) == 1 else None
        else:
            maj = major_from_text(out)
            if maj is not None:
                detected.append(maj)

        rows.append((label, maj, lines or ["<no output>"]))

    if not detected:
        pytest.skip("No CUDA major (12/13) detected from any signal.")

    unique = sorted(set(detected))

    # Multi-line report; each probe shows at most `shown_limit` lines.
    shown_limit = 50
    report = [
        "CUDA major signals (pip ignores prefixes: "
        + ", ".join(IGNORE_PIP_PREFIXES)
        + "):"
    ]
    for label, maj, lines in rows:
        maj_s = "-" if maj is None else str(maj)
        report.append(f" {maj_s:>2} {label}")
        report.extend(f" {ln}" for ln in lines[:shown_limit])
        hidden = len(lines) - shown_limit
        if hidden > 0:
            report.append(f" ... ({hidden} more lines)")

    assert len(unique) == 1, (
        "\n".join(report) + f"\n\nInconsistent CUDA majors detected: {unique}"
    )
......@@ -34,6 +34,12 @@ from tests.utils.payloads import LoraTestChatPayload, ToolCallingChatPayload
logger = logging.getLogger(__name__)
def _is_cuda13() -> bool:
v = os.environ.get("CUDA_VERSION", "")
# handles "13", "13.0", "13.0.1", etc.
return v.startswith("13")
@dataclass
class VLLMConfig(EngineConfig):
"""Configuration for vLLM test scenarios"""
......@@ -111,6 +117,11 @@ vllm_configs = {
pytest.mark.gpu_1,
pytest.mark.pre_merge,
pytest.mark.timeout(360), # 3x estimated time (70s) + download time (150s)
pytest.mark.xfail(
_is_cuda13(),
reason="lmcache does not support CUDA 13 as of v0.3.11",
strict=False,
),
],
model="Qwen/Qwen3-0.6B",
request_payloads=[
......@@ -128,6 +139,11 @@ vllm_configs = {
pytest.mark.gpu_1,
pytest.mark.pre_merge,
pytest.mark.timeout(360), # 3x estimated time (70s) + download time (150s)
pytest.mark.xfail(
_is_cuda13(),
reason="lmcache does not support CUDA 13 as of v0.3.11",
strict=False,
),
],
model="Qwen/Qwen3-0.6B",
env={
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment