feat: vllm sglang cuda13 main (#5218)

Signed-off-by: Anant Sharma <anants@nvidia.com> Signed-off-by: Dmitry Tokarev <dtokarev@nvidia.com> Co-authored-by: Anant Sharma <anants@nvidia.com> Co-authored-by: Dillon Cullinan <dcullinan@nvidia.com>

feat: vllm sglang cuda13 main (#5218)
Signed-off-by: Anant Sharma <anants@nvidia.com> Signed-off-by: Dmitry Tokarev <dtokarev@nvidia.com> Co-authored-by: Anant Sharma <anants@nvidia.com> Co-authored-by: Dillon Cullinan <dcullinan@nvidia.com>
4a2235d1 · Dmitry Tokarev · GitHub · 9bb7f101 · 4a2235d1 · 4a2235d1
Unverified Commit 4a2235d1 authored Jan 09, 2026 by Dmitry Tokarev Committed by GitHub Jan 09, 2026
8 changed files
--- a/.github/actions/docker-build/action.yml
+++ b/.github/actions/docker-build/action.yml
@@ -42,7 +42,7 @@ inputs:
    required: false
  cuda_version:
    description: 'Optional override for CUDA_VERSION build-arg'
-    required: false
+    required: true
  torch_backend:
    description: 'Optional override for TORCH_BACKEND build-arg (e.g., cu129)'
    required: false
@@ -86,6 +86,7 @@ runs:
        GITHUB_RUN_ID: ${{ github.run_id }}
        GITHUB_JOB: ${{ github.job }}
        GITHUB_REF_NAME: ${{ github.ref_name }}
+        CUDA_VERSION: ${{ inputs.cuda_version }}
      run: |
        set -x
        # Determine image tag
@@ -95,6 +96,8 @@ runs:
          IMAGE_TAG="${{ inputs.framework }}:latest"
        fi

+        CUDA_VERSION_MAJOR=${CUDA_VERSION%%.*}
+
        BUILD_START_TIME=$(date -u +%Y-%m-%dT%H:%M:%SZ)
        echo "BUILD_START_TIME=${BUILD_START_TIME}" >> $GITHUB_ENV

@@ -110,10 +113,10 @@ runs:
        # Set base cache args and set --cache-to if this is a main commit
        EXTRA_ARGS=""
        EXTRA_ARGS="--cache-to type=inline "
-        EXTRA_ARGS+="--cache-from type=registry,ref=${ECR_HOSTNAME}/ai-dynamo/dynamo:${{ inputs.framework }}-${PLATFORM##*/}-cache "
-        EXTRA_ARGS+="--cache-from type=registry,ref=${ECR_HOSTNAME}/ai-dynamo/dynamo:main-${{ inputs.framework }}-${PLATFORM##*/} "
+        EXTRA_ARGS+="--cache-from type=registry,ref=${ECR_HOSTNAME}/ai-dynamo/dynamo:${{ inputs.framework }}-cuda${CUDA_VERSION_MAJOR}-${PLATFORM##*/}-cache "
+        EXTRA_ARGS+="--cache-from type=registry,ref=${ECR_HOSTNAME}/ai-dynamo/dynamo:main-${{ inputs.framework }}-cuda${CUDA_VERSION_MAJOR}-${PLATFORM##*/} "
        if [[ "$GITHUB_REF_NAME" == "main" ]]; then
-          EXTRA_ARGS+="--cache-to type=registry,ref=${ECR_HOSTNAME}/ai-dynamo/dynamo:${{ inputs.framework }}-${PLATFORM##*/}-cache,mode=max "
+          EXTRA_ARGS+="--cache-to type=registry,ref=${ECR_HOSTNAME}/ai-dynamo/dynamo:${{ inputs.framework }}-cuda${CUDA_VERSION_MAJOR}-${PLATFORM##*/}-cache,mode=max "
        fi

        echo "$EXTRA_ARGS"

--- a/.github/actions/pytest/action.yml
+++ b/.github/actions/pytest/action.yml
@@ -122,7 +122,7 @@ runs:
            echo "🔍 Mypy type checking enabled"
            MYPY_FLAG="--mypy"
          fi
-          PYTEST_CMD="pytest -v --tb=short --basetemp=/tmp -o cache_dir=/tmp/.pytest_cache --junitxml=/workspace/test-results/${{ env.PYTEST_XML_FILE }} --durations=10 ${MYPY_FLAG} -m \"${{ inputs.pytest_marks }}\""
+          PYTEST_CMD="pytest --continue-on-collection-errors -v --tb=short --basetemp=/tmp -o cache_dir=/tmp/.pytest_cache --junitxml=/workspace/test-results/${{ env.PYTEST_XML_FILE }} --durations=10 ${MYPY_FLAG} -m \"${{ inputs.pytest_marks }}\""

          # Detect GPU availability and conditionally add GPU flags
          GPU_FLAGS=""

--- a/.github/workflows/container-validation-backends.yml
+++ b/.github/workflows/container-validation-backends.yml
@@ -49,7 +49,7 @@ jobs:

  backend-status-check:
    runs-on: ubuntu-latest
-    needs: [changed-files, vllm, sglang, trtllm, operator]
+    needs: [changed-files, vllm, sglang, trtllm, operator]  # THIS list determines blocking jobs
    if: always()
    steps:
      - name: "Check all dependent jobs"
@@ -132,6 +132,73 @@ jobs:
          azure_push: 'true'
          azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}

+  vllm-cuda-13:
+    needs: changed-files
+    if: needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.vllm == 'true'
+    strategy:
+      fail-fast: false
+      matrix:
+        platform:
+          - { arch: amd64, runner: gpu-l40-amd64 }
+          - { arch: arm64, runner: cpu-arm-r8g-4xlarge }
+    name: vllm-cuda-13 (${{ matrix.platform.arch }})
+    runs-on: ${{ matrix.platform.runner }}
+    steps:
+      - name: Output Node Name
+        shell: bash
+        run: |
+          echo ${K8S_NODE_NAME}
+      - name: Checkout code
+        uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0
+        with:
+          lfs: true
+      - name: Docker Login
+        uses: ./.github/actions/docker-login
+        with:
+          ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
+          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
+          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
+          azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
+          azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
+          azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
+      - name: Build Container
+        id: build-image
+        uses: ./.github/actions/docker-build
+        with:
+          framework: vllm
+          target: runtime
+          platform: 'linux/${{ matrix.platform.arch }}'
+          base_image_tag: '25.11-cuda13.0-devel-ubuntu24.04'
+          runtime_image_tag: '13.0.2-runtime-ubuntu24.04'
+          cuda_version: '13.0'
+          torch_backend: 'cu130'
+          ci_token: ${{ secrets.CI_TOKEN }}
+          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
+          sccache_s3_bucket: ${{ secrets.SCCACHE_S3_BUCKET }}
+          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
+          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+      - name: Docker Tag and Push
+        uses: ./.github/actions/docker-tag-push
+        with:
+          local_image: ${{ steps.build-image.outputs.image_tag }}
+          push_tags: ai-dynamo/dynamo:${{ github.sha }}-vllm-cuda13-${{ matrix.platform.arch }}
+          conditional_tag: ${{ github.ref_name == 'main' && format('ai-dynamo/dynamo:main-vllm-cuda13-{0}', matrix.platform.arch) || '' }}
+          aws_push: 'true'
+          azure_push: 'true'
+          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
+          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
+          azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
+      - name: Run tests
+        uses: ./.github/actions/pytest
+        with:
+          image_tag: ${{ steps.build-image.outputs.image_tag }}
+          # GH ARM runners have no GPUs, run CPU tests only on ARM
+          pytest_marks: ${{ matrix.platform.arch == 'amd64' && 'pre_merge and vllm and (gpu_0 or gpu_1)' || 'pre_merge and vllm and gpu_0' }}
+          framework: "vllm"
+          test_type: "pre_merge"
+          platform_arch: ${{ matrix.platform.arch }}
+
  vllm:
    needs: changed-files
    if: needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.vllm == 'true'
@@ -170,7 +237,7 @@ jobs:
          platform: 'linux/${{ matrix.platform.arch }}'
          base_image_tag: ${{ matrix.platform.arch == 'arm64' && '25.06-cuda12.9-devel-ubuntu24.04' || '' }}
          runtime_image_tag: ${{ matrix.platform.arch == 'arm64' && '12.9.0-runtime-ubuntu24.04' || '' }}
-          cuda_version: ${{ matrix.platform.arch == 'arm64' && '12.9' || '' }}
+          cuda_version: '12.9'
          torch_backend: ${{ matrix.platform.arch == 'arm64' && 'cu129' || '' }}
          ci_token: ${{ secrets.CI_TOKEN }}
          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
@@ -182,25 +249,90 @@ jobs:
        uses: ./.github/actions/docker-tag-push
        with:
          local_image: ${{ steps.build-image.outputs.image_tag }}
-          push_tags: ai-dynamo/dynamo:${{ github.sha }}-vllm-${{ matrix.platform.arch }}
-          conditional_tag: ${{ github.ref_name == 'main' && format('ai-dynamo/dynamo:main-vllm-{0}', matrix.platform.arch) || '' }}
+          push_tags: |
+            ai-dynamo/dynamo:${{ github.sha }}-vllm-${{ matrix.platform.arch }}
+            ai-dynamo/dynamo:${{ github.sha }}-vllm-cuda12-${{ matrix.platform.arch }}
+          conditional_tag: ${{ github.ref_name == 'main' && format('ai-dynamo/dynamo:main-vllm-cuda12-{0}', matrix.platform.arch) || '' }}
          aws_push: 'true'
          azure_push: 'true'
          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
          azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
      - name: Run tests
-        if: ${{ matrix.platform.arch != 'arm64' }}
        uses: ./.github/actions/pytest
        with:
          image_tag: ${{ steps.build-image.outputs.image_tag }}
-          pytest_marks: "pre_merge and vllm and (gpu_0 or gpu_1)"
+          # GH ARM runners have no GPUs, run CPU tests only on ARM
+          pytest_marks: ${{ matrix.platform.arch == 'amd64' && 'pre_merge and vllm and (gpu_0 or gpu_1)' || 'pre_merge and vllm and gpu_0' }}
          framework: "vllm"
          test_type: "pre_merge"
          platform_arch: ${{ matrix.platform.arch }}
          enable_mypy: 'true'
          hf_token: ${{ secrets.HF_TOKEN }}

+  sglang-cuda-13:
+    needs: changed-files
+    if: needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.sglang == 'true'
+    strategy:
+      fail-fast: false
+      matrix:
+        platform:
+          - { arch: amd64, runner: gpu-l40-amd64 }
+          - { arch: arm64, runner: cpu-arm-r8g-4xlarge }
+    name: sglang-cuda-13 (${{ matrix.platform.arch }})
+    runs-on: ${{ matrix.platform.runner }}
+    steps:
+      - name: Output Node Name
+        shell: bash
+        run: |
+          echo ${K8S_NODE_NAME}
+      - name: Checkout repository
+        uses: actions/checkout@08eba0b27e820071cde6df949e0beb9ba4906955  # v4.3.0
+      - name: Docker Login
+        uses: ./.github/actions/docker-login
+        with:
+          ngc_ci_access_token: ${{ secrets.NGC_CI_ACCESS_TOKEN }}
+          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
+          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
+          azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
+          azure_acr_user: ${{ secrets.AZURE_ACR_USER }}
+          azure_acr_password: ${{ secrets.AZURE_ACR_PASSWORD }}
+      - name: Build Container
+        id: build-image
+        uses: ./.github/actions/docker-build
+        with:
+          framework: sglang
+          target: runtime
+          platform: 'linux/${{ matrix.platform.arch }}'
+          cuda_version: '13.0'
+          torch_backend: 'cu130'
+          ci_token: ${{ secrets.CI_TOKEN }}
+          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
+          sccache_s3_bucket: ${{ secrets.SCCACHE_S3_BUCKET }}
+          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
+          aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+      - name: Docker Tag and Push
+        uses: ./.github/actions/docker-tag-push
+        with:
+          local_image: ${{ steps.build-image.outputs.image_tag }}
+          push_tags: ai-dynamo/dynamo:${{ github.sha }}-sglang-cuda13-${{ matrix.platform.arch }}
+          conditional_tag: ${{ github.ref_name == 'main' && format('ai-dynamo/dynamo:main-sglang-cuda13-{0}', matrix.platform.arch) || '' }}
+          aws_push: 'true'
+          azure_push: 'true'
+          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
+          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
+          azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
+      - name: Run tests
+        uses: ./.github/actions/pytest
+        with:
+          image_tag: ${{ steps.build-image.outputs.image_tag }}
+          # GH arm runners have no GPUs, testing arm containers on cpu only
+          pytest_marks: ${{ matrix.platform.arch == 'amd64' && 'pre_merge and sglang and (gpu_0 or gpu_1)' || 'pre_merge and sglang and gpu_0' }}
+          framework: "sglang"
+          test_type: "pre_merge"
+          platform_arch: ${{ matrix.platform.arch }}
+
  sglang:
    needs: changed-files
    if: needs.changed-files.outputs.core == 'true' || needs.changed-files.outputs.sglang == 'true'
@@ -235,6 +367,7 @@ jobs:
        uses: ./.github/actions/docker-build
        with:
          framework: sglang
+          cuda_version: '12.9'
          target: runtime
          platform: 'linux/${{ matrix.platform.arch }}'
          ci_token: ${{ secrets.CI_TOKEN }}
@@ -247,19 +380,21 @@ jobs:
        uses: ./.github/actions/docker-tag-push
        with:
          local_image: ${{ steps.build-image.outputs.image_tag }}
-          push_tags: ai-dynamo/dynamo:${{ github.sha }}-sglang-${{ matrix.platform.arch }}
-          conditional_tag: ${{ github.ref_name == 'main' && format('ai-dynamo/dynamo:main-sglang-{0}', matrix.platform.arch) || '' }}
+          push_tags: |
+            ai-dynamo/dynamo:${{ github.sha }}-sglang-${{ matrix.platform.arch }}
+            ai-dynamo/dynamo:${{ github.sha }}-sglang-cuda12-${{ matrix.platform.arch }}
+          conditional_tag: ${{ github.ref_name == 'main' && format('ai-dynamo/dynamo:main-sglang-cuda12-{0}', matrix.platform.arch) || '' }}
          aws_push: 'true'
          azure_push: 'true'
          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
          azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
      - name: Run tests
-        if: ${{ matrix.platform.arch != 'arm64' }}
        uses: ./.github/actions/pytest
        with:
          image_tag: ${{ steps.build-image.outputs.image_tag }}
-          pytest_marks: "pre_merge and sglang and (gpu_0 or gpu_1)"
+          # GH arm runners have no GPUs, testing arm containers on cpu only
+          pytest_marks: ${{ matrix.platform.arch == 'amd64' && 'pre_merge and sglang and (gpu_0 or gpu_1)' || 'pre_merge and sglang and gpu_0' }}
          framework: "sglang"
          test_type: "pre_merge"
          platform_arch: ${{ matrix.platform.arch }}
@@ -300,6 +435,7 @@ jobs:
        uses: ./.github/actions/docker-build
        with:
          framework: trtllm
+          cuda_version: '13.0'
          target: runtime
          platform: 'linux/${{ matrix.platform.arch }}'
          ci_token: ${{ secrets.CI_TOKEN }}
@@ -312,19 +448,25 @@ jobs:
        uses: ./.github/actions/docker-tag-push
        with:
          local_image: ${{ steps.build-image.outputs.image_tag }}
-          push_tags: ai-dynamo/dynamo:${{ github.sha }}-trtllm-${{ matrix.platform.arch }}
-          conditional_tag: ${{ github.ref_name == 'main' && format('ai-dynamo/dynamo:main-trtllm-{0}', matrix.platform.arch) || '' }}
+          push_tags: |
+            ai-dynamo/dynamo:${{ github.sha }}-trtllm-${{ matrix.platform.arch }}
+            ai-dynamo/dynamo:${{ github.sha }}-trtllm-cuda13-${{ matrix.platform.arch }}
+          conditional_tag: ${{ github.ref_name == 'main' && format('ai-dynamo/dynamo:main-trtllm-cuda13-{0}', matrix.platform.arch) || '' }}
          aws_push: 'true'
          azure_push: 'true'
          aws_account_id: ${{ secrets.AWS_ACCOUNT_ID }}
          aws_default_region: ${{ secrets.AWS_DEFAULT_REGION }}
          azure_acr_hostname: ${{ secrets.AZURE_ACR_HOSTNAME }}
      - name: Run tests
+        # TODO: enable testing on ARM when these tests stop failing on collection on machines without GPUs:
+        #  components/src/dynamo/trtllm/tests/test_trtllm_autodeploy.py
+        #  components/src/dynamo/trtllm/tests/test_trtllm_unit.py
        if: ${{ matrix.platform.arch != 'arm64' }}
        uses: ./.github/actions/pytest
        with:
          image_tag: ${{ steps.build-image.outputs.image_tag }}
-          pytest_marks: "pre_merge and trtllm and (gpu_0 or gpu_1)"
+          # GH arm runners have no GPUs, testing arm containers on cpu only
+          pytest_marks: ${{ matrix.platform.arch == 'amd64' && 'pre_merge and trtllm and (gpu_0 or gpu_1)' || 'pre_merge and trtllm and gpu_0' }}
          framework: "trtllm"
          test_type: "pre_merge"
          platform_arch: ${{ matrix.platform.arch }}

--- a/container/Dockerfile.sglang
+++ b/container/Dockerfile.sglang
@@ -22,7 +22,7 @@
 # DEVELOPMENT PATHS THAT MUST BE GROUP-WRITABLE (for virtualenv containers):
 #   /workspace            - Users create/modify project files
 #   /home/dynamo          - Users create config/cache files
-#   /home/dynamo/.local   - SGLang uses $HOME/.local/lib/python3.10/site-packages for pip install
+#   /home/dynamo/.local   - SGLang uses $HOME/.local/lib/python3.12/site-packages for pip install
 #
 # HOW TO ACHIEVE GROUP-WRITABLE PERMISSIONS:
 # 1. SHELL + /etc/profile.d - Login shell sources umask 002 globally for all RUN commands (775/664)
@@ -499,12 +499,15 @@ RUN --mount=type=bind,source=.,target=/mnt/local_src \
    pip install --break-system-packages --no-cache . && \
    # pip/uv bypasses umask when creating .egg-info files, but chmod -R is fast here (small directory)
    chmod -R g+w /workspace/benchmarks && \
-    # Install NVIDIA packages that are needed for DeepEP to work properly
-    # This is done in the upstream runtime image too, but we overrode these packages earlier
-    pip install --no-cache-dir --break-system-packages --force-reinstall --no-deps \
-        nvidia-nccl-cu12==2.28.3 \
-        nvidia-cudnn-cu12==9.16.0.29 \
-        nvidia-cutlass-dsl==4.3.0
+    CUDA_MAJOR=$(nvcc --version | egrep -o 'cuda_[0-9]+' | cut -d_ -f2) && \
+    if [ "$CUDA_MAJOR" = "12" ]; then \
+        # Install NVIDIA packages that are needed for DeepEP to work properly
+        # This is done in the upstream runtime image too, but these packages are overridden in earlier commands
+        pip install --no-cache-dir --break-system-packages --force-reinstall --no-deps \
+            nvidia-nccl-cu12==2.28.3 \
+            nvidia-cudnn-cu12==9.16.0.29 \
+            nvidia-cutlass-dsl==4.3.0; \
+    fi

 # Copy tests, deploy and components for CI with correct ownership
 # Pattern: COPY --chmod=775 <path>; chmod g+w <path> done later as root because COPY --chmod only affects <path>/*, not <path>

--- a/container/build.sh
+++ b/container/build.sh
@@ -118,8 +118,10 @@ NONE_BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04"

 SGLANG_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
 SGLANG_BASE_IMAGE_TAG="25.06-cuda12.9-devel-ubuntu24.04"
+SGLANG_BASE_IMAGE_TAG_CU13="25.11-cuda13.0-devel-ubuntu24.04"
 SGLANG_CUDA_VERSION="12.9.1"
-SGLANG_PYTHON_VERSION="3.10"
+SGLANG_CUDA_VERSION_CU13="13.0.1"
+SGLANG_RUNTIME_IMAGE_TAG_CU13="v0.5.7-cu130-runtime"

 # GAIE (Gateway API Inference Extension) configuration for frontend (required for EPP binary for frontend image)
 GAIE_REPO_URL="https://github.com/kubernetes-sigs/gateway-api-inference-extension.git"
@@ -420,6 +422,17 @@ get_options() {
            echo "INFO: Overriding base image tag for vLLM with CUDA 13: $BASE_IMAGE_TAG AND RUNTIME_IMAGE_TAG: $RUNTIME_IMAGE_TAG"
        fi

+
+        if [[ $FRAMEWORK == "SGLANG" ]] && [[ $CUDA_VERSION == "13."* ]]; then
+            BASE_IMAGE_TAG=$SGLANG_BASE_IMAGE_TAG_CU13
+            BUILD_ARGS+=" --build-arg BASE_IMAGE_TAG=${SGLANG_BASE_IMAGE_TAG_CU13} "
+            SGLANG_CUDA_VERSION="${SGLANG_CUDA_VERSION_CU13}"
+            RUNTIME_IMAGE_TAG="${SGLANG_RUNTIME_IMAGE_TAG_CU13}"
+            BUILD_ARGS+=" --build-arg RUNTIME_IMAGE_TAG=${RUNTIME_IMAGE_TAG} "
+            echo "INFO: Overriding base image tag for SGLang with CUDA 13: $BASE_IMAGE_TAG AND RUNTIME_IMAGE_TAG: $RUNTIME_IMAGE_TAG"
+        fi
+
+
        if [ -z "$BASE_IMAGE" ]; then
            error "ERROR: Framework $FRAMEWORK without BASE_IMAGE"
        fi
@@ -923,11 +936,11 @@ fi

 if [[ $FRAMEWORK == "SGLANG" ]]; then
    echo "Customizing Python, CUDA, and framework images for sglang images"
-    BUILD_ARGS+=" --build-arg PYTHON_VERSION=${SGLANG_PYTHON_VERSION}"
    BUILD_ARGS+=" --build-arg CUDA_VERSION=${SGLANG_CUDA_VERSION}"
-else
-    BUILD_ARGS+=" --build-arg PYTHON_VERSION=${PYTHON_VERSION}"
 fi
+
+BUILD_ARGS+=" --build-arg PYTHON_VERSION=${PYTHON_VERSION}"
+
 # Add sccache build arguments
 if [ "$USE_SCCACHE" = true ]; then
    BUILD_ARGS+=" --build-arg USE_SCCACHE=true"

--- a/container/deps/vllm/install_vllm.sh
+++ b/container/deps/vllm/install_vllm.sh
@@ -92,6 +92,13 @@ elif [ "$ARCH" = "aarch64" ]; then
    ARCH="arm64"
 fi

+# Set alternative CPU architecture naming
+if [ "$ARCH" = "amd64" ]; then
+    ALT_ARCH="x86_64"
+elif [ "$ARCH" = "arm64" ]; then
+    ALT_ARCH="aarch64"
+fi
+
 export MAX_JOBS=$MAX_JOBS
 export CUDA_HOME=/usr/local/cuda

@@ -128,8 +135,6 @@ cd $INSTALLATION_DIR
 git clone https://github.com/vllm-project/vllm.git vllm
 cd vllm
 git checkout $VLLM_REF
-# TODO: remove this cherry-pick when vllm is upgraded to > 0.12.0 (when the fix is shipped)
-git cherry-pick --no-commit 799804d140fc99ce3964648ba91aaa810cf28fef # nvshmem fix for CUDA 13.0
 echo "✓ vLLM repository cloned"


@@ -140,24 +145,15 @@ if [[ "$CUDA_VERSION_MAJOR" == "12" ]]; then
    uv pip install flashinfer-cubin==$FLASHINF_REF
    uv pip install flashinfer-jit-cache==$FLASHINF_REF --extra-index-url https://flashinfer.ai/whl/${TORCH_BACKEND}
 elif [[ "$CUDA_VERSION_MAJOR" == "13" ]]; then
-    if [ "$ARCH" = "amd64" ]; then
-        echo "Installing vLLM $VLLM_REF from GitHub since CUDA 13 x86_64 wheel is only present on GitHub..."
-        uv pip install \
-            --index-strategy=unsafe-best-match \
-            --extra-index-url https://download.pytorch.org/whl/${TORCH_BACKEND} \
-            https://github.com/vllm-project/vllm/releases/download/v${VLLM_VER}/vllm-${VLLM_VER}+${TORCH_BACKEND}-cp38-abi3-manylinux_2_31_x86_64.whl[flashinfer,runai] \
-            --torch-backend=${TORCH_BACKEND}
-        uv pip install flashinfer-cubin==$FLASHINF_REF
-        uv pip install flashinfer-jit-cache==$FLASHINF_REF --extra-index-url https://flashinfer.ai/whl/${TORCH_BACKEND}
-        echo "✓ vLLM installation completed"
-    else
-        echo "⚠ Skipping LMCache on ARM64 (compatibility issues, missing aarch64 wheels)"
-        echo "Building vLLM from source for ${ARCH} architecture..."
-        echo "Try to install specific PyTorch and other dependencies first"
-        uv pip install --index-strategy=unsafe-best-match --index https://download.pytorch.org/whl/ -r requirements/cuda.txt
-        uv pip install setuptools_scm # required to build vLLM from source
-        MAX_JOBS=${MAX_JOBS} uv pip install -v --no-build-isolation .
-    fi
+    echo "⚠ Skipping LMCache on CUDA 13 env since LMCache doesn't support CUDA 13 "
+    echo "Installing vLLM $VLLM_REF from GitHub since CUDA 13 x86_64 wheel is only present on GitHub..."
+    uv pip install \
+        --index-strategy=unsafe-best-match \
+        --extra-index-url https://download.pytorch.org/whl/${TORCH_BACKEND} \
+        https://github.com/vllm-project/vllm/releases/download/v${VLLM_VER}/vllm-${VLLM_VER}+${TORCH_BACKEND}-cp38-abi3-manylinux_2_35_${ALT_ARCH}.whl[flashinfer,runai] \
+        --torch-backend=${TORCH_BACKEND}
+    uv pip install flashinfer-cubin==$FLASHINF_REF
+    uv pip install flashinfer-jit-cache==$FLASHINF_REF --extra-index-url https://flashinfer.ai/whl/${TORCH_BACKEND}
 else
    echo "❌ Unsupported CUDA version for vLLM installation: ${CUDA_VERSION}"
    exit 1

--- a/tests/basic/test_cuda_version_consistency.py
+++ b/tests/basic/test_cuda_version_consistency.py
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Integration test to check CUDA major version consistency across various packages."""
+
+import re
+import subprocess
+
+import pytest
+
+# Mark this with every framework to test every container
+pytestmark = [
+    pytest.mark.gpu_0,
+    pytest.mark.integration,
+    pytest.mark.parallel,
+    pytest.mark.post_merge,
+    pytest.mark.pre_merge,
+    pytest.mark.sglang,
+    pytest.mark.trtllm,
+    pytest.mark.vllm,
+]
+
+# Easy to edit later:
+IGNORE_PIP_PREFIXES = ("cupy", "nixl")
+
+
+def sh(cmd: str) -> str:
+    """
+    Run command and return stdout only.
+    We intentionally drop stderr to avoid noisy tools (pip warnings, etc.).
+    """
+    p = subprocess.run(
+        ["bash", "-lc", f"{cmd} 2>/dev/null"],
+        stdout=subprocess.PIPE,
+        text=True,
+        check=False,
+    )
+    return (p.stdout or "").strip()
+
+
+def major_from_text(text: str) -> int | None:
+    """Extract CUDA major (12 or 13) from arbitrary text; otherwise None."""
+    if not text:
+        return None
+
+    # fmt: off
+    pats = [
+        r"\bCUDA_VERSION=(1[23])\.",          # CUDA_VERSION=13.0.2
+        r"\bNV_CUDA_.*?_VERSION=(1[23])\.",   # NV_CUDA_CUDART_VERSION=13.0...
+        r"\+cuda(1[23])\.",                   # ...+cuda13.0
+        r"\bcuda\s*>=\s*(1[23])\.",           # cuda>=13.0 ...
+        r"\brelease\s+(1[23])\.",             # nvcc: release 13.0
+        r"-(1[23])-\d\b",                     # dpkg: ...-13-0
+        r"\bcuda(1[23])x\b",                  # cupy-cuda12x (from name)
+        r"[-+]cu(1[23])",                     # -cu13 or +cu13 in name
+    ]
+    # fmt: on
+    for pat in pats:
+        m = re.search(pat, text, flags=re.IGNORECASE)
+        if m:
+            maj = int(m.group(1))
+            if maj in (12, 13):
+                return maj
+    return None
+
+
+def pip_cuda_major_from_line(line: str) -> int | None:
+    """
+    Given a pip freeze line like 'torch==2.9.0+cu130', infer CUDA major.
+    Returns 12/13 or None.
+    """
+    return major_from_text(line)
+
+
+def keep_pip_line(line: str) -> bool:
+    """
+    Ignore some packages from pip signal (editable allow/ignore policy).
+    """
+    name = line.split("==", 1)[0].strip().lower()
+    return not name.startswith(IGNORE_PIP_PREFIXES)
+
+
+@pytest.mark.cuda
+def test_cuda_major_consistency() -> None:
+    """
+    Collect CUDA major versions (12/13) from predefined signals and assert consistency.
+    Prints a readable report with full relevant output when failing.
+    """
+
+    signals = [
+        ("env:CUDA_VERSION", "env | grep -i '^CUDA_VERSION='"),
+        ("env:NV_CUDA_CUDART_VERSION", "env | grep -i '^NV_CUDA_CUDART_VERSION='"),
+        ("env:NV_CUDA_LIB_VERSION", "env | grep -i '^NV_CUDA_LIB_VERSION='"),
+        ("env:NV_LIBNCCL_PACKAGE", "env | grep -i '^NV_LIBNCCL_PACKAGE='"),
+        ("env:NVIDIA_REQUIRE_CUDA", "env | grep -i '^NVIDIA_REQUIRE_CUDA='"),
+        ("nvcc", "nvcc --version | grep -i 'release' || nvcc --version"),
+        ("dpkg:cuda-*", "dpkg -l | grep -E '^(ii|hi)\\s+cuda-.*-(12|13)-'"),
+        (
+            "dpkg:libcublas/libnccl",
+            "dpkg -l | grep -E '^(ii|hi)\\s+lib(cublas|nccl).*-(12|13)-'",
+        ),
+        # pip signal: gather a targeted list, then infer majors per line (excluding ignored prefixes)
+        (
+            "pip:selected",
+            "python -m pip list --format=freeze | grep -Ei '(cuda|cudnn|nccl|nvshmem|\\+cu(12|13)[0-9]{2}|-cu(12|13)|^(torch|torchaudio|torchvision)==)'",
+        ),
+    ]
+
+    rows: list[tuple[str, int | None, list[str]]] = []
+
+    for label, cmd in signals:
+        out = sh(cmd)
+        lines = [ln.strip() for ln in out.splitlines() if ln.strip()]
+
+        if label.startswith("pip:"):
+            lines = [ln for ln in lines if keep_pip_line(ln)]
+            majors = {pip_cuda_major_from_line(ln) for ln in lines}
+            majors.discard(None)
+            maj = majors.pop() if len(majors) == 1 else None  # None if ambiguous/mixed
+            # If mixed, we’ll surface it via the global consistency check below.
+        else:
+            maj = major_from_text(out)
+
+        rows.append((label, maj, lines if lines else ["<no output>"]))
+
+    # Compute all detected majors across *all* signals, including per-line pip majors.
+    detected: list[int] = []
+    for label, maj, lines in rows:
+        if label.startswith("pip:"):
+            for ln in lines:
+                m = pip_cuda_major_from_line(ln)
+                if m is not None:
+                    detected.append(m)
+        else:
+            if maj is not None:
+                detected.append(maj)
+
+    if not detected:
+        pytest.skip("No CUDA major (12/13) detected from any signal.")
+
+    unique = sorted(set(detected))
+
+    # Build a readable multi-line report (no truncation to first line).
+    report = [
+        "CUDA major signals (pip ignores prefixes: "
+        + ", ".join(IGNORE_PIP_PREFIXES)
+        + "):"
+    ]
+    for label, maj, lines in rows:
+        maj_s = str(maj) if maj is not None else "-"
+        report.append(f"  {maj_s:>2}  {label}")
+        for ln in lines[:50]:  # keep it readable; adjust if you want more
+            report.append(f"      {ln}")
+        if len(lines) > 50:
+            report.append(f"      ... ({len(lines) - 50} more lines)")
+
+    assert len(unique) == 1, (
+        "\n".join(report) + f"\n\nInconsistent CUDA majors detected: {unique}"
+    )
--- a/tests/serve/test_vllm.py
+++ b/tests/serve/test_vllm.py
@@ -34,6 +34,12 @@ from tests.utils.payloads import LoraTestChatPayload, ToolCallingChatPayload
 logger = logging.getLogger(__name__)


+def _is_cuda13() -> bool:
+    v = os.environ.get("CUDA_VERSION", "")
+    # handles "13", "13.0", "13.0.1", etc.
+    return v.startswith("13")
+
+
 @dataclass
 class VLLMConfig(EngineConfig):
    """Configuration for vLLM test scenarios"""
@@ -111,6 +117,11 @@ vllm_configs = {
            pytest.mark.gpu_1,
            pytest.mark.pre_merge,
            pytest.mark.timeout(360),  # 3x estimated time (70s) + download time (150s)
+            pytest.mark.xfail(
+                _is_cuda13(),
+                reason="lmcache does not support CUDA 13 as of v0.3.11",
+                strict=False,
+            ),
        ],
        model="Qwen/Qwen3-0.6B",
        request_payloads=[
@@ -128,6 +139,11 @@ vllm_configs = {
            pytest.mark.gpu_1,
            pytest.mark.pre_merge,
            pytest.mark.timeout(360),  # 3x estimated time (70s) + download time (150s)
+            pytest.mark.xfail(
+                _is_cuda13(),
+                reason="lmcache does not support CUDA 13 as of v0.3.11",
+                strict=False,
+            ),
        ],
        model="Qwen/Qwen3-0.6B",
        env={