install_vllm.sh

#!/usr/bin/env bash
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# This script is used to install vLLM and its dependencies
# If installing vLLM from a release tag, we will use pip to manage the install
# Otherwise, we will use git to checkout the vLLM source code and build it from source.
# The dependencies are installed in the following order:
# 1. vLLM
# 2. LMCache
# 3. DeepGEMM
# 4. EP kernels

set -euo pipefail

VLLM_REF="v0.10.2"

# Basic Configurations
ARCH=$(uname -m)
MAX_JOBS=16
INSTALLATION_DIR=/tmp

# VLLM and Dependency Configurations
TORCH_BACKEND="cu128"
TORCH_CUDA_ARCH_LIST="9.0;10.0" # For EP Kernels
DEEPGEMM_REF=""
CUDA_VERSION="12.8" # For DEEPGEMM

# These flags are applicable when installing vLLM from source code
EDITABLE=true
VLLM_GIT_URL="https://github.com/vllm-project/vllm.git"
FLASHINF_REF="v0.3.0"

while [[ $# -gt 0 ]]; do
    case $1 in
        --editable)
            EDITABLE=true
            shift
            ;;
        --no-editable)
            EDITABLE=false
            shift
            ;;
        --vllm-ref)
            VLLM_REF="$2"
            shift 2
            ;;
        --vllm-git-url)
            VLLM_GIT_URL="$2"
            shift 2
            ;;
        --max-jobs)
            MAX_JOBS="$2"
            shift 2
            ;;
        --arch)
            ARCH="$2"
            shift 2
            ;;
        --installation-dir)
            INSTALLATION_DIR="$2"
            shift 2
            ;;
        --deepgemm-ref)
            DEEPGEMM_REF="$2"
            shift 2
            ;;
        --flashinf-ref)
            FLASHINF_REF="$2"
            shift 2
            ;;
        --torch-backend)
            TORCH_BACKEND="$2"
            shift 2
            ;;
        --torch-cuda-arch-list)
            TORCH_CUDA_ARCH_LIST="$2"
            shift 2
            ;;
        --cuda-version)
            CUDA_VERSION="$2"
            shift 2
            ;;
        -h|--help)
            echo "Usage: $0 [--editable|--no-editable] [--vllm-ref REF] [--max-jobs NUM] [--arch ARCH] [--deepgemm-ref REF] [--flashinf-ref REF] [--torch-backend BACKEND] [--torch-cuda-arch-list LIST] [--cuda-version VERSION]"
            echo "Options:"
            echo "  --editable        Install vllm in editable mode (default)"
            echo "  --no-editable     Install vllm in non-editable mode"
            echo "  --vllm-ref REF    Git reference to checkout (default: ${VLLM_REF})"
            echo "  --max-jobs NUM    Maximum number of parallel jobs (default: ${MAX_JOBS})"
            echo "  --arch ARCH       Architecture (amd64|arm64, default: auto-detect)"
            echo "  --installation-dir DIR  Directory to install vllm (default: ${INSTALLATION_DIR})"
            echo "  --deepgemm-ref REF  Git reference for DeepGEMM (default: ${DEEPGEMM_REF})"
            echo "  --flashinf-ref REF  Git reference for Flash Infer (default: ${FLASHINF_REF})"
            echo "  --torch-backend BACKEND  Torch backend to use (default: ${TORCH_BACKEND})"
            echo "  --torch-cuda-arch-list LIST  CUDA architectures to compile for (default: ${TORCH_CUDA_ARCH_LIST})"
            echo "  --cuda-version VERSION  CUDA version to use (default: ${CUDA_VERSION})"
            exit 0
            ;;
        *)
            echo "Unknown option: $1"
            exit 1
            ;;
    esac
done

# Convert x86_64 to amd64 for consistency with Docker ARG
if [ "$ARCH" = "x86_64" ]; then
    ARCH="amd64"
elif [ "$ARCH" = "aarch64" ]; then
    ARCH="arm64"
fi

export MAX_JOBS=$MAX_JOBS
export CUDA_HOME=/usr/local/cuda

echo "=== Installing prerequisites ==="
uv pip install pip cuda-python

echo "\n=== Configuration Summary ==="
echo "  VLLM_REF=$VLLM_REF | EDITABLE=$EDITABLE | ARCH=$ARCH"
echo "  MAX_JOBS=$MAX_JOBS | TORCH_BACKEND=$TORCH_BACKEND | CUDA_VERSION=$CUDA_VERSION"
echo "  TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST"
echo "  DEEPGEMM_REF=$DEEPGEMM_REF | FLASHINF_REF=$FLASHINF_REF"
echo "  INSTALLATION_DIR=$INSTALLATION_DIR | VLLM_GIT_URL=$VLLM_GIT_URL"

echo "\n=== Cloning vLLM repository ==="
# We need to clone to install dependencies
cd $INSTALLATION_DIR
git clone $VLLM_GIT_URL vllm
cd vllm
git checkout $VLLM_REF

# TODO remove in future vLLM release, re-instate ignore torch script
# https://github.com/vllm-project/vllm/pull/24729
GIT_COMMITTER_NAME="Container Build" GIT_COMMITTER_EMAIL="container@buildkitsandbox.local" git cherry-pick 740f064


echo "\n=== Installing vLLM & FlashInfer ==="

if [[ $VLLM_REF =~ ^v ]] && [ "$ARCH" = "amd64" ]; then
    # VLLM_REF starts with 'v' and amd64 - use pip install with version tag
    echo "Installing vLLM $VLLM_REF from PyPI..."

    uv pip install vllm[flashinfer]==$VLLM_REF --torch-backend=$TORCH_BACKEND

else
    # VLLM_REF does not start with 'v' or amd64 - use git checkout path
    if [ "$ARCH" = "arm64" ]; then

        # torch 2.8.0 doesn't have a aarch wheel for cu128, vLLM uses torch 2.8.0 nightly wheel builds to compile its aarch wheel against
        # nightly can be unstable so we will not use it here
        # for now we will use torch 2.7.1+cu128 but this requires a recompilation from source

        echo "Building vLLM from source for ARM64 architecture..."

        # Try to install specific PyTorch version first
        echo "Attempting to install pinned PyTorch nightly versions..."
        if ! uv pip install torch==2.7.1+cu128 torchaudio==2.7.1 torchvision==0.22.1 --index-url https://download.pytorch.org/whl/cu128; then
            echo "Pinned versions failed"
            exit 1
        fi

        # Create constraints file to pin all PyTorch-related versions
        echo "Creating constraints file to preserve PyTorch ecosystem versions..."
        TORCH_VERSION=$(python -c "import torch; print(torch.__version__)")
        TORCHAUDIO_VERSION=$(python -c "import torchaudio; print(torchaudio.__version__)")
        TORCHVISION_VERSION=$(python -c "import torchvision; print(torchvision.__version__)")

        rm -rf /tmp/torch_constraints.txt
        echo "torch==$TORCH_VERSION" > /tmp/torch_constraints.txt
        echo "torchaudio==$TORCHAUDIO_VERSION" >> /tmp/torch_constraints.txt
        echo "torchvision==$TORCHVISION_VERSION" >> /tmp/torch_constraints.txt

        echo "Pinned versions:"
        echo "  - torch==$TORCH_VERSION"
        echo "  - torchaudio==$TORCHAUDIO_VERSION"
        echo "  - torchvision==$TORCHVISION_VERSION"

        python use_existing_torch.py
        uv pip install -c /tmp/torch_constraints.txt -r requirements/build.txt

        if [ "$EDITABLE" = "true" ]; then
            MAX_JOBS=${MAX_JOBS} uv pip install --no-build-isolation -c /tmp/torch_constraints.txt -e . -v
        else
            MAX_JOBS=${MAX_JOBS} uv pip install --no-build-isolation -c /tmp/torch_constraints.txt . -v
        fi

        echo "\n=== Installing FlashInfer from source ==="
        cd $INSTALLATION_DIR
        git clone https://github.com/flashinfer-ai/flashinfer.git --recursive
        cd flashinfer
        git checkout $FLASHINF_REF

        # Install with constraints to prevent PyTorch upgrade
        uv pip install -v --no-build-isolation -c /tmp/torch_constraints.txt .

    else
        echo "Building vLLM from source for AMD64 architecture..."

        # When updating above VLLM_REF make sure precompiled wheel file URL is correct. Run this command:
        # aws s3 ls s3://vllm-wheels/${VLLM_REF}/ --region us-west-2 --no-sign-request
        export VLLM_PRECOMPILED_WHEEL_LOCATION="https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_REF}/vllm-0.10.2-cp38-abi3-manylinux1_x86_64.whl"

        if [ "$EDITABLE" = "true" ]; then
            uv pip install -e . --torch-backend=$TORCH_BACKEND
        else
            uv pip install . --torch-backend=$TORCH_BACKEND
        fi

        echo "\n=== Installing FlashInfer from PyPI ==="
        uv pip install flashinfer-python==$FLASHINF_REF

    fi
fi

echo "✓ vLLM installation completed"

echo "\n=== Installing LMCache ==="
if [ "$ARCH" = "amd64" ]; then
    # LMCache installation currently fails on arm64 due to CUDA dependency issues:
    # OSError: CUDA_HOME environment variable is not set. Please set it to your CUDA install root.
    # TODO: Re-enable for arm64 after verifying lmcache compatibility and resolving the build issue.

    # Alec: Likely lmcache was compiled witha different version of torch and need to install it from source for arm64
    uv pip install lmcache==0.3.3
    echo "✓ LMCache installed"
else
    echo "⚠ Skipping LMCache on ARM64 (compatibility issues)"
fi

echo "\n=== Installing DeepGEMM ==="
cd $INSTALLATION_DIR/vllm/tools

if [ -n "$DEEPGEMM_REF" ]; then
    bash install_deepgemm.sh --cuda-version "${CUDA_VERSION}" --ref "$DEEPGEMM_REF"
else
    bash install_deepgemm.sh --cuda-version "${CUDA_VERSION}"
fi
echo "✓ DeepGEMM installation completed"

echo "\n=== Installing EP Kernels (PPLX and DeepEP) ==="
cd ep_kernels/
TORCH_CUDA_ARCH_LIST="$TORCH_CUDA_ARCH_LIST" bash install_python_libraries.sh

echo "\n✅ All installations completed successfully!"