#!/usr/bin/env bash # SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # This script installs vLLM and its dependencies from PyPI (release versions only). # Installation order: # 1. LMCache (installed first so vLLM's dependencies take precedence) # 2. vLLM # 3. vLLM-Omni # 4. DeepGEMM # 5. EP kernels set -euo pipefail VLLM_VER="0.16.0" VLLM_REF="v${VLLM_VER}" DEVICE="cuda" # Basic Configurations ARCH=$(uname -m) MAX_JOBS=16 INSTALLATION_DIR=/tmp # VLLM and Dependency Configurations TORCH_CUDA_ARCH_LIST="9.0;10.0" # For EP Kernels -- TODO: check if we need to add 12.0+PTX DEEPGEMM_REF="" CUDA_VERSION="12.9" FLASHINF_REF="v0.6.3" LMCACHE_REF="0.3.14" VLLM_OMNI_REF="v0.16.0rc1" while [[ $# -gt 0 ]]; do case $1 in --device) DEVICE="$2" shift 2 ;; --vllm-ref) VLLM_REF="$2" shift 2 ;; --max-jobs) MAX_JOBS="$2" shift 2 ;; --arch) ARCH="$2" shift 2 ;; --installation-dir) INSTALLATION_DIR="$2" shift 2 ;; --deepgemm-ref) DEEPGEMM_REF="$2" shift 2 ;; --flashinf-ref) FLASHINF_REF="$2" shift 2 ;; --lmcache-ref) LMCACHE_REF="$2" shift 2 ;; --vllm-omni-ref) VLLM_OMNI_REF="$2" shift 2 ;; --torch-cuda-arch-list) TORCH_CUDA_ARCH_LIST="$2" shift 2 ;; --cuda-version) CUDA_VERSION="$2" shift 2 ;; -h|--help) echo "Usage: $0 [--device DEVICE] [--vllm-ref REF] [--max-jobs NUM] [--arch ARCH] [--deepgemm-ref REF] [--flashinf-ref REF] [--lmcache-ref REF] [--vllm-omni-ref REF] [--torch-cuda-arch-list LIST] [--cuda-version VERSION]" echo "Options:" echo " --device DEVICE Device Selection (default: cuda)" echo " --vllm-ref REF vLLM release version (default: ${VLLM_REF})" echo " --max-jobs NUM Maximum parallel jobs (default: ${MAX_JOBS})" echo " --arch ARCH Architecture amd64|arm64 (default: auto-detect)" echo " --installation-dir DIR Install directory (default: ${INSTALLATION_DIR})" echo " --deepgemm-ref REF DeepGEMM git ref (default: ${DEEPGEMM_REF})" echo " --flashinf-ref REF FlashInfer version (default: ${FLASHINF_REF})" echo " --lmcache-ref REF LMCache version (default: ${LMCACHE_REF})" echo " --vllm-omni-ref REF vLLM-Omni version (default: ${VLLM_OMNI_REF})" echo " --torch-cuda-arch-list LIST CUDA architectures (default: ${TORCH_CUDA_ARCH_LIST})" echo " --cuda-version VERSION CUDA version (default: ${CUDA_VERSION})" exit 0 ;; *) echo "Unknown option: $1" exit 1 ;; esac done # Convert x86_64 to amd64 for consistency with Docker ARG if [ "$ARCH" = "x86_64" ]; then ARCH="amd64" elif [ "$ARCH" = "aarch64" ]; then ARCH="arm64" fi # Set alternative CPU architecture naming if [ "$ARCH" = "amd64" ]; then ALT_ARCH="x86_64" elif [ "$ARCH" = "arm64" ]; then ALT_ARCH="aarch64" fi export MAX_JOBS=$MAX_JOBS if [ "$DEVICE" = "cuda" ]; then export CUDA_HOME=/usr/local/cuda # Derive torch backend from CUDA version (e.g., "12.9" -> "cu129") TORCH_BACKEND="cu$(echo $CUDA_VERSION | tr -d '.')" CUDA_VERSION_MAJOR=${CUDA_VERSION%%.*} echo "=== Installing prerequisites ===" uv pip install pip cuda-python fi if [ "$DEVICE" = "cuda" ]; then echo "\n=== Configuration Summary ===" echo " VLLM_REF=$VLLM_REF | ARCH=$ARCH | CUDA_VERSION=$CUDA_VERSION | TORCH_BACKEND=$TORCH_BACKEND" echo " TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST | INSTALLATION_DIR=$INSTALLATION_DIR" elif [ "$DEVICE" = "xpu" ]; then echo "\n=== Configuration Summary ===" echo " VLLM_REF=$VLLM_REF | ARCH=$ARCH | INSTALLATION_DIR=$INSTALLATION_DIR" fi if [ "$DEVICE" = "cuda" ]; then if [[ "$CUDA_VERSION_MAJOR" == "12" ]]; then echo " FLASHINF_REF=$FLASHINF_REF | LMCACHE_REF=$LMCACHE_REF | DEEPGEMM_REF=$DEEPGEMM_REF" echo "\n=== Installing LMCache ===" if [ "$ARCH" = "amd64" ]; then # LMCache installation currently fails on arm64 due to CUDA dependency issues # Install LMCache BEFORE vLLM so vLLM's dependencies take precedence uv pip install lmcache==${LMCACHE_REF} --torch-backend=${TORCH_BACKEND} echo "✓ LMCache ${LMCACHE_REF} installed" else echo "⚠ Skipping LMCache on ARM64 (compatibility issues)" fi else echo " FLASHINF_REF=$FLASHINF_REF | LMCache will not be installed as it doesn't support CUDA 13 yet | DEEPGEMM_REF=$DEEPGEMM_REF" fi elif [ "$DEVICE" = "xpu" ]; then echo " LMCACHE_REF=$LMCACHE_REF " echo "\n=== Installing LMCache ===" if [ "$ARCH" = "amd64" ]; then uv pip install lmcache==${LMCACHE_REF} echo "✓ LMCache ${LMCACHE_REF} installed" fi fi echo "\n=== Cloning vLLM repository ===" # Clone needed for DeepGEMM and EP kernels install scripts cd $INSTALLATION_DIR git clone https://github.com/vllm-project/vllm.git vllm cd vllm git checkout $VLLM_REF echo "✓ vLLM repository cloned" if [ "$DEVICE" = "xpu" ]; then echo "\n=== Installing vLLM ===" git apply --ignore-whitespace /tmp/vllm-xpu.patch uv pip install -r requirements/xpu.txt --index-strategy unsafe-best-match uv pip install --verbose --no-build-isolation . fi if [ "$DEVICE" = "cuda" ]; then echo "\n=== Installing vLLM & FlashInfer ===" # Build GitHub release wheel URL per CUDA version # CUDA 12 wheels have no +cu suffix and use manylinux_2_31 # CUDA 13 wheels have +cu130 suffix and use manylinux_2_35 if [[ "$CUDA_VERSION_MAJOR" == "12" ]]; then VLLM_GITHUB_WHEEL="vllm-${VLLM_VER}-cp38-abi3-manylinux_2_31_${ALT_ARCH}.whl" EXTRA_PIP_ARGS="" elif [[ "$CUDA_VERSION_MAJOR" == "13" ]]; then VLLM_GITHUB_WHEEL="vllm-${VLLM_VER}+${TORCH_BACKEND}-cp38-abi3-manylinux_2_35_${ALT_ARCH}.whl" EXTRA_PIP_ARGS="--index-strategy=unsafe-best-match --extra-index-url https://download.pytorch.org/whl/${TORCH_BACKEND}" else echo "❌ Unsupported CUDA version for vLLM installation: ${CUDA_VERSION}" exit 1 fi VLLM_GITHUB_URL="https://github.com/vllm-project/vllm/releases/download/v${VLLM_VER}/${VLLM_GITHUB_WHEEL}" # Install vLLM wheel # CUDA 12: Try PyPI first, fall back to GitHub release # CUDA 13: Always use GitHub release (PyPI only has cu12 wheels, --torch-backend # does not prevent uv from resolving the cu12 variant) echo "Installing vLLM $VLLM_VER (torch backend: $TORCH_BACKEND)..." if [[ "$CUDA_VERSION_MAJOR" == "12" ]]; then if uv pip install "vllm[flashinfer,runai]==${VLLM_VER}" ${EXTRA_PIP_ARGS} --torch-backend=${TORCH_BACKEND} 2>&1; then echo "✓ vLLM ${VLLM_VER} installed from PyPI" else echo "⚠ PyPI install failed, installing from GitHub release..." uv pip install ${EXTRA_PIP_ARGS} \ "${VLLM_GITHUB_URL}[flashinfer,runai]" \ --torch-backend=${TORCH_BACKEND} echo "✓ vLLM ${VLLM_VER} installed from GitHub" fi else echo "Installing vLLM from GitHub release (cu130 wheel not available on PyPI)..." uv pip install ${EXTRA_PIP_ARGS} \ "${VLLM_GITHUB_URL}[flashinfer,runai]" \ --torch-backend=${TORCH_BACKEND} echo "✓ vLLM ${VLLM_VER} installed from GitHub" fi uv pip install flashinfer-cubin==$FLASHINF_REF uv pip install flashinfer-jit-cache==$FLASHINF_REF --extra-index-url https://flashinfer.ai/whl/${TORCH_BACKEND} fi echo "✓ vLLM installation completed" echo "\n=== Installing vLLM-Omni ===" if [ -n "$VLLM_OMNI_REF" ] && [ "$ARCH" = "amd64" ]; then # Save original vllm entrypoint before vllm-omni overwrites it VLLM_BIN=$(which vllm) cp "$VLLM_BIN" /tmp/vllm-entrypoint-backup # Try PyPI first, fall back to building from source if uv pip install vllm-omni==${VLLM_OMNI_REF#v} 2>&1; then echo "✓ vLLM-Omni ${VLLM_OMNI_REF} installed from PyPI" else echo "⚠ PyPI install failed, building from source..." git clone --depth 1 --branch ${VLLM_OMNI_REF} https://github.com/vllm-project/vllm-omni.git $INSTALLATION_DIR/vllm-omni uv pip install $INSTALLATION_DIR/vllm-omni rm -rf $INSTALLATION_DIR/vllm-omni echo "✓ vLLM-Omni ${VLLM_OMNI_REF} installed from source" fi # Restore original vllm CLI entrypoint (vllm-omni replaces it with its own) cp /tmp/vllm-entrypoint-backup "$VLLM_BIN" echo "✓ Original vllm entrypoint preserved" else echo "⚠ Skipping vLLM-Omni (no ref provided or ARM64 not supported)" fi if [ "$DEVICE" = "cuda" ]; then echo "\n=== Installing DeepGEMM ===" cd $INSTALLATION_DIR/vllm/tools if [ -n "$DEEPGEMM_REF" ]; then bash install_deepgemm.sh --cuda-version "${CUDA_VERSION}" --ref "$DEEPGEMM_REF" else bash install_deepgemm.sh --cuda-version "${CUDA_VERSION}" fi echo "✓ DeepGEMM installation completed" echo "\n=== Installing EP Kernels (PPLX and DeepEP) ===" cd ep_kernels/ # TODO we will be able to specify which pplx and deepep commit we want in future TORCH_CUDA_ARCH_LIST="$TORCH_CUDA_ARCH_LIST" bash install_python_libraries.sh fi echo "\n✅ All installations completed successfully!"