#!/usr/bin/env bash
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# This script installs vLLM and its dependencies (release versions only).
# For CUDA 12 vLLM comes from PyPI; for CUDA 13 it comes from the GitHub
# release wheel (amd64) or is built from source (other architectures).
# Installation order:
# 1. LMCache (installed first so vLLM's dependencies take precedence)
# 2. vLLM
# 3. DeepGEMM
# 4. EP kernels

set -euo pipefail

# Pinned vLLM release. VLLM_VER is the bare version number; VLLM_REF is the
# matching "v"-prefixed ref that is checked out and used as the version pin.
VLLM_VER="0.12.0"
VLLM_REF="v${VLLM_VER}"

# Basic Configurations
# ARCH starts as the raw machine name reported by uname; MAX_JOBS bounds build
# parallelism; INSTALLATION_DIR is where the vLLM repository is cloned.
ARCH=$(uname -m)
MAX_JOBS=16
INSTALLATION_DIR=/tmp

# VLLM and Dependency Configurations
TORCH_CUDA_ARCH_LIST="9.0;10.0" # For EP Kernels -- TODO: check if we need to add 12.0+PTX
# Empty DEEPGEMM_REF means install_deepgemm.sh is invoked without --ref.
DEEPGEMM_REF=""
CUDA_VERSION="12.9"
FLASHINF_REF="v0.5.3"
# LMCache version - 0.3.9+ required for vLLM 0.11.2 compatibility
LMCACHE_REF="0.3.10"

# require_value OPTION REMAINING_ARGC
#   Exits with status 1 and a readable message on stderr when OPTION is the
#   last word on the command line, i.e. no value follows it. Without this
#   check the "$2" reference fails as an unbound variable under `set -u`.
require_value() {
    if [[ "$2" -lt 2 ]]; then
        echo "Error: option $1 requires a value" >&2
        exit 1
    fi
}

# print_usage
#   Writes the help text to stdout. Defaults shown are the current values of
#   the configuration variables at the time the help option is reached.
print_usage() {
    echo "Usage: $0 [--vllm-ref REF] [--max-jobs NUM] [--arch ARCH] [--installation-dir DIR] [--deepgemm-ref REF] [--flashinf-ref REF] [--lmcache-ref REF] [--torch-cuda-arch-list LIST] [--cuda-version VERSION]"
    echo "Options:"
    echo "  --vllm-ref REF      vLLM release version (default: ${VLLM_REF})"
    echo "  --max-jobs NUM      Maximum parallel jobs (default: ${MAX_JOBS})"
    echo "  --arch ARCH         Architecture amd64|arm64 (default: auto-detect)"
    echo "  --installation-dir DIR  Install directory (default: ${INSTALLATION_DIR})"
    echo "  --deepgemm-ref REF  DeepGEMM git ref (default: ${DEEPGEMM_REF})"
    echo "  --flashinf-ref REF  FlashInfer version (default: ${FLASHINF_REF})"
    echo "  --lmcache-ref REF   LMCache version (default: ${LMCACHE_REF})"
    echo "  --torch-cuda-arch-list LIST  CUDA architectures (default: ${TORCH_CUDA_ARCH_LIST})"
    echo "  --cuda-version VERSION  CUDA version (default: ${CUDA_VERSION})"
}

# parse_args ARGS...
#   Applies command-line overrides to the global configuration variables.
#   Every option except -h/--help consumes exactly one value.
#   Exits 0 after printing help; exits 1 on an unknown option or missing value.
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --vllm-ref)
                require_value "$1" "$#"
                VLLM_REF="$2"
                # Keep the bare version in sync: it is used to build the
                # release-wheel URL, which must match the checked-out ref.
                VLLM_VER="${VLLM_REF#v}"
                shift 2
                ;;
            --max-jobs)
                require_value "$1" "$#"
                MAX_JOBS="$2"
                shift 2
                ;;
            --arch)
                require_value "$1" "$#"
                ARCH="$2"
                shift 2
                ;;
            --installation-dir)
                require_value "$1" "$#"
                INSTALLATION_DIR="$2"
                shift 2
                ;;
            --deepgemm-ref)
                require_value "$1" "$#"
                DEEPGEMM_REF="$2"
                shift 2
                ;;
            --flashinf-ref)
                require_value "$1" "$#"
                FLASHINF_REF="$2"
                shift 2
                ;;
            --lmcache-ref)
                require_value "$1" "$#"
                LMCACHE_REF="$2"
                shift 2
                ;;
            --torch-cuda-arch-list)
                require_value "$1" "$#"
                TORCH_CUDA_ARCH_LIST="$2"
                shift 2
                ;;
            --cuda-version)
                require_value "$1" "$#"
                CUDA_VERSION="$2"
                shift 2
                ;;
            -h|--help)
                print_usage
                exit 0
                ;;
            *)
                echo "Unknown option: $1" >&2
                exit 1
                ;;
        esac
    done
}

parse_args "$@"

# normalize_arch MACHINE
#   Convert x86_64 to amd64 (and aarch64 to arm64) for consistency with the
#   Docker ARG naming. Any other value is printed unchanged.
normalize_arch() {
    case "$1" in
        x86_64)  printf '%s\n' "amd64" ;;
        aarch64) printf '%s\n' "arm64" ;;
        *)       printf '%s\n' "$1" ;;
    esac
}

# torch_backend_for CUDA_VERSION
#   Derive torch backend from CUDA version (e.g., "12.9" -> "cu129") by
#   dropping every dot and prefixing "cu".
torch_backend_for() {
    printf 'cu%s\n' "${1//./}"
}

ARCH="$(normalize_arch "${ARCH}")"

# Exported so child build processes see the job limit and CUDA location.
export MAX_JOBS="${MAX_JOBS}"
export CUDA_HOME=/usr/local/cuda

TORCH_BACKEND="$(torch_backend_for "${CUDA_VERSION}")"
# Major version only ("12.9" -> "12"); selects the install path further down.
CUDA_VERSION_MAJOR="${CUDA_VERSION%%.*}"

echo "=== Installing prerequisites ==="
# Installed before any other package in this script.
uv pip install pip cuda-python

# print_config_summary
#   Prints the effective configuration to stdout. printf is used instead of
#   echo because Bash's echo does not interpret "\n" without -e, so the
#   intended blank line before the heading was printed as a literal "\n".
print_config_summary() {
    printf '\n=== Configuration Summary ===\n'
    printf '  VLLM_REF=%s | ARCH=%s | CUDA_VERSION=%s | TORCH_BACKEND=%s\n' \
        "$VLLM_REF" "$ARCH" "$CUDA_VERSION" "$TORCH_BACKEND"
    printf '  TORCH_CUDA_ARCH_LIST=%s | INSTALLATION_DIR=%s\n' \
        "$TORCH_CUDA_ARCH_LIST" "$INSTALLATION_DIR"
}

print_config_summary

# LMCache step. It is only attempted for CUDA 12; for any other CUDA major
# version this block just prints the remaining summary line and moves on.
if [[ "$CUDA_VERSION_MAJOR" == "12" ]]; then
    echo "  FLASHINF_REF=$FLASHINF_REF | LMCACHE_REF=$LMCACHE_REF | DEEPGEMM_REF=$DEEPGEMM_REF"
    # printf, not echo: Bash's echo would print the "\n" literally.
    printf '\n=== Installing LMCache ===\n'
    if [[ "$ARCH" == "amd64" ]]; then
        # LMCache installation currently fails on arm64 due to CUDA dependency issues
        # Install LMCache BEFORE vLLM so vLLM's dependencies take precedence
        uv pip install "lmcache==${LMCACHE_REF}" "--torch-backend=${TORCH_BACKEND}"
        echo "✓ LMCache ${LMCACHE_REF} installed"
    else
        echo "⚠ Skipping LMCache on ARM64 (compatibility issues)"
    fi
else
    echo "  FLASHINF_REF=$FLASHINF_REF | LMCache will not be installed as it doesn't support CUDA 13 yet | DEEPGEMM_REF=$DEEPGEMM_REF"
fi

# printf, not echo: Bash's echo would print the "\n" literally.
printf '\n=== Cloning vLLM repository ===\n'
# Clone needed for DeepGEMM and EP kernels install scripts
# Paths and refs are quoted so values containing spaces or glob characters
# are passed through as a single argument.
cd "$INSTALLATION_DIR"
git clone https://github.com/vllm-project/vllm.git vllm
cd vllm
git checkout "$VLLM_REF"
# TODO: remove this cherry-pick when vllm is upgraded to > 0.12.0 (when the fix is shipped)
# --no-commit applies the change to the working tree without creating a commit.
git cherry-pick --no-commit 799804d140fc99ce3964648ba91aaa810cf28fef # nvshmem fix for CUDA 13.0
echo "✓ vLLM repository cloned"

# install_flashinfer_extras
#   Installs the FlashInfer cubin and JIT-cache packages pinned to
#   FLASHINF_REF; the JIT cache is resolved against the FlashInfer wheel
#   index for the selected torch backend.
install_flashinfer_extras() {
    uv pip install "flashinfer-cubin==${FLASHINF_REF}"
    uv pip install "flashinfer-jit-cache==${FLASHINF_REF}" \
        --extra-index-url "https://flashinfer.ai/whl/${TORCH_BACKEND}"
}

# printf, not echo: Bash's echo would print the "\n" literally.
printf '\n=== Installing vLLM & FlashInfer ===\n'
if [[ "$CUDA_VERSION_MAJOR" == "12" ]]; then
    echo "Installing vLLM $VLLM_REF from PyPI..."
    # Quoted: the extras brackets are glob characters to the shell.
    uv pip install "vllm[flashinfer,runai]==${VLLM_REF}" "--torch-backend=${TORCH_BACKEND}"
    install_flashinfer_extras
elif [[ "$CUDA_VERSION_MAJOR" == "13" ]]; then
    if [[ "$ARCH" == "amd64" ]]; then
        echo "Installing vLLM $VLLM_REF from GitHub since CUDA 13 x86_64 wheel is only present on GitHub..."
        uv pip install \
            --index-strategy=unsafe-best-match \
            --extra-index-url "https://download.pytorch.org/whl/${TORCH_BACKEND}" \
            "https://github.com/vllm-project/vllm/releases/download/v${VLLM_VER}/vllm-${VLLM_VER}+${TORCH_BACKEND}-cp38-abi3-manylinux_2_31_x86_64.whl[flashinfer,runai]" \
            "--torch-backend=${TORCH_BACKEND}"
        install_flashinfer_extras
    else
        echo "⚠ Skipping LMCache on ARM64 (compatibility issues, missing aarch64 wheels)"
        echo "Building vLLM from source for ${ARCH} architecture..."
        echo "Try to install specific PyTorch and other dependencies first"
        # Both the requirements file and "." are relative paths: this branch
        # relies on the current directory still being the vLLM checkout.
        uv pip install --index-strategy=unsafe-best-match --index https://download.pytorch.org/whl/ -r requirements/cuda.txt
        uv pip install setuptools_scm # required to build vLLM from source
        MAX_JOBS="${MAX_JOBS}" uv pip install -v --no-build-isolation .
    fi
else
    echo "❌ Unsupported CUDA version for vLLM installation: ${CUDA_VERSION}" >&2
    exit 1
fi
# Single completion message for every successful path.
echo "✓ vLLM installation completed"

# printf, not echo: Bash's echo would print the "\n" literally.
printf '\n=== Installing DeepGEMM ===\n'
cd "$INSTALLATION_DIR/vllm/tools"
# Build the argument list once; --ref is only passed when a ref was configured.
deepgemm_args=(--cuda-version "${CUDA_VERSION}")
if [[ -n "$DEEPGEMM_REF" ]]; then
    deepgemm_args+=(--ref "$DEEPGEMM_REF")
fi
bash install_deepgemm.sh "${deepgemm_args[@]}"
echo "✓ DeepGEMM installation completed"

# printf, not echo: Bash's echo would print the "\n" literally.
printf '\n=== Installing EP Kernels (PPLX and DeepEP) ===\n'
# Relative path: resolved against the directory entered for the DeepGEMM step.
cd ep_kernels/
# TODO we will be able to specify which pplx and deepep commit we want in future
# The arch list is passed only in the environment of this one command.
TORCH_CUDA_ARCH_LIST="$TORCH_CUDA_ARCH_LIST" bash install_python_libraries.sh

printf '\n✅ All installations completed successfully!\n'