#!/usr/bin/env bash
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# This script installs a vLLM release and its dependencies (from PyPI where
# possible, otherwise from GitHub release wheels or from source).
# Installation order:
# 1. vLLM-Omni (installed BEFORE vLLM so its dependencies cannot change the torch stack afterwards)
# 2. vLLM
# 3. LMCache (built from source AFTER vLLM so c_ops.so is compiled against installed PyTorch)
# 4. DeepGEMM
# 5. EP kernels
12
13
14

# Strict mode: abort on command failure, on use of an unset variable, and on a
# failure in any stage of a pipeline.
set -euo pipefail

# Default vLLM release. VLLM_REF is the git ref checked out from the vLLM
# repository; VLLM_VER is the same value without the leading "v" and is used in
# package specifiers and wheel file names. Both are overridden by --vllm-ref.
VLLM_VER="0.19.0"
VLLM_REF="v${VLLM_VER}"
# Target device; the script has code paths for "cuda", "xpu" and "cpu".
DEVICE="cuda"

# Basic Configurations
ARCH="$(uname -m)"
MAX_JOBS=16
INSTALLATION_DIR=/tmp

# VLLM and Dependency Configurations
TORCH_CUDA_ARCH_LIST="9.0;10.0" # For EP Kernels -- TODO: check if we need to add 12.0+PTX
# Empty means "do not pass --ref to the DeepGEMM install script".
DEEPGEMM_REF=""
CUDA_VERSION="12.9"
FLASHINF_REF="v0.6.6"
LMCACHE_REF="0.4.2"
VLLM_OMNI_REF="v0.16.0"
31
32
33

# Print the command-line help text to stdout. The defaults shown are the
# current values of the configuration variables at the time of the call.
usage() {
    echo "Usage: $0 [--device DEVICE] [--vllm-ref REF] [--max-jobs NUM] [--arch ARCH] [--installation-dir DIR] [--deepgemm-ref REF] [--flashinf-ref REF] [--lmcache-ref REF] [--vllm-omni-ref REF] [--torch-cuda-arch-list LIST] [--cuda-version VERSION]"
    echo "Options:"
    echo "  --device DEVICE     Device Selection (default: cuda)"
    echo "  --vllm-ref REF      vLLM release version (default: ${VLLM_REF})"
    echo "  --max-jobs NUM      Maximum parallel jobs (default: ${MAX_JOBS})"
    echo "  --arch ARCH         Architecture amd64|arm64 (default: auto-detect)"
    echo "  --installation-dir DIR  Install directory (default: ${INSTALLATION_DIR})"
    echo "  --deepgemm-ref REF  DeepGEMM git ref (default: ${DEEPGEMM_REF})"
    echo "  --flashinf-ref REF  FlashInfer version (default: ${FLASHINF_REF})"
    echo "  --lmcache-ref REF   LMCache version (default: ${LMCACHE_REF})"
    echo "  --vllm-omni-ref REF vLLM-Omni version (default: ${VLLM_OMNI_REF})"
    echo "  --torch-cuda-arch-list LIST  CUDA architectures (default: ${TORCH_CUDA_ARCH_LIST})"
    echo "  --cuda-version VERSION  CUDA version (default: ${CUDA_VERSION})"
}

# Exit with a readable error when a value-taking option is the last argument.
# Without this check, reading "$2" under `set -u` aborts with a bare
# "unbound variable" message that does not name the offending option.
# Arguments: the caller's remaining "$@" (option name first).
# An explicitly empty value (e.g. --deepgemm-ref "") is accepted.
require_value() {
    if [[ $# -lt 2 ]]; then
        echo "Error: option $1 requires a value" >&2
        exit 1
    fi
}

# Parse command-line options and overwrite the matching configuration
# variables (globals). --help prints the usage text and exits 0; an unknown
# option or a missing option value exits 1.
# Arguments: the script's "$@".
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --device)
                require_value "$@"
                DEVICE="$2"
                shift 2
                ;;
            --vllm-ref)
                require_value "$@"
                VLLM_REF="$2"
                # Keep the bare version in sync with the ref ("v1.2.3" -> "1.2.3").
                VLLM_VER="${VLLM_REF#v}"
                shift 2
                ;;
            --max-jobs)
                require_value "$@"
                MAX_JOBS="$2"
                shift 2
                ;;
            --arch)
                require_value "$@"
                ARCH="$2"
                shift 2
                ;;
            --installation-dir)
                require_value "$@"
                INSTALLATION_DIR="$2"
                shift 2
                ;;
            --deepgemm-ref)
                require_value "$@"
                DEEPGEMM_REF="$2"
                shift 2
                ;;
            --flashinf-ref)
                require_value "$@"
                FLASHINF_REF="$2"
                shift 2
                ;;
            --lmcache-ref)
                require_value "$@"
                LMCACHE_REF="$2"
                shift 2
                ;;
            --vllm-omni-ref)
                require_value "$@"
                VLLM_OMNI_REF="$2"
                shift 2
                ;;
            --torch-cuda-arch-list)
                require_value "$@"
                TORCH_CUDA_ARCH_LIST="$2"
                shift 2
                ;;
            --cuda-version)
                require_value "$@"
                CUDA_VERSION="$2"
                shift 2
                ;;
            -h|--help)
                usage
                exit 0
                ;;
            *)
                echo "Unknown option: $1" >&2
                exit 1
                ;;
        esac
    done
}

parse_args "$@"

102
103
104
105
106
107
108
# Normalize ARCH to the Docker-style name (amd64 / arm64) and set ALT_ARCH to
# the matching kernel-style name (x86_64 / aarch64).
# Globals: ARCH (read, possibly rewritten), ALT_ARCH (written).
# An unrecognized ARCH is left unchanged and copied into ALT_ARCH with a
# warning, so ALT_ARCH is always defined and later expansions under `set -u`
# do not abort with an "unbound variable" error.
normalize_arch() {
    case "$ARCH" in
        x86_64|amd64)
            ARCH="amd64"
            ALT_ARCH="x86_64"
            ;;
        aarch64|arm64)
            ARCH="arm64"
            ALT_ARCH="aarch64"
            ;;
        *)
            echo "Warning: unrecognized architecture '${ARCH}'; expected amd64 or arm64" >&2
            ALT_ARCH="$ARCH"
            ;;
    esac
}

normalize_arch

116
# Export MAX_JOBS to child processes (presumably read by the native builds
# started later in this script -- confirm against those build scripts).
export MAX_JOBS

if [[ "$DEVICE" == "cuda" ]]; then
    export CUDA_HOME=/usr/local/cuda

    # Derive torch backend from CUDA version (e.g., "12.9" -> "cu129")
    TORCH_BACKEND="cu${CUDA_VERSION//./}"
    # Major component only (e.g., "12.9" -> "12"); selects the wheel flavor later.
    CUDA_VERSION_MAJOR="${CUDA_VERSION%%.*}"

    echo "=== Installing prerequisites ==="
    uv pip install pip cuda-python
fi
127

128
129
130
131
# Print the effective configuration for the selected device to stdout.
# CUDA builds also report the CUDA/torch settings; xpu and cpu builds report
# only the common settings; any other DEVICE prints nothing.
# printf emits the leading blank line because bash's builtin echo does not
# expand "\n" unless -e is given, so the old echo printed a literal backslash-n.
print_config_summary() {
    case "$DEVICE" in
        cuda)
            printf '\n%s\n' "=== Configuration Summary ==="
            echo "  VLLM_REF=$VLLM_REF | ARCH=$ARCH | CUDA_VERSION=$CUDA_VERSION | TORCH_BACKEND=$TORCH_BACKEND"
            echo "  TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST | INSTALLATION_DIR=$INSTALLATION_DIR"
            ;;
        xpu|cpu)
            printf '\n%s\n' "=== Configuration Summary ==="
            echo "  VLLM_REF=$VLLM_REF | ARCH=$ARCH | INSTALLATION_DIR=$INSTALLATION_DIR"
            ;;
    esac
}

print_config_summary
136

137
# printf (not echo) so the leading "\n" becomes a real blank line in bash.
printf '\n%s\n' "=== Cloning vLLM repository ==="
# Clone needed for DeepGEMM and EP kernels install scripts
# The xpu and cpu install steps further down also run relative to this
# checkout (requirements/*.txt, setup.py), so the working directory is left
# inside it.
cd "$INSTALLATION_DIR"
git clone https://github.com/vllm-project/vllm.git vllm
cd vllm
git checkout "$VLLM_REF"
echo "✓ vLLM repository cloned"
144

Alec's avatar
Alec committed
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
# printf (not echo) so the leading "\n" becomes a real blank line in bash.
printf '\n%s\n' "=== Installing vLLM-Omni ==="
# Install omni BEFORE vLLM. Its transitive dependencies can otherwise upgrade the
# torch/transformers stack after vLLM is installed, which can leave vllm._C ABI-mismatched.
# vLLM should remain the final owner of the runtime stack in this environment.
if [[ -n "$VLLM_OMNI_REF" && "$ARCH" == "amd64" ]]; then
    # Try PyPI first, fall back to building from source
    # The PyPI specifier uses the ref without its leading "v"; the git branch
    # used by the fallback keeps the ref as given.
    if uv pip install "vllm-omni==${VLLM_OMNI_REF#v}" 2>&1; then
        echo "✓ vLLM-Omni ${VLLM_OMNI_REF} installed from PyPI"
    else
        echo "⚠ PyPI install failed, building from source..."
        git clone --depth 1 --branch "${VLLM_OMNI_REF}" https://github.com/vllm-project/vllm-omni.git "${INSTALLATION_DIR}/vllm-omni"
        uv pip install "${INSTALLATION_DIR}/vllm-omni"
        rm -rf -- "${INSTALLATION_DIR}/vllm-omni"
        echo "✓ vLLM-Omni ${VLLM_OMNI_REF} installed from source"
    fi
else
    echo "⚠ Skipping vLLM-Omni (no ref provided or ARM64 not supported)"
fi

164
165
166
167
168
# XPU: build and install vLLM from the source checkout in the current
# directory (requirements/xpu.txt and "." are resolved relative to it).
if [[ "$DEVICE" == "xpu" ]]; then
    # printf (not echo) so the leading "\n" becomes a real blank line in bash.
    printf '\n%s\n' "=== Installing vLLM ==="
    uv pip install -r requirements/xpu.txt --index-strategy unsafe-best-match
    uv pip install --verbose --no-build-isolation .
fi
169

170
171
# CUDA: install a prebuilt vLLM wheel (with the flashinfer, runai and otel
# extras) and the matching FlashInfer packages.
if [[ "$DEVICE" == "cuda" ]]; then
    # printf (not echo) so the leading "\n" becomes a real blank line in bash.
    printf '\n%s\n' "=== Installing vLLM & FlashInfer ==="

    # uv flags shared by every vLLM install attempt below. They live in an
    # array so each flag stays a separate word without relying on unquoted
    # word splitting; the array always holds at least one element.
    vllm_pip_args=(--torch-backend="${TORCH_BACKEND}")

    # Build GitHub release wheel URL per CUDA version
    # CUDA 12 wheels have no +cu suffix and use manylinux_2_31
    # CUDA 13 wheels have +cu130 suffix and use manylinux_2_35
    if [[ "$CUDA_VERSION_MAJOR" == "12" ]]; then
        VLLM_GITHUB_WHEEL="vllm-${VLLM_VER}-cp38-abi3-manylinux_2_31_${ALT_ARCH}.whl"
    elif [[ "$CUDA_VERSION_MAJOR" == "13" ]]; then
        VLLM_GITHUB_WHEEL="vllm-${VLLM_VER}+${TORCH_BACKEND}-cp38-abi3-manylinux_2_35_${ALT_ARCH}.whl"
        vllm_pip_args+=(
            --index-strategy=unsafe-best-match
            --extra-index-url "https://download.pytorch.org/whl/${TORCH_BACKEND}"
        )
    else
        echo "❌ Unsupported CUDA version for vLLM installation: ${CUDA_VERSION}" >&2
        exit 1
    fi
    VLLM_GITHUB_URL="https://github.com/vllm-project/vllm/releases/download/v${VLLM_VER}/${VLLM_GITHUB_WHEEL}"

    # Install vLLM wheel
    # CUDA 12: Try PyPI first, fall back to GitHub release
    # CUDA 13: Always use GitHub release (PyPI only has cu12 wheels, --torch-backend
    #           does not prevent uv from resolving the cu12 variant)
    echo "Installing vLLM $VLLM_VER (torch backend: $TORCH_BACKEND)..."
    if [[ "$CUDA_VERSION_MAJOR" == "12" ]]; then
        if uv pip install "vllm[flashinfer,runai,otel]==${VLLM_VER}" "${vllm_pip_args[@]}" 2>&1; then
            echo "✓ vLLM ${VLLM_VER} installed from PyPI"
        else
            echo "⚠ PyPI install failed, installing from GitHub release..."
            uv pip install "${vllm_pip_args[@]}" \
                "${VLLM_GITHUB_URL}[flashinfer,runai,otel]"
            echo "✓ vLLM ${VLLM_VER} installed from GitHub"
        fi
    else
        echo "Installing vLLM from GitHub release (cu130 wheel not available on PyPI)..."
        uv pip install "${vllm_pip_args[@]}" \
            "${VLLM_GITHUB_URL}[flashinfer,runai,otel]"
        echo "✓ vLLM ${VLLM_VER} installed from GitHub"
    fi

    # FlashInfer companion packages pinned to the same FlashInfer ref; the
    # jit-cache wheel is looked up on the per-backend FlashInfer index.
    uv pip install "flashinfer-cubin==${FLASHINF_REF}"
    uv pip install "flashinfer-jit-cache==${FLASHINF_REF}" --extra-index-url "https://flashinfer.ai/whl/${TORCH_BACKEND}"
fi
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227

# CPU: build a vLLM wheel from the source checkout in the current directory
# and install it.
if [[ "$DEVICE" == "cpu" ]]; then
    # printf (not echo) so the leading "\n" becomes a real blank line in bash.
    printf '\n%s\n' "=== Installing vLLM for cpu ==="
    # When CACHE_BUSTER is set, record its value in a file. NOTE(review):
    # presumably this invalidates a container build-cache layer -- confirm
    # with the calling Dockerfile.
    if [[ -n "${CACHE_BUSTER:-}" ]]; then
        printf '%s\n' "$CACHE_BUSTER" > /tmp/builder-buster
    fi
    # vLLM CPU requirements pin torch with a +cpu local version (e.g. 2.10.0+cpu),
    # which is published on the PyTorch CPU wheel index instead of PyPI.
    # Install torchvision, torchaudio from the same index to get the correct versions with +cpu suffix.
    uv pip install -r requirements/cpu-build.txt --extra-index-url https://download.pytorch.org/whl/cpu --index-strategy unsafe-best-match
    uv pip install torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu --index-strategy unsafe-best-match
    VLLM_TARGET_DEVICE=cpu \
    python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38
    # Intentionally a glob: installs whatever wheel(s) the build left in dist/.
    uv pip install dist/*.whl
fi
echo "✓ vLLM installation completed"
229

230
231
232
233
234
# printf (not echo) so the leading "\n" becomes a real blank line in bash.
printf '\n%s\n' "=== Installing LMCache from source ==="
# LMCache prebuilt wheels are built against PyTorch <=2.8.0 and fail with PyTorch 2.10+
# (undefined symbol: c10::cuda::c10_cuda_check_implementation).
# Build from source AFTER vLLM so c_ops.so compiles against the installed PyTorch.
# Ref: https://docs.lmcache.ai/getting_started/installation.html#install-latest-lmcache-from-source
if [[ "$DEVICE" == "cuda" ]]; then
    git clone --depth 1 --branch "v${LMCACHE_REF}" https://github.com/LMCache/LMCache.git "${INSTALLATION_DIR}/lmcache"
    cd "${INSTALLATION_DIR}/lmcache"
    uv pip install -r requirements/build.txt
    # Get torch lib dir and embed it as RPATH so c_ops.so finds torch libs at runtime
    TORCH_LIB="$(python3 -c "import torch, os; print(os.path.dirname(torch.__file__) + '/lib')")"
    # Build from source with --no-build-isolation (uses installed torch) + RPATH for runtime linking
    # NOTE(review): this build uses its own fixed arch list rather than the
    # script-level TORCH_CUDA_ARCH_LIST / --torch-cuda-arch-list value --
    # confirm that this is intentional.
    TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9;9.0;10.0+PTX" LDFLAGS="-Wl,-rpath,${TORCH_LIB}" \
        uv pip install --no-build-isolation --no-cache .
    # Verify c_ops.so was compiled (cannot import at build time without GPU/CUDA driver)
    # cd to neutral dir so Python finds installed lmcache, not the source checkout
    cd /tmp
    LMCACHE_DIR="$(python3 -c "import lmcache, os; print(os.path.dirname(lmcache.__file__))")"
    # Expand the glob into an array instead of parsing `ls`; when nothing
    # matches, the pattern stays literal and the -e test below fails.
    c_ops_files=("${LMCACHE_DIR}"/c_ops*.so)
    if [[ -e "${c_ops_files[0]}" ]]; then
        echo "✓ lmcache c_ops.so verified: ${c_ops_files[0]##*/}"
    else
        echo "ERROR: c_ops.so not found in ${LMCACHE_DIR} - CUDA extension was not compiled" >&2
        exit 1
    fi
    rm -rf -- "${INSTALLATION_DIR}/lmcache"
    echo "✓ LMCache ${LMCACHE_REF} installed from source"
elif [[ "$DEVICE" == "xpu" && "$ARCH" == "amd64" ]]; then
    uv pip install "lmcache==${LMCACHE_REF}"
    echo "✓ LMCache ${LMCACHE_REF} installed from PyPI (XPU)"
else
    echo "⚠ Skipping LMCache for DEVICE=${DEVICE} ARCH=${ARCH} (not supported)"
fi

263
264
265
266
267
268
269
270
271
# CUDA only: run the DeepGEMM and EP-kernel install scripts from the tools/
# directory of the vLLM checkout.
if [[ "$DEVICE" == "cuda" ]]; then
    # printf (not echo) so the leading "\n" becomes a real blank line in bash.
    printf '\n%s\n' "=== Installing DeepGEMM ==="
    cd "${INSTALLATION_DIR}/vllm/tools"
    # --ref is passed only when a DeepGEMM ref was configured; otherwise the
    # install script is called without it.
    deepgemm_args=(--cuda-version "${CUDA_VERSION}")
    if [[ -n "$DEEPGEMM_REF" ]]; then
        deepgemm_args+=(--ref "$DEEPGEMM_REF")
    fi
    bash install_deepgemm.sh "${deepgemm_args[@]}"
    echo "✓ DeepGEMM installation completed"

    printf '\n%s\n' "=== Installing EP Kernels (PPLX and DeepEP) ==="
    cd ep_kernels/
    # TODO we will be able to specify which pplx and deepep commit we want in future
    TORCH_CUDA_ARCH_LIST="$TORCH_CUDA_ARCH_LIST" bash install_python_libraries.sh
fi

printf '\n%s\n' "✅ All installations completed successfully!"