#!/usr/bin/env bash
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# This script installs vLLM and its dependencies from PyPI (release versions only).
# Installation order:
# 1. LMCache (installed first so vLLM's dependencies take precedence)
# 2. vLLM
# 3. vLLM-Omni
# 4. DeepGEMM
# 5. EP kernels

set -euo pipefail

# Default vLLM release. VLLM_VER is the wheel version that gets installed;
# VLLM_REF is the git ref checked out from the vLLM repository.
VLLM_VER="0.16.0"
VLLM_REF="v${VLLM_VER}"
DEVICE="cuda"

# Basic Configurations
ARCH=$(uname -m)
MAX_JOBS=16
INSTALLATION_DIR=/tmp

# VLLM and Dependency Configurations
TORCH_CUDA_ARCH_LIST="9.0;10.0" # For EP Kernels -- TODO: check if we need to add 12.0+PTX
DEEPGEMM_REF=""
CUDA_VERSION="12.9"
FLASHINF_REF="v0.6.3"
LMCACHE_REF="0.3.14"
VLLM_OMNI_REF="v0.16.0rc1"

# Print the help text. The defaults shown are the values currently in effect.
usage() {
    echo "Usage: $0 [--device DEVICE] [--vllm-ref REF] [--max-jobs NUM] [--arch ARCH] [--installation-dir DIR] [--deepgemm-ref REF] [--flashinf-ref REF] [--lmcache-ref REF] [--vllm-omni-ref REF] [--torch-cuda-arch-list LIST] [--cuda-version VERSION]"
    echo "Options:"
    echo "  --device DEVICE     Device Selection (default: cuda)"
    echo "  --vllm-ref REF      vLLM release version (default: ${VLLM_REF})"
    echo "  --max-jobs NUM      Maximum parallel jobs (default: ${MAX_JOBS})"
    echo "  --arch ARCH         Architecture amd64|arm64 (default: auto-detect)"
    echo "  --installation-dir DIR  Install directory (default: ${INSTALLATION_DIR})"
    echo "  --deepgemm-ref REF  DeepGEMM git ref (default: ${DEEPGEMM_REF})"
    echo "  --flashinf-ref REF  FlashInfer version (default: ${FLASHINF_REF})"
    echo "  --lmcache-ref REF   LMCache version (default: ${LMCACHE_REF})"
    echo "  --vllm-omni-ref REF vLLM-Omni version (default: ${VLLM_OMNI_REF})"
    echo "  --torch-cuda-arch-list LIST  CUDA architectures (default: ${TORCH_CUDA_ARCH_LIST})"
    echo "  --cuda-version VERSION  CUDA version (default: ${CUDA_VERSION})"
}

# Abort with a readable message when option $1 is not followed by a value.
# An empty value is allowed (e.g. --deepgemm-ref "" keeps the default behavior).
require_value() {
    if [[ $# -lt 2 ]]; then
        echo "Error: option $1 requires a value" >&2
        exit 1
    fi
}

# Parse command-line options into the global configuration variables.
# Exits 0 after printing help, exits 1 on an unknown option, a missing value,
# or an unsupported device.
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --device)
                require_value "$@"
                # Only these two devices have install paths in this script;
                # anything else would skip the vLLM install entirely.
                case "$2" in
                    cuda|xpu)
                        DEVICE="$2"
                        ;;
                    *)
                        echo "Error: unsupported device '$2' (expected cuda or xpu)" >&2
                        exit 1
                        ;;
                esac
                shift 2
                ;;
            --vllm-ref)
                require_value "$@"
                VLLM_REF="$2"
                # Keep the installed wheel version in step with a release-style
                # ref (vX.Y.Z...), so the checked-out sources and the wheel match.
                # Non-release refs (branches, SHAs) leave VLLM_VER untouched.
                if [[ "$VLLM_REF" =~ ^v?([0-9]+\.[0-9]+\.[0-9]+[A-Za-z0-9.]*)$ ]]; then
                    VLLM_VER="${BASH_REMATCH[1]}"
                fi
                shift 2
                ;;
            --max-jobs)
                require_value "$@"
                MAX_JOBS="$2"
                shift 2
                ;;
            --arch)
                require_value "$@"
                ARCH="$2"
                shift 2
                ;;
            --installation-dir)
                require_value "$@"
                INSTALLATION_DIR="$2"
                shift 2
                ;;
            --deepgemm-ref)
                require_value "$@"
                DEEPGEMM_REF="$2"
                shift 2
                ;;
            --flashinf-ref)
                require_value "$@"
                FLASHINF_REF="$2"
                shift 2
                ;;
            --lmcache-ref)
                require_value "$@"
                LMCACHE_REF="$2"
                shift 2
                ;;
            --vllm-omni-ref)
                require_value "$@"
                VLLM_OMNI_REF="$2"
                shift 2
                ;;
            --torch-cuda-arch-list)
                require_value "$@"
                TORCH_CUDA_ARCH_LIST="$2"
                shift 2
                ;;
            --cuda-version)
                require_value "$@"
                CUDA_VERSION="$2"
                shift 2
                ;;
            -h|--help)
                usage
                exit 0
                ;;
            *)
                echo "Unknown option: $1" >&2
                exit 1
                ;;
        esac
    done
}

parse_args "$@"

# Normalize ARCH to the Docker-style name (amd64|arm64) and set ALT_ARCH to the
# matching kernel-style name (x86_64|aarch64) used in the vLLM wheel file name.
# Falls back to `uname -m` when ARCH is unset. Exits 1 on any other
# architecture, so a later use of ALT_ARCH cannot hit an unbound variable.
normalize_arch() {
    case "${ARCH:=$(uname -m)}" in
        amd64|x86_64)
            ARCH="amd64"
            ALT_ARCH="x86_64"
            ;;
        arm64|aarch64)
            ARCH="arm64"
            ALT_ARCH="aarch64"
            ;;
        *)
            echo "Error: unsupported architecture '${ARCH}' (expected amd64 or arm64)" >&2
            exit 1
            ;;
    esac
}

normalize_arch

# Make the parallel-job limit visible to child build processes.
export MAX_JOBS

# Derive CUDA-specific settings from CUDA_VERSION:
#   CUDA_HOME           exported, fixed to /usr/local/cuda
#   TORCH_BACKEND       version with the dots removed, e.g. "12.9" -> "cu129"
#   CUDA_VERSION_MAJOR  text before the first dot, e.g. "12.9" -> "12"
setup_cuda_env() {
    export CUDA_HOME=/usr/local/cuda
    TORCH_BACKEND="cu${CUDA_VERSION//./}"
    CUDA_VERSION_MAJOR="${CUDA_VERSION%%.*}"
}

if [[ "$DEVICE" == "cuda" ]]; then
    setup_cuda_env

    echo "=== Installing prerequisites ==="
    uv pip install pip cuda-python
fi

# Print the effective configuration for the selected device.
# printf is used for the header because bash's builtin echo does not expand
# "\n" without -e, so the original printed a literal backslash-n.
print_config_summary() {
    case "$DEVICE" in
        cuda)
            printf '\n=== Configuration Summary ===\n'
            echo "  VLLM_REF=$VLLM_REF | ARCH=$ARCH | CUDA_VERSION=$CUDA_VERSION | TORCH_BACKEND=$TORCH_BACKEND"
            echo "  TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST | INSTALLATION_DIR=$INSTALLATION_DIR"
            ;;
        xpu)
            printf '\n=== Configuration Summary ===\n'
            echo "  VLLM_REF=$VLLM_REF | ARCH=$ARCH | INSTALLATION_DIR=$INSTALLATION_DIR"
            ;;
    esac
}

print_config_summary

# Install LMCache BEFORE vLLM so vLLM's dependencies take precedence.
#   cuda: installed only for CUDA major version 12 on amd64.
#   xpu:  installed only on amd64.
# Every skipped case prints a message so the log shows why nothing was installed.
install_lmcache() {
    if [[ "$DEVICE" == "cuda" ]]; then
        if [[ "$CUDA_VERSION_MAJOR" == "12" ]]; then
            echo "  FLASHINF_REF=$FLASHINF_REF | LMCACHE_REF=$LMCACHE_REF | DEEPGEMM_REF=$DEEPGEMM_REF"
            printf '\n=== Installing LMCache ===\n'
            if [[ "$ARCH" == "amd64" ]]; then
                # LMCache installation currently fails on arm64 due to CUDA dependency issues
                uv pip install "lmcache==${LMCACHE_REF}" "--torch-backend=${TORCH_BACKEND}"
                echo "✓ LMCache ${LMCACHE_REF} installed"
            else
                echo "⚠ Skipping LMCache on ARM64 (compatibility issues)"
            fi
        else
            echo "  FLASHINF_REF=$FLASHINF_REF | LMCache will not be installed as it doesn't support CUDA 13 yet | DEEPGEMM_REF=$DEEPGEMM_REF"
        fi
    elif [[ "$DEVICE" == "xpu" ]]; then
        echo " LMCACHE_REF=$LMCACHE_REF "
        printf '\n=== Installing LMCache ===\n'
        if [[ "$ARCH" == "amd64" ]]; then
            uv pip install "lmcache==${LMCACHE_REF}"
            echo "✓ LMCache ${LMCACHE_REF} installed"
        else
            echo "⚠ Skipping LMCache (not installed on ${ARCH})"
        fi
    fi
}

install_lmcache

# printf is used because bash's builtin echo prints "\n" literally without -e.
printf '\n=== Cloning vLLM repository ===\n'
# Clone needed for DeepGEMM and EP kernels install scripts
# Paths and refs are quoted so values containing spaces or glob characters
# are passed through unchanged.
cd -- "$INSTALLATION_DIR"
git clone https://github.com/vllm-project/vllm.git vllm
cd vllm
git checkout "$VLLM_REF"
echo "✓ vLLM repository cloned"

# Build and install vLLM for XPU from the current directory: apply the patch
# at /tmp/vllm-xpu.patch, install requirements/xpu.txt, then install the
# package itself without build isolation.
install_vllm_xpu() {
    # printf: bash's builtin echo would print "\n" literally.
    printf '\n=== Installing vLLM ===\n'
    git apply --ignore-whitespace /tmp/vllm-xpu.patch
    uv pip install -r requirements/xpu.txt --index-strategy unsafe-best-match
    uv pip install --verbose --no-build-isolation .
}

if [[ "$DEVICE" == "xpu" ]]; then
    install_vllm_xpu
fi

# Set VLLM_GITHUB_WHEEL, VLLM_GITHUB_URL and EXTRA_PIP_ARGS for the current
# CUDA major version. Exits 1 on an unsupported CUDA major version.
#   CUDA 12 wheels have no +cu suffix and use manylinux_2_31
#   CUDA 13 wheels have +cu130 suffix and use manylinux_2_35
# EXTRA_PIP_ARGS is an array so each argument is passed to uv as its own word
# without relying on word-splitting.
resolve_vllm_github_wheel() {
    case "$CUDA_VERSION_MAJOR" in
        12)
            VLLM_GITHUB_WHEEL="vllm-${VLLM_VER}-cp38-abi3-manylinux_2_31_${ALT_ARCH}.whl"
            EXTRA_PIP_ARGS=()
            ;;
        13)
            VLLM_GITHUB_WHEEL="vllm-${VLLM_VER}+${TORCH_BACKEND}-cp38-abi3-manylinux_2_35_${ALT_ARCH}.whl"
            EXTRA_PIP_ARGS=(
                --index-strategy=unsafe-best-match
                --extra-index-url "https://download.pytorch.org/whl/${TORCH_BACKEND}"
            )
            ;;
        *)
            echo "❌ Unsupported CUDA version for vLLM installation: ${CUDA_VERSION}" >&2
            exit 1
            ;;
    esac
    VLLM_GITHUB_URL="https://github.com/vllm-project/vllm/releases/download/v${VLLM_VER}/${VLLM_GITHUB_WHEEL}"
}

# Install the vLLM wheel (with the flashinfer and runai extras) from the
# GitHub release URL computed by resolve_vllm_github_wheel.
install_vllm_from_github() {
    # The ${arr[@]+...} form keeps an empty array safe under `set -u` on older bash.
    uv pip install ${EXTRA_PIP_ARGS[@]+"${EXTRA_PIP_ARGS[@]}"} \
        "${VLLM_GITHUB_URL}[flashinfer,runai]" \
        "--torch-backend=${TORCH_BACKEND}"
    echo "✓ vLLM ${VLLM_VER} installed from GitHub"
}

# Install vLLM and the FlashInfer cubin / jit-cache packages for CUDA.
#   CUDA 12: Try PyPI first, fall back to GitHub release
#   CUDA 13: Always use GitHub release (PyPI only has cu12 wheels, --torch-backend
#            does not prevent uv from resolving the cu12 variant)
install_vllm_cuda() {
    # printf: bash's builtin echo would print "\n" literally.
    printf '\n=== Installing vLLM & FlashInfer ===\n'
    resolve_vllm_github_wheel

    echo "Installing vLLM $VLLM_VER (torch backend: $TORCH_BACKEND)..."
    if [[ "$CUDA_VERSION_MAJOR" == "12" ]]; then
        if uv pip install "vllm[flashinfer,runai]==${VLLM_VER}" "--torch-backend=${TORCH_BACKEND}" 2>&1; then
            echo "✓ vLLM ${VLLM_VER} installed from PyPI"
        else
            echo "⚠ PyPI install failed, installing from GitHub release..."
            install_vllm_from_github
        fi
    else
        echo "Installing vLLM from GitHub release (cu130 wheel not available on PyPI)..."
        install_vllm_from_github
    fi

    uv pip install "flashinfer-cubin==${FLASHINF_REF}"
    uv pip install "flashinfer-jit-cache==${FLASHINF_REF}" --extra-index-url "https://flashinfer.ai/whl/${TORCH_BACKEND}"
}

if [[ "$DEVICE" == "cuda" ]]; then
    install_vllm_cuda
fi
echo "✓ vLLM installation completed"

# Install vLLM-Omni (amd64 only, and only when VLLM_OMNI_REF is non-empty).
# The vllm CLI entrypoint is backed up before the install and restored
# afterwards, because vllm-omni replaces it with its own.
# Sets the global VLLM_BIN to the path of the vllm executable.
install_vllm_omni() {
    # printf: bash's builtin echo would print "\n" literally.
    printf '\n=== Installing vLLM-Omni ===\n'
    if [[ -z "$VLLM_OMNI_REF" || "$ARCH" != "amd64" ]]; then
        echo "⚠ Skipping vLLM-Omni (no ref provided or ARM64 not supported)"
        return 0
    fi

    local entrypoint_backup omni_src

    # Save original vllm entrypoint before vllm-omni overwrites it
    VLLM_BIN=$(command -v vllm) || {
        echo "❌ vllm executable not found on PATH; cannot install vLLM-Omni" >&2
        exit 1
    }
    # mktemp avoids a predictable file name in /tmp; cp -p keeps the
    # executable bit on both the backup and the restored entrypoint.
    entrypoint_backup=$(mktemp)
    cp -p -- "$VLLM_BIN" "$entrypoint_backup"

    # Try PyPI first, fall back to building from source
    omni_src="${INSTALLATION_DIR}/vllm-omni"
    if uv pip install "vllm-omni==${VLLM_OMNI_REF#v}" 2>&1; then
        echo "✓ vLLM-Omni ${VLLM_OMNI_REF} installed from PyPI"
    else
        echo "⚠ PyPI install failed, building from source..."
        git clone --depth 1 --branch "$VLLM_OMNI_REF" https://github.com/vllm-project/vllm-omni.git "$omni_src"
        uv pip install "$omni_src"
        rm -rf -- "$omni_src"
        echo "✓ vLLM-Omni ${VLLM_OMNI_REF} installed from source"
    fi

    # Restore original vllm CLI entrypoint (vllm-omni replaces it with its own)
    cp -p -- "$entrypoint_backup" "$VLLM_BIN"
    rm -f -- "$entrypoint_backup"
    echo "✓ Original vllm entrypoint preserved"
}

install_vllm_omni

# Run the DeepGEMM and EP kernel (PPLX and DeepEP) install scripts from the
# vLLM checkout under $INSTALLATION_DIR/vllm/tools. DEEPGEMM_REF is passed as
# --ref only when it is non-empty. Changes the current directory.
install_cuda_kernels() {
    # printf: bash's builtin echo would print "\n" literally.
    printf '\n=== Installing DeepGEMM ===\n'
    cd -- "$INSTALLATION_DIR/vllm/tools"
    local deepgemm_args=(--cuda-version "${CUDA_VERSION}")
    if [[ -n "$DEEPGEMM_REF" ]]; then
        deepgemm_args+=(--ref "$DEEPGEMM_REF")
    fi
    bash install_deepgemm.sh "${deepgemm_args[@]}"
    echo "✓ DeepGEMM installation completed"

    printf '\n=== Installing EP Kernels (PPLX and DeepEP) ===\n'
    cd ep_kernels/
    # TODO we will be able to specify which pplx and deepep commit we want in future
    TORCH_CUDA_ARCH_LIST="$TORCH_CUDA_ARCH_LIST" bash install_python_libraries.sh
}

if [[ "$DEVICE" == "cuda" ]]; then
    install_cuda_kernels
fi
printf '\n✅ All installations completed successfully!\n'