#!/usr/bin/env bash
# SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# This script installs vLLM and its dependencies from PyPI (release versions only).
# Installation order:
# 1. LMCache (installed first so vLLM's dependencies take precedence)
# 2. vLLM
# 3. DeepGEMM
# 4. EP kernels

# Strict mode: abort on a failing command, on use of an unset variable, and on
# a failure in any stage of a pipeline.
set -euo pipefail

# vLLM release to install. VLLM_VER is the bare version number; VLLM_REF is the
# corresponding "v"-prefixed ref and can be overridden with --vllm-ref.
VLLM_VER="0.14.1"
VLLM_REF="v${VLLM_VER}"

# Basic Configurations
ARCH=$(uname -m)          # raw machine name; mapped to amd64/arm64 after argument parsing
MAX_JOBS=16               # exported as MAX_JOBS further down
INSTALLATION_DIR=/tmp     # the vLLM repository is cloned under this directory

# VLLM and Dependency Configurations
TORCH_CUDA_ARCH_LIST="9.0;10.0" # For EP Kernels -- TODO: check if we need to add 12.0+PTX
DEEPGEMM_REF=""           # empty means install_deepgemm.sh is run without --ref
CUDA_VERSION="12.9"
FLASHINF_REF="v0.5.3"
LMCACHE_REF="0.3.12"

# Print the command-line help. Defaults shown are the values in effect at the
# time of the call (built-in defaults plus any options already parsed).
usage() {
    echo "Usage: $0 [--vllm-ref REF] [--max-jobs NUM] [--arch ARCH] [--installation-dir DIR] [--deepgemm-ref REF] [--flashinf-ref REF] [--lmcache-ref REF] [--torch-cuda-arch-list LIST] [--cuda-version VERSION]"
    echo "Options:"
    echo "  --vllm-ref REF      vLLM release version (default: ${VLLM_REF})"
    echo "  --max-jobs NUM      Maximum parallel jobs (default: ${MAX_JOBS})"
    echo "  --arch ARCH         Architecture amd64|arm64 (default: auto-detect)"
    echo "  --installation-dir DIR  Install directory (default: ${INSTALLATION_DIR})"
    echo "  --deepgemm-ref REF  DeepGEMM git ref (default: ${DEEPGEMM_REF})"
    echo "  --flashinf-ref REF  FlashInfer version (default: ${FLASHINF_REF})"
    echo "  --lmcache-ref REF   LMCache version (default: ${LMCACHE_REF})"
    echo "  --torch-cuda-arch-list LIST  CUDA architectures (default: ${TORCH_CUDA_ARCH_LIST})"
    echo "  --cuda-version VERSION  CUDA version (default: ${CUDA_VERSION})"
}

# Exit with a readable error when a value-taking option is the last argument.
# Without this check, "$2" would trip `set -u` with an opaque
# "unbound variable" message.
# Arguments: the remaining command line, starting at the option itself.
require_value() {
    if [[ $# -lt 2 ]]; then
        echo "Missing value for option: $1" >&2
        exit 1
    fi
}

# Parse command-line options into the global configuration variables.
# Arguments: the script's command line ("$@").
# Exits 0 after printing help; exits 1 on an unknown option or missing value.
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --vllm-ref)
                require_value "$@"
                VLLM_REF="$2"
                # Keep the bare version in sync with the ref: the CUDA 13
                # wheel URL is built from VLLM_VER, not VLLM_REF.
                VLLM_VER="${VLLM_REF#v}"
                shift 2
                ;;
            --max-jobs)
                require_value "$@"
                MAX_JOBS="$2"
                shift 2
                ;;
            --arch)
                require_value "$@"
                ARCH="$2"
                shift 2
                ;;
            --installation-dir)
                require_value "$@"
                INSTALLATION_DIR="$2"
                shift 2
                ;;
            --deepgemm-ref)
                require_value "$@"
                DEEPGEMM_REF="$2"
                shift 2
                ;;
            --flashinf-ref)
                require_value "$@"
                FLASHINF_REF="$2"
                shift 2
                ;;
            --lmcache-ref)
                require_value "$@"
                LMCACHE_REF="$2"
                shift 2
                ;;
            --torch-cuda-arch-list)
                require_value "$@"
                TORCH_CUDA_ARCH_LIST="$2"
                shift 2
                ;;
            --cuda-version)
                require_value "$@"
                CUDA_VERSION="$2"
                shift 2
                ;;
            -h|--help)
                usage
                exit 0
                ;;
            *)
                echo "Unknown option: $1" >&2
                exit 1
                ;;
        esac
    done
}

parse_args "$@"

# Normalize the CPU architecture names.
#   ARCH     -> Docker-style name (amd64 / arm64), for consistency with Docker ARG
#   ALT_ARCH -> kernel-style name (x86_64 / aarch64)
# Either spelling is accepted as input. For any other value ARCH is left
# untouched and ALT_ARCH mirrors it, so ALT_ARCH is always defined and later
# uses do not fail under `set -u`; a warning is printed to stderr.
normalize_arch() {
    case "${ARCH:-}" in
        x86_64|amd64)
            ARCH="amd64"
            ALT_ARCH="x86_64"
            ;;
        aarch64|arm64)
            ARCH="arm64"
            ALT_ARCH="aarch64"
            ;;
        *)
            echo "⚠ Unrecognized architecture '${ARCH:-}' (expected amd64 or arm64)" >&2
            ALT_ARCH="${ARCH:-}"
            ;;
    esac
}

normalize_arch

# Export build settings so child processes inherit them.
export MAX_JOBS
export CUDA_HOME=/usr/local/cuda

# Derive torch backend from CUDA version (e.g., "12.9" -> "cu129").
# Parameter expansion strips the dots without spawning echo | tr.
TORCH_BACKEND="cu${CUDA_VERSION//./}"
# Major version only (e.g., "12.9" -> "12"); selects the install path below.
CUDA_VERSION_MAJOR="${CUDA_VERSION%%.*}"

echo "=== Installing prerequisites ==="
uv pip install pip cuda-python

# printf is used for the banner: bash's builtin echo prints "\n" literally
# unless -e (or xpg_echo) is in effect.
printf '\n=== Configuration Summary ===\n'
echo "  VLLM_REF=$VLLM_REF | ARCH=$ARCH | CUDA_VERSION=$CUDA_VERSION | TORCH_BACKEND=$TORCH_BACKEND"
echo "  TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST | INSTALLATION_DIR=$INSTALLATION_DIR"

# Finish the configuration summary and install LMCache where applicable.
# LMCache is installed only for CUDA 12 on amd64; every other combination is
# skipped with a message.
if [[ "$CUDA_VERSION_MAJOR" == "12" ]]; then
    echo "  FLASHINF_REF=$FLASHINF_REF | LMCACHE_REF=$LMCACHE_REF | DEEPGEMM_REF=$DEEPGEMM_REF"
    # printf: bash's builtin echo would print the leading "\n" literally.
    printf '\n=== Installing LMCache ===\n'
    if [[ "$ARCH" == "amd64" ]]; then
        # LMCache installation currently fails on arm64 due to CUDA dependency issues
        # Install LMCache BEFORE vLLM so vLLM's dependencies take precedence
        uv pip install "lmcache==${LMCACHE_REF}" --torch-backend="${TORCH_BACKEND}"
        echo "✓ LMCache ${LMCACHE_REF} installed"
    else
        echo "⚠ Skipping LMCache on ARM64 (compatibility issues)"
    fi
else
    echo "  FLASHINF_REF=$FLASHINF_REF | LMCache will not be installed as it doesn't support CUDA 13 yet | DEEPGEMM_REF=$DEEPGEMM_REF"
fi

# printf: bash's builtin echo would print the leading "\n" literally.
printf '\n=== Cloning vLLM repository ===\n'
# Clone needed for DeepGEMM and EP kernels install scripts
# Paths and refs are quoted so values containing spaces or glob characters
# are passed through intact.
cd "$INSTALLATION_DIR"
git clone https://github.com/vllm-project/vllm.git vllm
cd vllm
git checkout "$VLLM_REF"
echo "✓ vLLM repository cloned"

# Install vLLM for the selected CUDA major version, then FlashInfer.
# CUDA 12 installs from PyPI; CUDA 13 installs the release wheel from GitHub;
# any other major version aborts. Requirement specifiers and URLs are quoted
# because "[flashinfer,runai]" is otherwise a shell glob pattern.
# printf: bash's builtin echo would print the leading "\n" literally.
printf '\n=== Installing vLLM & FlashInfer ===\n'
case "$CUDA_VERSION_MAJOR" in
    12)
        echo "Installing vLLM $VLLM_REF from PyPI..."
        uv pip install "vllm[flashinfer,runai]==${VLLM_REF}" --torch-backend="${TORCH_BACKEND}"
        ;;
    13)
        echo "⚠ Skipping LMCache on CUDA 13 env since LMCache doesn't support CUDA 13 "
        echo "Installing vLLM $VLLM_REF from GitHub since CUDA 13 x86_64 wheel is only present on GitHub..."
        uv pip install \
            --index-strategy=unsafe-best-match \
            --extra-index-url "https://download.pytorch.org/whl/${TORCH_BACKEND}" \
            "https://github.com/vllm-project/vllm/releases/download/v${VLLM_VER}/vllm-${VLLM_VER}+${TORCH_BACKEND}-cp38-abi3-manylinux_2_35_${ALT_ARCH}.whl[flashinfer,runai]" \
            --torch-backend="${TORCH_BACKEND}"
        ;;
    *)
        echo "❌ Unsupported CUDA version for vLLM installation: ${CUDA_VERSION}" >&2
        exit 1
        ;;
esac

# FlashInfer packages are the same for every supported CUDA version, so they
# are installed once here rather than in each branch.
uv pip install "flashinfer-cubin==${FLASHINF_REF}"
uv pip install "flashinfer-jit-cache==${FLASHINF_REF}" --extra-index-url "https://flashinfer.ai/whl/${TORCH_BACKEND}"
echo "✓ vLLM installation completed"

# Run vLLM's DeepGEMM install script from the cloned repository.
# printf: bash's builtin echo would print the leading "\n" literally.
printf '\n=== Installing DeepGEMM ===\n'
cd "$INSTALLATION_DIR/vllm/tools"

# Build the argument list once; --ref is passed only when a DeepGEMM ref was
# configured (DEEPGEMM_REF is empty by default).
deepgemm_args=(--cuda-version "${CUDA_VERSION}")
if [[ -n "$DEEPGEMM_REF" ]]; then
    deepgemm_args+=(--ref "$DEEPGEMM_REF")
fi
bash install_deepgemm.sh "${deepgemm_args[@]}"
echo "✓ DeepGEMM installation completed"

# Run vLLM's EP kernels install script from the cloned repository.
# printf: bash's builtin echo would print the leading "\n" literally.
printf '\n=== Installing EP Kernels (PPLX and DeepEP) ===\n'
# Absolute path, so this step does not depend on the working directory left
# behind by the previous step.
cd "$INSTALLATION_DIR/vllm/tools/ep_kernels"
# TODO we will be able to specify which pplx and deepep commit we want in future
TORCH_CUDA_ARCH_LIST="$TORCH_CUDA_ARCH_LIST" bash install_python_libraries.sh

printf '\n✅ All installations completed successfully!\n'