"vllm/entrypoints/openai/responses/context.py" did not exist on "0d0c929f2360cde5bae6817ad0f555641329e79d"
install_vllm.sh 8.95 KB
Newer Older
1
2
3
#!/usr/bin/env bash
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
4
5
6
7
8
9
10
11
12

# This script installs vLLM and its dependencies.
# When VLLM_REF is a release tag (starts with "v") and the architecture is amd64,
# vLLM is installed from PyPI with pip.
# Otherwise vLLM is built from the git checkout of the vLLM source code.
# The components are installed in the following order:
# 1. vLLM (together with FlashInfer)
# 2. LMCache
# 3. DeepGEMM
# 4. EP kernels
13
14
15

# Strict mode: abort on the first failing command (-e), on expansion of an
# unset variable (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

16
17
18
19
# ---------------------------------------------------------------------------
# Default configuration. Every value here has a matching command-line flag.
# ---------------------------------------------------------------------------

# vLLM git reference (tag, branch or commit) to install.
VLLM_REF='v0.10.2'

# Settings that apply when vLLM is installed from source code.
VLLM_GIT_URL='https://github.com/vllm-project/vllm.git'
EDITABLE='true'
FLASHINF_REF='v0.3.0'

# Build environment.
ARCH="$(uname -m)"
INSTALLATION_DIR='/tmp'
MAX_JOBS='16'

# Torch / CUDA settings for vLLM and its dependencies.
TORCH_BACKEND='cu128'
CUDA_VERSION='12.8'              # For DEEPGEMM
TORCH_CUDA_ARCH_LIST='9.0;10.0'  # For EP Kernels
DEEPGEMM_REF=''                  # Empty: no --ref is passed to the DeepGEMM installer
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47

# Print the command-line help to stdout. The defaults shown are the current
# values of the configuration variables at the time of the call.
print_usage() {
    echo "Usage: $0 [--editable|--no-editable] [--vllm-ref REF] [--vllm-git-url URL] [--max-jobs NUM] [--arch ARCH] [--installation-dir DIR] [--deepgemm-ref REF] [--flashinf-ref REF] [--torch-backend BACKEND] [--torch-cuda-arch-list LIST] [--cuda-version VERSION]"
    echo "Options:"
    echo "  --editable        Install vllm in editable mode (default)"
    echo "  --no-editable     Install vllm in non-editable mode"
    echo "  --vllm-ref REF    Git reference to checkout (default: ${VLLM_REF})"
    echo "  --vllm-git-url URL  Git URL to clone vllm from (default: ${VLLM_GIT_URL})"
    echo "  --max-jobs NUM    Maximum number of parallel jobs (default: ${MAX_JOBS})"
    echo "  --arch ARCH       Architecture (amd64|arm64, default: auto-detect)"
    echo "  --installation-dir DIR  Directory to install vllm (default: ${INSTALLATION_DIR})"
    echo "  --deepgemm-ref REF  Git reference for DeepGEMM (default: ${DEEPGEMM_REF})"
    echo "  --flashinf-ref REF  Git reference for Flash Infer (default: ${FLASHINF_REF})"
    echo "  --torch-backend BACKEND  Torch backend to use (default: ${TORCH_BACKEND})"
    echo "  --torch-cuda-arch-list LIST  CUDA architectures to compile for (default: ${TORCH_CUDA_ARCH_LIST})"
    echo "  --cuda-version VERSION  CUDA version to use (default: ${CUDA_VERSION})"
}

# Exit with a readable error when a value-taking option is the last word on
# the command line. Without this check "$2" would trip `set -u` with an
# opaque "unbound variable" message.
# Arguments: the remaining command-line words, with the option itself first.
require_value() {
    if [[ $# -lt 2 ]]; then
        echo "Error: option $1 requires a value" >&2
        exit 1
    fi
}

# Parse the command-line flags and store the results in the global
# configuration variables (EDITABLE, VLLM_REF, MAX_JOBS, ...).
# Arguments: the script's positional parameters.
# Exits 0 after printing help; exits 1 on an unknown option or missing value.
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --editable)
                EDITABLE=true
                shift
                ;;
            --no-editable)
                EDITABLE=false
                shift
                ;;
            --vllm-ref)
                require_value "$@"
                VLLM_REF="$2"
                shift 2
                ;;
            --vllm-git-url)
                require_value "$@"
                VLLM_GIT_URL="$2"
                shift 2
                ;;
            --max-jobs)
                require_value "$@"
                MAX_JOBS="$2"
                shift 2
                ;;
            --arch)
                require_value "$@"
                ARCH="$2"
                shift 2
                ;;
            --installation-dir)
                require_value "$@"
                INSTALLATION_DIR="$2"
                shift 2
                ;;
            --deepgemm-ref)
                require_value "$@"
                DEEPGEMM_REF="$2"
                shift 2
                ;;
            --flashinf-ref)
                require_value "$@"
                FLASHINF_REF="$2"
                shift 2
                ;;
            --torch-backend)
                require_value "$@"
                TORCH_BACKEND="$2"
                shift 2
                ;;
            --torch-cuda-arch-list)
                require_value "$@"
                TORCH_CUDA_ARCH_LIST="$2"
                shift 2
                ;;
            --cuda-version)
                require_value "$@"
                CUDA_VERSION="$2"
                shift 2
                ;;
            -h|--help)
                print_usage
                exit 0
                ;;
            *)
                echo "Unknown option: $1" >&2
                exit 1
                ;;
        esac
    done
}

parse_args "$@"

107
108
109
110
111
112
113
# Map the `uname -m` spellings onto the amd64/arm64 names (kept consistent
# with the Docker ARG); any other value is left untouched.
case "$ARCH" in
    x86_64)  ARCH="amd64" ;;
    aarch64) ARCH="arm64" ;;
esac

114
115
116
# Expose the build parallelism and the CUDA install root to child processes.
export MAX_JOBS="$MAX_JOBS"
export CUDA_HOME=/usr/local/cuda

echo "=== Installing prerequisites ==="
uv pip install pip cuda-python

# printf is used for the banner: bash's builtin echo does not expand "\n"
# (without -e), so the original printed a literal backslash-n.
printf '\n%s\n' "=== Configuration Summary ==="
echo "  VLLM_REF=$VLLM_REF | EDITABLE=$EDITABLE | ARCH=$ARCH"
echo "  MAX_JOBS=$MAX_JOBS | TORCH_BACKEND=$TORCH_BACKEND | CUDA_VERSION=$CUDA_VERSION"
echo "  TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST"
echo "  DEEPGEMM_REF=$DEEPGEMM_REF | FLASHINF_REF=$FLASHINF_REF"
echo "  INSTALLATION_DIR=$INSTALLATION_DIR | VLLM_GIT_URL=$VLLM_GIT_URL"
126

127
128
# printf instead of echo so the leading "\n" is emitted as a real newline.
printf '\n%s\n' "=== Cloning vLLM repository ==="
# We need to clone to install dependencies
# Expansions are quoted so paths/URLs/refs with spaces or glob characters
# are passed through as a single, literal argument.
cd "$INSTALLATION_DIR"
git clone "$VLLM_GIT_URL" vllm
cd vllm
git checkout "$VLLM_REF"

# TODO remove in future vLLM release, re-instate ignore torch script
# https://github.com/vllm-project/vllm/pull/24729
# The committer identity is supplied inline for the cherry-pick commit.
GIT_COMMITTER_NAME="Container Build" GIT_COMMITTER_EMAIL="container@buildkitsandbox.local" git cherry-pick 740f064


# Banners use printf because bash's builtin echo prints "\n" literally.
printf '\n%s\n' "=== Installing vLLM & FlashInfer ==="

if [[ $VLLM_REF =~ ^v ]] && [ "$ARCH" = "amd64" ]; then
    # VLLM_REF starts with 'v' and arch is amd64 - use pip install with version tag
    echo "Installing vLLM $VLLM_REF from PyPI..."

    # Quoted so the shell never treats "[flashinfer]" as a glob pattern.
    uv pip install "vllm[flashinfer]==${VLLM_REF}" --torch-backend="$TORCH_BACKEND"

else
    # VLLM_REF does not start with 'v' or arch is not amd64 - build from the
    # git checkout in the current directory.
    if [ "$ARCH" = "arm64" ]; then

        # torch 2.8.0 doesn't have a aarch wheel for cu128, vLLM uses torch 2.8.0 nightly wheel builds to compile its aarch wheel against
        # nightly can be unstable so we will not use it here
        # for now we will use torch 2.7.1+cu128 but this requires a recompilation from source

        echo "Building vLLM from source for ARM64 architecture..."

        # Install the pinned PyTorch release first; everything below builds against it.
        echo "Attempting to install pinned PyTorch versions..."
        if ! uv pip install torch==2.7.1+cu128 torchaudio==2.7.1 torchvision==0.22.1 --index-url https://download.pytorch.org/whl/cu128; then
            echo "Pinned versions failed" >&2
            exit 1
        fi

        # Create constraints file to pin all PyTorch-related versions
        echo "Creating constraints file to preserve PyTorch ecosystem versions..."
        TORCH_VERSION=$(python -c "import torch; print(torch.__version__)")
        TORCHAUDIO_VERSION=$(python -c "import torchaudio; print(torchaudio.__version__)")
        TORCHVISION_VERSION=$(python -c "import torchvision; print(torchvision.__version__)")

        # Path kept as in the original script; it is reused by every install below.
        TORCH_CONSTRAINTS=/tmp/torch_constraints.txt
        rm -rf "$TORCH_CONSTRAINTS"
        printf '%s\n' \
            "torch==$TORCH_VERSION" \
            "torchaudio==$TORCHAUDIO_VERSION" \
            "torchvision==$TORCHVISION_VERSION" > "$TORCH_CONSTRAINTS"

        echo "Pinned versions:"
        echo "  - torch==$TORCH_VERSION"
        echo "  - torchaudio==$TORCHAUDIO_VERSION"
        echo "  - torchvision==$TORCHVISION_VERSION"

        # Helper script from the vLLM checkout.
        # NOTE(review): presumably removes torch pins from vLLM's requirement
        # files so the torch installed above is reused - confirm against vLLM.
        python use_existing_torch.py
        uv pip install -c "$TORCH_CONSTRAINTS" -r requirements/build.txt

        if [ "$EDITABLE" = "true" ]; then
            MAX_JOBS=${MAX_JOBS} uv pip install --no-build-isolation -c "$TORCH_CONSTRAINTS" -e . -v
        else
            MAX_JOBS=${MAX_JOBS} uv pip install --no-build-isolation -c "$TORCH_CONSTRAINTS" . -v
        fi

        printf '\n%s\n' "=== Installing FlashInfer from source ==="
        cd "$INSTALLATION_DIR"
        git clone https://github.com/flashinfer-ai/flashinfer.git --recursive
        cd flashinfer
        git checkout "$FLASHINF_REF"

        # Install with constraints to prevent PyTorch upgrade
        uv pip install -v --no-build-isolation -c "$TORCH_CONSTRAINTS" .

    else
        echo "Building vLLM from source for AMD64 architecture..."

        # When updating above VLLM_REF make sure precompiled wheel file URL is correct. Run this command:
        # aws s3 ls s3://vllm-wheels/${VLLM_REF}/ --region us-west-2 --no-sign-request
        # NOTE: the wheel file name below hard-codes version 0.10.2.
        export VLLM_PRECOMPILED_WHEEL_LOCATION="https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_REF}/vllm-0.10.2-cp38-abi3-manylinux1_x86_64.whl"

        if [ "$EDITABLE" = "true" ]; then
            uv pip install -e . --torch-backend="$TORCH_BACKEND"
        else
            uv pip install . --torch-backend="$TORCH_BACKEND"
        fi

        printf '\n%s\n' "=== Installing FlashInfer from PyPI ==="
        uv pip install "flashinfer-python==${FLASHINF_REF}"

    fi
fi

echo "✓ vLLM installation completed"
218

219
220
221
222
223
# printf instead of echo so the leading "\n" is emitted as a real newline.
printf '\n%s\n' "=== Installing LMCache ==="
if [ "$ARCH" = "amd64" ]; then
    # LMCache installation currently fails on arm64 due to CUDA dependency issues:
    # OSError: CUDA_HOME environment variable is not set. Please set it to your CUDA install root.
    # TODO: Re-enable for arm64 after verifying lmcache compatibility and resolving the build issue.

    # Alec: Likely lmcache was compiled with a different version of torch and
    # needs to be installed from source for arm64
    uv pip install lmcache==0.3.3
    echo "✓ LMCache installed"
else
    # Reached for every architecture other than amd64.
    echo "⚠ Skipping LMCache on ARM64 (compatibility issues)"
fi
231

232
233
# Banners use printf because bash's builtin echo prints "\n" literally.
printf '\n%s\n' "=== Installing DeepGEMM ==="
# The installer scripts live in the tools/ directory of the vLLM checkout.
cd "$INSTALLATION_DIR/vllm/tools"

# Only pass --ref when a DeepGEMM reference was requested; otherwise the
# installer is run without one.
if [ -n "$DEEPGEMM_REF" ]; then
    bash install_deepgemm.sh --cuda-version "${CUDA_VERSION}" --ref "$DEEPGEMM_REF"
else
    bash install_deepgemm.sh --cuda-version "${CUDA_VERSION}"
fi
echo "✓ DeepGEMM installation completed"

printf '\n%s\n' "=== Installing EP Kernels (PPLX and DeepEP) ==="
cd ep_kernels/
# The arch list is handed to the installer through its environment.
TORCH_CUDA_ARCH_LIST="$TORCH_CUDA_ARCH_LIST" bash install_python_libraries.sh

printf '\n%s\n' "✅ All installations completed successfully!"