# The vLLM Dockerfile is used to construct the vLLM image, which can be used
# directly to run the OpenAI-compatible server.

# Please mirror any changes made here in
# docs/source/dev/dockerfile/dockerfile.rst and
# docs/source/assets/dev/dockerfile-stages-dependency.png

#################### BASE BUILD IMAGE ####################
# Stage `dev`: basic build environment on top of the CUDA devel image.
# Later stages (`build`, `flash-attn-builder`) start FROM this one.
FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS dev

# Install pip and git. The apt package index is removed in the same layer so
# it does not persist in the image; any later `apt-get install` must run
# `apt-get update` first.
RUN apt-get update -y \
    && apt-get install -y python3-pip git \
    && rm -rf /var/lib/apt/lists/*

# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-12.4/compat/

WORKDIR /workspace

# install build and runtime dependencies
# (requirements files are copied on their own, before any source code, so the
# pip layers stay cached until the requirements themselves change)
COPY requirements-common.txt requirements-common.txt
COPY requirements-cuda.txt requirements-cuda.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r requirements-cuda.txt

# install development dependencies
COPY requirements-dev.txt requirements-dev.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r requirements-dev.txt

# cuda arch list used by torch
# can be useful for both `dev` and `test`
# explicitly set the list to avoid issues with torch 2.2
# see https://github.com/pytorch/pytorch/pull/123243
# Override at build time with `--build-arg torch_cuda_arch_list=...`.
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
#################### BASE BUILD IMAGE ####################

#################### WHEEL BUILD IMAGE ####################
# Stage `build`: compiles the vLLM wheel into /workspace/dist.
FROM dev AS build

# install build dependencies
COPY requirements-build.txt requirements-build.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r requirements-build.txt

# install compiler cache to speed up compilation leveraging local or remote caching
# (update + install + index cleanup are kept in a single layer)
RUN apt-get update -y \
    && apt-get install -y ccache \
    && rm -rf /var/lib/apt/lists/*

# files and directories related to build wheels
COPY csrc csrc
COPY setup.py setup.py
COPY cmake cmake
COPY CMakeLists.txt CMakeLists.txt
COPY requirements-common.txt requirements-common.txt
COPY requirements-cuda.txt requirements-cuda.txt
COPY pyproject.toml pyproject.toml
COPY vllm vllm

# max jobs used by Ninja to build extensions
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
# number of threads used by nvcc
ARG nvcc_threads=8
ENV NVCC_THREADS=${nvcc_threads}
# make sure punica kernels are built (for LoRA)
ENV VLLM_INSTALL_PUNICA_KERNELS=1

# Build the wheel. CCACHE_DIR points at a BuildKit cache mount, so compiler
# cache entries are kept on the build host between builds instead of being
# written into an image layer.
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=cache,target=/root/.cache/pip \
    python3 setup.py bdist_wheel --dist-dir=dist

# check the size of the wheel, we cannot upload wheels larger than 100MB
COPY .buildkite/check-wheel-size.py check-wheel-size.py
RUN python3 check-wheel-size.py dist

# the `vllm_nccl` package must be installed from source distribution
# pip is too smart to store a wheel in the cache, and other CI jobs
# will directly use the wheel from the cache, which is not what we want.
# we need to remove it manually
# (the pattern is quoted so that pip, not the shell, expands the wildcard)
RUN --mount=type=cache,target=/root/.cache/pip \
    pip cache remove "vllm_nccl*"
#################### WHEEL BUILD IMAGE ####################

#################### FLASH_ATTENTION Build IMAGE ####################
# Stage `flash-attn-builder`: produces a flash-attn wheel in
# /usr/src/flash-attention-v2 for a later stage to install.
FROM dev AS flash-attn-builder
# max jobs used for build
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
# flash attention version
ARG flash_attn_version=v2.5.8
ENV FLASH_ATTN_VERSION=${flash_attn_version}

WORKDIR /usr/src/flash-attention-v2

# Download the wheel or build it if a pre-compiled release doesn't exist
# `--no-deps` restricts the output to the flash-attn wheel itself; the wheel
# is written to the current WORKDIR.
RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \
    --no-build-isolation --no-deps --no-cache-dir

#################### FLASH_ATTENTION Build IMAGE ####################

#################### vLLM installation IMAGE ####################
# image with vLLM installed
# Stage `vllm-base`: starts from the smaller CUDA `base` image and installs
# the wheels produced by the `build` and `flash-attn-builder` stages.
FROM nvidia/cuda:12.4.1-base-ubuntu22.04 AS vllm-base
WORKDIR /vllm-workspace

# The apt package index is removed in the same layer so it does not persist
# in the runtime image.
RUN apt-get update -y \
    && apt-get install -y python3-pip git vim \
    && rm -rf /var/lib/apt/lists/*

# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-12.4/compat/

# install vllm wheel first, so that torch etc will be installed
# (the wheel directory is bind-mounted from the `build` stage, so the wheel
# file itself is never copied into a layer of this image)
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
    --mount=type=cache,target=/root/.cache/pip \
    pip install dist/*.whl --verbose

# install the flash-attn wheel, bind-mounted from the `flash-attn-builder` stage
RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
    --mount=type=cache,target=/root/.cache/pip \
    pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
#################### vLLM installation IMAGE ####################

#################### TEST IMAGE ####################
# image to run unit testing suite
# note that this uses vllm installed by `pip`
FROM vllm-base AS test

# Copy the whole build context into the workspace. COPY is used rather than
# ADD because this is a plain local copy (no archive extraction or URL fetch).
COPY . /vllm-workspace/

# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r requirements-dev.txt

# doc requires source code
# we hide them inside `test_docs/` , so that this source code
# will not be imported by other tests
# (done in a single RUN: creating the directory and moving both trees is one
# conceptual step)
RUN mkdir test_docs \
    && mv docs vllm test_docs/

#################### TEST IMAGE ####################

#################### OPENAI API SERVER ####################
# openai api server alternative
# Stage `vllm-openai`: `vllm-base` plus extra packages, with the OpenAI API
# server module as the container entrypoint.
FROM vllm-base AS vllm-openai

# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install accelerate hf_transfer modelscope

# `key=value` form; the space-separated `ENV key value` form is legacy.
ENV VLLM_USAGE_SOURCE=production-docker-image

# Exec (JSON-array) form: no wrapping shell, and arguments given to
# `docker run` are appended to this command.
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
#################### OPENAI API SERVER ####################