# The vLLM Dockerfile builds a vLLM image that can be used directly
# to run the OpenAI-compatible server.

#################### BASE BUILD IMAGE ####################
# prepare basic build environment
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev

# Install pip and git. The apt package index is removed in the same layer
# so it is not persisted in the image; later stages that need apt must run
# `apt-get update` themselves.
RUN apt-get update -y \
    && apt-get install -y python3-pip git \
    && rm -rf /var/lib/apt/lists/*

# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-12.1/compat/

WORKDIR /workspace

# install build and runtime dependencies
# Only requirements-cuda.txt is passed to pip; requirements-common.txt is
# copied next to it (presumably pulled in via `-r` from the cuda file --
# confirm against the requirements files).
COPY requirements-common.txt requirements-common.txt
COPY requirements-cuda.txt requirements-cuda.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r requirements-cuda.txt

# install development dependencies
COPY requirements-dev.txt requirements-dev.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r requirements-dev.txt

# cuda arch list used by torch
# can be useful for both `dev` and `test`
# explicitly set the list to avoid issues with torch 2.2
# see https://github.com/pytorch/pytorch/pull/123243
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
#################### BASE BUILD IMAGE ####################

#################### WHEEL BUILD IMAGE ####################
# Builds the vLLM wheel into /workspace/dist on top of the `dev` stage.
FROM dev AS build

# install build dependencies
COPY requirements-build.txt requirements-build.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r requirements-build.txt

# install compiler cache to speed up compilation leveraging local or remote caching
RUN apt-get update -y \
    && apt-get install -y ccache \
    && rm -rf /var/lib/apt/lists/*

# files and directories related to build wheels
COPY csrc csrc
COPY setup.py setup.py
COPY cmake cmake
COPY CMakeLists.txt CMakeLists.txt
COPY requirements-common.txt requirements-common.txt
COPY requirements-cuda.txt requirements-cuda.txt
COPY pyproject.toml pyproject.toml
COPY vllm vllm

# max jobs used by Ninja to build extensions
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
# number of threads used by nvcc
ARG nvcc_threads=8
ENV NVCC_THREADS=${nvcc_threads}
# make sure punica kernels are built (for LoRA)
ENV VLLM_INSTALL_PUNICA_KERNELS=1

# Point ccache at the cache mount below so compiled objects survive
# between builds.
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=cache,target=/root/.cache/pip \
    python3 setup.py bdist_wheel --dist-dir=dist

# The `vllm_nccl` package must be installed from its source distribution.
# pip stores the wheel it builds from that sdist in its cache, and other CI
# jobs sharing this cache would then install the cached wheel directly,
# which is not what we want, so it is removed from the cache manually.
# The pattern is quoted so the shell hands it to pip unchanged instead of
# expanding it against files in the working directory.
RUN --mount=type=cache,target=/root/.cache/pip \
    pip cache remove "vllm_nccl*"
#################### WHEEL BUILD IMAGE ####################

#################### FLASH_ATTENTION Build IMAGE ####################
# Produces a flash-attn wheel in /usr/src/flash-attention-v2 for the
# runtime image to install.
FROM dev AS flash-attn-builder
# max jobs used for build
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
# flash attention version
ARG flash_attn_version=v2.5.8
ENV FLASH_ATTN_VERSION=${flash_attn_version}

# `pip wheel` writes its output into the current directory, so the wheel
# lands here.
WORKDIR /usr/src/flash-attention-v2

# Download the wheel or build it if a pre-compiled release doesn't exist
RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \
    --no-build-isolation --no-deps --no-cache-dir

#################### FLASH_ATTENTION Build IMAGE ####################

#################### vLLM installation IMAGE ####################
# image with vLLM installed
FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS vllm-base
WORKDIR /vllm-workspace

# The apt package index is removed in the same layer so it does not ship in
# the runtime image.
# NOTE(review): `--no-install-recommends` is intentionally not added here;
# packages recommended by python3-pip may be relied on at runtime -- confirm
# before trimming.
RUN apt-get update -y \
    && apt-get install -y python3-pip git vim \
    && rm -rf /var/lib/apt/lists/*

# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-12.1/compat/

# install vllm wheel first, so that torch etc will be installed
# The wheel is bind-mounted from the `build` stage, so it is never copied
# into an image layer.
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
    --mount=type=cache,target=/root/.cache/pip \
    pip install dist/*.whl --verbose

# Install the flash-attn wheel produced by the `flash-attn-builder` stage.
# pip runs with --no-cache-dir, so no pip cache mount is attached.
RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
    pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
#################### vLLM installation IMAGE ####################


#################### TEST IMAGE ####################
# image to run unit testing suite
# note that this uses vllm installed by `pip`
FROM vllm-base AS test

# Plain copy of the build context; COPY is used because none of ADD's extra
# behaviors (archive extraction, remote URLs) are needed.
COPY . /vllm-workspace/

# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r requirements-dev.txt

# doc requires source code
# we hide them inside `test_docs/` , so that this source code
# will not be imported by other tests
# Done as a single step so the move produces one layer instead of three.
RUN mkdir test_docs \
    && mv docs vllm test_docs/

#################### TEST IMAGE ####################

#################### OPENAI API SERVER ####################
# openai api server alternative
FROM vllm-base AS vllm-openai

# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install accelerate hf_transfer modelscope

# Marks containers started from this image with a usage-source value
# (presumably read by vLLM's usage reporting -- confirm in the vllm package).
ENV VLLM_USAGE_SOURCE=production-docker-image

# Exec form, so the server is PID 1 and arguments given to `docker run` are
# appended to this command.
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
#################### OPENAI API SERVER ####################