Dockerfile 4.18 KB
Newer Older
Simon Mo's avatar
Simon Mo committed
1
2
3
4
# The vLLM Dockerfile is used to construct a vLLM image that can be directly used
# to run the OpenAI-compatible server.

#################### BASE BUILD IMAGE ####################
5
# Development base: CUDA 12.1.0 "devel" image on Ubuntu 22.04; later stages refer to it as `dev`.
FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev
Stephen Krider's avatar
Stephen Krider committed
6

Simon Mo's avatar
Simon Mo committed
7
8
9
10
11
12
13
14
15
# Keep apt/dpkg from stopping the build to ask questions.
ENV DEBIAN_FRONTEND=noninteractive

# Preseed tzdata with US Central Time so installing it never prompts.
# (The zone itself is arbitrary; the build just needs some answer.)
RUN printf '%s\n' \
        'tzdata tzdata/Areas select America' \
        'tzdata tzdata/Zones/America select Chicago' \
    | debconf-set-selections

# We install an older version of python here for testing to make sure vllm works with older versions of Python.
# For the actual openai compatible server, we will use the latest version of Python.
Stephen Krider's avatar
Stephen Krider committed
16
# Install Python 3.8 from the deadsnakes PPA and register it as the `python3` alternative.
# The apt package indexes are deleted in the same layer that downloaded them, so they
# never end up stored in the image.
RUN apt-get update -y \
    && apt-get install -y software-properties-common \
    && add-apt-repository ppa:deadsnakes/ppa -y \
    && apt-get update -y \
    && apt-get install -y python3.8 python3.8-dev python3.8-venv python3-pip git \
    && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 \
    && rm -rf /var/lib/apt/lists/*
Stephen Krider's avatar
Stephen Krider committed
22

23
24
25
26
27
28
# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
# Runs ldconfig over the CUDA 12.1 compat directory; the path embeds the CUDA
# version, so it must be kept in sync with the base image tag.
RUN ldconfig /usr/local/cuda-12.1/compat/

Stephen Krider's avatar
Stephen Krider committed
29
30
31
32
33
34
# Relative paths in the instructions below resolve against /workspace.
WORKDIR /workspace

# Install the build and runtime dependencies.
# pip's cache directory is a BuildKit cache mount, so it is reused between builds
# without being written into an image layer.
COPY requirements.txt ./
RUN --mount=type=cache,target=/root/.cache/pip pip install --requirement requirements.txt
35

Stephen Krider's avatar
Stephen Krider committed
36
37
38
39
# Install the development dependencies (pip cache kept in a BuildKit cache mount).
COPY requirements-dev.txt ./
RUN --mount=type=cache,target=/root/.cache/pip pip install --requirement requirements-dev.txt
Simon Mo's avatar
Simon Mo committed
40
41
#################### BASE BUILD IMAGE ####################

Stephen Krider's avatar
Stephen Krider committed
42

Simon Mo's avatar
Simon Mo committed
43
#################### EXTENSION BUILD IMAGE ####################
Stephen Krider's avatar
Stephen Krider committed
44
45
# Build stage: starts from `dev`, so it inherits everything installed in that stage.
FROM dev AS build

46
47
48
49
50
# Install the packages needed only for compiling the extensions
# (pip cache kept in a BuildKit cache mount).
COPY requirements-build.txt ./
RUN --mount=type=cache,target=/root/.cache/pip pip install --requirement requirements-build.txt

Stephen Krider's avatar
Stephen Krider committed
51
52
53
54
55
56
57
# Copy only the inputs of the extension build: the C/CUDA sources, the packaging
# metadata, and the package's __init__.py.
COPY csrc csrc
COPY setup.py requirements.txt pyproject.toml ./
COPY vllm/__init__.py vllm/

Simon Mo's avatar
Simon Mo committed
58
# cuda arch list used by torch
59
60
# Override at build time with --build-arg torch_cuda_arch_list="<archs>"; the value is
# exported to the environment of the instructions that follow.
ARG torch_cuda_arch_list="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
ENV TORCH_CUDA_ARCH_LIST="${torch_cuda_arch_list}"
Stephen Krider's avatar
Stephen Krider committed
61
# max jobs used by Ninja to build extensions
62
63
# Override at build time with --build-arg max_jobs=<n>.
ARG max_jobs=2
ENV MAX_JOBS="${max_jobs}"
64
65
66
# Thread count handed to nvcc; override with --build-arg nvcc_threads=<n>.
ARG nvcc_threads=8
ENV NVCC_THREADS=${nvcc_threads}
67
68
# make sure punica kernels are built (for LoRA)
# NOTE(review): presumably read by setup.py during the build_ext step below -- confirm.
ENV VLLM_INSTALL_PUNICA_KERNELS=1
69

Stephen Krider's avatar
Stephen Krider committed
70
# Compile the extensions in place, so the resulting shared objects sit next to the
# Python sources inside /workspace.
RUN python3 setup.py build_ext --inplace
Simon Mo's avatar
Simon Mo committed
71
#################### EXTENSION BUILD IMAGE ####################
Stephen Krider's avatar
Stephen Krider committed
72

Simon Mo's avatar
Simon Mo committed
73
74

#################### TEST IMAGE ####################
Stephen Krider's avatar
Stephen Krider committed
75
76
77
78
79
# image to run unit testing suite
# Based on `dev`, so the development requirements installed there are available.
FROM dev AS test

# copy pytorch extensions separately to avoid having to rebuild
# when python code changes
Simon Mo's avatar
Simon Mo committed
80
81
82
83
84
85
86
87
WORKDIR /vllm-workspace
# COPY (not ADD) is the right instruction for a plain local directory: it copies the
# build context recursively and keeps its directory structure, without ADD's extra
# archive/URL handling.
COPY . /vllm-workspace/
# Overlay the extension binaries compiled in the `build` stage.
COPY --from=build /workspace/vllm/*.so /vllm-workspace/vllm/
# ignore build dependencies installation because we are using pre-compiled extensions
RUN rm pyproject.toml
RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . --verbose
#################### TEST IMAGE ####################
Stephen Krider's avatar
Stephen Krider committed
88
89


Simon Mo's avatar
Simon Mo committed
90
#################### RUNTIME BASE IMAGE ####################
Stephen Krider's avatar
Stephen Krider committed
91
# use CUDA base as CUDA runtime dependencies are already installed via pip
92
# Starts from the CUDA "base" image rather than from `dev`, so nothing installed in the
# earlier stages is inherited here.
FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS vllm-base
Stephen Krider's avatar
Stephen Krider committed
93
94
95
96
97
98
99
100
101

# Only pip is installed from apt here; the Python packages come from requirements.txt below.
# The apt package indexes are deleted in the same layer so they do not bloat the runtime image.
RUN apt-get update -y \
    && apt-get install -y python3-pip \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /workspace
# Install the runtime dependencies; pip's cache lives in a BuildKit cache mount and is
# therefore not stored in the image.
COPY requirements.txt requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r requirements.txt
Simon Mo's avatar
Simon Mo committed
102
#################### RUNTIME BASE IMAGE ####################
Stephen Krider's avatar
Stephen Krider committed
103
104


Simon Mo's avatar
Simon Mo committed
105
#################### OPENAI API SERVER ####################
Stephen Krider's avatar
Stephen Krider committed
106
107
# openai api server alternative
# Extends `vllm-base`, which already has the packages from requirements.txt installed.
FROM vllm-base AS vllm-openai
108
# install additional dependencies for openai api server
Stephen Krider's avatar
Stephen Krider committed
109
# pip's cache directory is a BuildKit cache mount, so it is not stored in the image.
RUN --mount=type=cache,target=/root/.cache/pip pip install accelerate
Stephen Krider's avatar
Stephen Krider committed
111

112
# Bring in the compiled extension libraries produced by the `build` stage.
COPY --from=build /workspace/vllm/*.so /workspace/vllm/
113
# Copy the vllm Python package from the build context into the current working directory.
COPY vllm vllm
Stephen Krider's avatar
Stephen Krider committed
114
115

# Exec-form entrypoint: runs the OpenAI API server module directly (no shell wrapper);
# arguments given to `docker run` are appended to this command.
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
Simon Mo's avatar
Simon Mo committed
116
#################### OPENAI API SERVER ####################