"examples/sglang/graphs/agg.py" did not exist on "ac13ed0676e308931b4d0c0cb01617d33ed571ee"
Dockerfile 7.87 KB
Newer Older
1
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

ARG BASE_IMAGE="nvcr.io/nvidia/tritonserver"
17
ARG BASE_IMAGE_TAG="25.01-py3"
18
19
20
21
22
23
24

FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS triton-distributed

# TODO: non root user by default

USER root

# TODO: separate dev from runtime dependencies

# Rust build/dev dependencies. `update` and `install` share one layer so the
# install never runs against a stale package index; packages are listed one
# per line, sorted, for readable diffs.
RUN apt-get update && \
    apt-get install --no-install-recommends --yes \
        cmake \
        gdb \
        libssl-dev \
        pkg-config \
        protobuf-compiler
# Install the Rust toolchain via rustup and put cargo on PATH.
# NOTE(review): this pipes a remote script into bash with no checksum and no
# pinned rustup/toolchain version — consider pinning for reproducible builds.
RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
ENV PATH="/root/.cargo/bin:${PATH}"

# Install OpenAI-compatible frontend and its dependencies from triton server
# repository. These are used to have a consistent interface, schema, and FastAPI
# app between Triton Core and Triton Distributed implementations.
ARG OPENAI_SERVER_TAG="r25.01"
# Optional commit to pin within OPENAI_SERVER_TAG; empty means "tip of the
# branch". (Previously this was referenced but never declared, so the
# `git checkout` expanded to a bare no-op command.)
ARG SERVER_OPENAI_COMMIT=""
RUN mkdir -p /opt/tritonserver/python && \
    cd /opt/tritonserver/python && \
    rm -rf openai && \
    git clone -b ${OPENAI_SERVER_TAG} --single-branch https://github.com/triton-inference-server/server.git && \
    cd server && \
    if [ -n "${SERVER_OPENAI_COMMIT}" ]; then git checkout ${SERVER_OPENAI_COMMIT}; fi && \
    cd .. && \
    mv server/python/openai openai && \
    chown -R root:root openai && \
    chmod 755 openai && \
    chmod -R go-w openai && \
    rm -rf server && \
    python3 -m pip install --no-cache-dir -r openai/requirements.txt

# Common dependencies. Requirement files are bind-mounted so they never become
# image layers; --no-cache-dir keeps pip's download cache out of the layers.
RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requirements.txt \
    pip install --no-cache-dir --timeout=2000 --requirement /tmp/requirements.txt
RUN --mount=type=bind,source=./container/deps/requirements.nats.txt,target=/tmp/requirements.txt \
    pip install --no-cache-dir --timeout=2000 --requirement /tmp/requirements.txt
RUN --mount=type=bind,source=./container/deps/requirements.test.txt,target=/tmp/requirements.txt \
    pip install --no-cache-dir --timeout=2000 --requirement /tmp/requirements.txt

# Finish pyright install: the first invocation downloads its node runtime, so
# trigger it once at build time instead of at first use.
RUN pyright --help > /dev/null 2>&1

# In Process Python API Install (wheel shipped with the Triton base image).
RUN find /opt/tritonserver/python -maxdepth 1 -type f -name \
    "tritonserver-*.whl" | xargs -I {} pip3 install --force-reinstall --upgrade {}[all]

# GENAI Perf Install, pinned to a specific commit.
# TODO: Move to tag when fix for genai-perf is released
ARG GENAI_PERF_TAG="25d0188713adc47868d6b3f22426375237a90529"
RUN pip install --no-cache-dir "git+https://github.com/triton-inference-server/perf_analyzer.git@${GENAI_PERF_TAG}#subdirectory=genai-perf"

# Backend & Framework Specific Installation
ARG FRAMEWORK="STANDARD"
ARG TENSORRTLLM_BACKEND_REPO_TAG=
ARG TENSORRTLLM_BACKEND_REBUILD=
ARG TENSORRTLLM_SKIP_CLONE=
# Exported so runtime scripts can branch on the selected backend.
ENV FRAMEWORK=${FRAMEWORK}
# POSIX `[` is used (not bash `[[`) because /bin/sh is not relinked to bash
# until later in this Dockerfile. TENSORRTLLM_SKIP_CLONE defaults to 0 so an
# empty build-arg cannot produce the malformed test `[ -ne 1 ]`.
RUN --mount=type=bind,source=./container/deps/requirements.tensorrtllm.txt,target=/tmp/requirements.txt \
    --mount=type=bind,source=./container/deps/clone_tensorrtllm.sh,target=/tmp/clone_tensorrtllm.sh \
    if [ "$FRAMEWORK" = "TENSORRTLLM" ]; then \
        pip install --no-cache-dir --timeout=2000 -r /tmp/requirements.txt; \
        if [ "${TENSORRTLLM_SKIP_CLONE:-0}" -ne 1 ]; then \
            /tmp/clone_tensorrtllm.sh \
                --tensorrtllm-backend-repo-tag ${TENSORRTLLM_BACKEND_REPO_TAG} \
                --tensorrtllm-backend-rebuild ${TENSORRTLLM_BACKEND_REBUILD} \
                --triton-llm-path /opt/triton/llm_binding; \
        fi; \
    fi
# Python dependencies for the default (STANDARD) framework. POSIX `[` instead
# of bash-only `[[` because /bin/sh is not relinked to bash until later.
RUN --mount=type=bind,source=./container/deps/requirements.standard.txt,target=/tmp/requirements.txt \
    if [ "$FRAMEWORK" = "STANDARD" ]; then pip install --no-cache-dir --timeout=2000 -r /tmp/requirements.txt; fi

# Backend & Framework Specific LD_LIBRARY_PATH.
# ${VAR:+value} expands to `value` only when VAR is set and non-empty, so all
# of these stay empty for non-TRT-LLM builds.
ARG TENSORRTLLM_FRAMEWORK
ENV FRAMEWORK_LD_LIBRARY_PATH=${TENSORRTLLM_FRAMEWORK:+/opt/tritonserver/backends/tensorrtllm/}
ENV LD_LIBRARY_PATH=${FRAMEWORK_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}
ENV TENSORRTLLM_BACKEND_REPO_TAG=${TENSORRTLLM_BACKEND_REPO_TAG}
ENV TRTLLM_USE_MPI_KVCACHE=${TENSORRTLLM_FRAMEWORK:+"1"}

# vLLM runtime configuration. Each ${VLLM_FRAMEWORK:+value} expansion only
# takes effect when the image is built with VLLM_FRAMEWORK set, so non-vLLM
# builds keep these variables empty.
# TODO set VLLM Version
# ENV VLLM_VERSION
ARG VLLM_FRAMEWORK
# DEFAULT VLLM VARIABLES
# ENV VLLM_ATTENTION_BACKEND=${VLLM_FRAMEWORK:+FLASHINFER}
ENV VLLM_WORKER_MULTIPROC_METHOD=${VLLM_FRAMEWORK:+spawn}
ENV VLLM_TORCH_HOST=${VLLM_FRAMEWORK:+localhost}
ENV VLLM_TORCH_PORT=${VLLM_FRAMEWORK:+36183}
ENV VLLM_DATA_PLANE_BACKEND=${VLLM_FRAMEWORK:+nccl}
ENV VLLM_BASELINE_WORKERS=${VLLM_FRAMEWORK:+0}
ENV VLLM_CONTEXT_WORKERS=${VLLM_FRAMEWORK:+1}
ENV VLLM_GENERATE_WORKERS=${VLLM_FRAMEWORK:+1}
ENV VLLM_BASELINE_TP_SIZE=${VLLM_FRAMEWORK:+1}
ENV VLLM_CONTEXT_TP_SIZE=${VLLM_FRAMEWORK:+1}
ENV VLLM_GENERATE_TP_SIZE=${VLLM_FRAMEWORK:+1}
# Path to the Triton LLM C-API library built later from lib/bindings/c.
ENV VLLM_KV_CAPI_PATH="/opt/triton/llm_binding/lib/libtriton_llm_capi.so"
ENV PYTHONUNBUFFERED=1

# Install NATS - pointing toward NATS github instead of binaries.nats.dev due to server instability.
# The downloaded .deb is removed in the same layer so it does not persist in the image.
RUN wget https://github.com/nats-io/nats-server/releases/download/v2.10.24/nats-server-v2.10.24-amd64.deb && \
    dpkg -i nats-server-v2.10.24-amd64.deb && \
    rm nats-server-v2.10.24-amd64.deb

# etcd: download the pinned release, unpack it onto PATH, and delete the
# tarball in the same layer so it does not persist in the image.
ENV ETCD_VERSION="v3.5.18"
RUN wget https://github.com/etcd-io/etcd/releases/download/$ETCD_VERSION/etcd-$ETCD_VERSION-linux-amd64.tar.gz -O /tmp/etcd.tar.gz && \
    mkdir -p /usr/local/bin/etcd && \
    tar -xvf /tmp/etcd.tar.gz -C /usr/local/bin/etcd --strip-components=1 && \
    rm /tmp/etcd.tar.gz
ENV PATH=/usr/local/bin/etcd/:$PATH

# Enable Git operations in the /workspace directory.
RUN printf "[safe]\n      directory=/workspace\n" > /root/.gitconfig

# emacs docker-tramp requires /bin/sh to be linked to bash to operate correctly
RUN ln -sf /bin/bash /bin/sh

# Install NGINX plus demo utilities (nvtop, tmux) in a single layer.
# `apt-get update` runs in the same layer so the install never uses a stale
# package index, and the lists are removed afterwards to keep the image small.
RUN apt-get update && \
    apt-get install --no-install-recommends --yes \
        nginx \
        nvtop \
        tmux && \
    rm -rf /var/lib/apt/lists/* && \
    rm -rf /etc/nginx/sites-enabled/default

# Working directory
WORKDIR /workspace

# Copy Python wheel configuration files (one COPY: these always change
# together, so a single layer caches just as well and is shorter).
COPY pyproject.toml README.md LICENSE /workspace/

# Build Rust runtime. --locked enforces Cargo.lock for reproducible builds.
COPY lib/runtime /workspace/lib/runtime
RUN cd lib/runtime && \
    cargo build --release --locked && cargo doc --no-deps

# Build OpenAI HTTP Service binaries and install them onto PATH.
COPY lib/llm /workspace/lib/llm
COPY examples/rust /workspace/examples/rust
RUN cd examples/rust && \
    cargo build --release && \
    cp target/release/http /usr/local/bin/ && \
    cp target/release/llmctl /usr/local/bin/

# Generate C bindings. Note that this is required for TRTLLM backend re-build.
COPY lib/bindings /workspace/lib/bindings
RUN cd lib/bindings/c/ && \
    cargo build --release --locked && cargo doc --no-deps

# Install uv, create virtualenv for general use, and build triton_distributed wheel.
# `source` is valid here because /bin/sh was relinked to bash earlier in this file.
# NOTE(review): uv:latest is unpinned; consider pinning a uv version/digest for
# reproducible builds.
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
RUN mkdir /opt/triton && \
    uv venv /opt/triton/venv --python 3.12 && \
    source /opt/triton/venv/bin/activate && \
    uv build --wheel --out-dir /workspace/dist && \
    uv pip install /workspace/dist/triton_distributed*cp312*.whl

# Package the bindings: stage the wheel, the C shared library, and the headers
# under /opt/triton/bindings for consumption by other stages/images.
RUN mkdir -p /opt/triton/bindings/wheels && \
    mkdir /opt/triton/bindings/lib && \
    cp dist/triton_distributed*cp312*.whl /opt/triton/bindings/wheels/. && \
    cp lib/bindings/c/target/release/libtriton_distributed_llm_capi.so /opt/triton/bindings/lib/. && \
    cp -r lib/bindings/c/include /opt/triton/bindings/.

# Install triton_distributed_runtime and triton_distributed_llm wheels globally
# in container for tests that currently run without virtual environment activated.
# TODO: In future, we may use a virtualenv for everything and remove this.
RUN pip install --no-cache-dir /opt/triton/bindings/wheels/triton_distributed*cp312*.whl

# Copy everything in after install steps to avoid re-running build/install
# commands on unrelated changes in other dirs.
# NOTE(review): ensure a .dockerignore excludes .git/build artifacts so this
# layer stays small and cache-friendly.
COPY . /workspace

# Enable system UCX (prefer the system-installed libucx over bundled copies).
ENV RAPIDS_LIBUCX_PREFER_SYSTEM_LIBRARY=true

# Command and Entrypoint. Exec (JSON) form keeps the entrypoint as PID 1 so it
# receives signals from `docker stop`; CMD is empty and arguments are supplied
# at `docker run` time.
# NOTE(review): the image still runs as root — see the TODO near the top.
CMD []
ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]