Commit 6e0ccccb authored by Ryan McCormick, committed by GitHub

build: Multi-stage VLLM Build (#225)

parent 15de1807
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
ARG BASE_IMAGE="nvcr.io/nvidia/tritonserver"
ARG BASE_IMAGE_TAG="25.01-py3"
ARG BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
ARG BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04"
FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS triton-distributed
FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS dev
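# This Dockerfile is multi-stage: the "dev" stage below builds the Rust
# binaries and Python environment, and the lean "runtime" stage further down
# copies only the artifacts needed for deployment.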
USER root
# Rust build/dev dependencies
RUN apt-get update && apt-get install -y gdb protobuf-compiler
RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
ENV PATH="/root/.cargo/bin:${PATH}"
# Install utilities
RUN apt update -y && apt install -y git wget curl nvtop tmux vim
# nats
RUN wget https://github.com/nats-io/nats-server/releases/download/v2.10.24/nats-server-v2.10.24-amd64.deb && dpkg -i nats-server-v2.10.24-amd64.deb
# etcd
ENV ETCD_VERSION="v3.5.18"
RUN wget https://github.com/etcd-io/etcd/releases/download/$ETCD_VERSION/etcd-$ETCD_VERSION-linux-amd64.tar.gz -O /tmp/etcd.tar.gz && \
mkdir -p /usr/local/bin/etcd && \
tar -xvf /tmp/etcd.tar.gz -C /usr/local/bin/etcd --strip-components=1
ENV PATH=/usr/local/bin/etcd/:$PATH
### VIRTUAL ENVIRONMENT SETUP ###
# Install uv and create virtualenv
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
@@ -22,66 +31,45 @@ RUN mkdir /opt/triton && \
ENV VIRTUAL_ENV=/opt/triton/venv
ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"
# Install OpenAI-compatible frontend
# TODO: can this be removed now that we have the Rust HTTP server?
ARG OPENAI_SERVER_TAG="r25.01"
RUN mkdir -p /opt/tritonserver/python && \
cd /opt/tritonserver/python && \
rm -rf openai && \
git clone -b ${OPENAI_SERVER_TAG} --single-branch https://github.com/triton-inference-server/server.git && \
cd server && \
git checkout ${SERVER_OPENAI_COMMIT} && \
cd .. && \
mv server/python/openai openai && \
chown -R root:root openai && \
chmod 755 openai && \
chmod -R go-w openai && \
rm -rf server && \
uv pip install -r openai/requirements.txt
# Common dependencies
RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requirements.txt \
uv pip install --requirement /tmp/requirements.txt
RUN --mount=type=bind,source=./container/deps/requirements.nats.txt,target=/tmp/requirements.txt \
uv pip install --requirement /tmp/requirements.txt
RUN --mount=type=bind,source=./container/deps/requirements.test.txt,target=/tmp/requirements.txt \
uv pip install --requirement /tmp/requirements.txt
# Finish pyright install
RUN pyright --help > /dev/null 2>&1
# In Process Python API Install
RUN find /opt/tritonserver/python -maxdepth 1 -type f -name \
"tritonserver-*.whl" | xargs -I {} uv pip install --force-reinstall --upgrade {}[all]
# Install patched vllm - keep this early in Dockerfile to avoid
# rebuilds from unrelated source code changes
ARG VLLM_REF="v0.7.2"
ARG VLLM_PATCH="vllm_${VLLM_REF}-triton-kv-disagg-patch.patch"
RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
bash /tmp/deps/vllm/install.sh --patch /tmp/deps/vllm/${VLLM_PATCH} --ref ${VLLM_REF} --install-cmd "uv pip install --editable" --use-precompiled --installation-dir /opt/vllm
# GENAI Perf Install
# Install genai-perf for benchmarking
ARG GENAI_PERF_TAG="r25.01"
RUN uv pip install "git+https://github.com/triton-inference-server/perf_analyzer.git@${GENAI_PERF_TAG}#subdirectory=genai-perf"
# Install NATS
RUN wget https://github.com/nats-io/nats-server/releases/download/v2.10.24/nats-server-v2.10.24-amd64.deb && dpkg -i nats-server-v2.10.24-amd64.deb
# Install test dependencies
RUN --mount=type=bind,source=./container/deps/requirements.test.txt,target=/tmp/requirements.txt \
uv pip install --requirement /tmp/requirements.txt
# Install etcd
ENV ETCD_VERSION="v3.5.18"
RUN wget https://github.com/etcd-io/etcd/releases/download/$ETCD_VERSION/etcd-$ETCD_VERSION-linux-amd64.tar.gz -O /tmp/etcd.tar.gz && \
mkdir -p /usr/local/bin/etcd && \
tar -xvf /tmp/etcd.tar.gz -C /usr/local/bin/etcd --strip-components=1
ENV PATH=/usr/local/bin/etcd/:$PATH
### MISC UTILITY SETUP ###
# Finish pyright install
RUN pyright --help > /dev/null 2>&1
# Enable Git operations in the /workspace directory
RUN printf "[safe]\n directory=/workspace\n" > /root/.gitconfig
RUN ln -sf /bin/bash /bin/sh
# Install NGINX and utilities
# TODO: can this be removed since we do not use nginx?
RUN apt-get install nginx nvtop tmux -y
RUN rm -rf /etc/nginx/sites-enabled/default
### BUILDS ###
# Rust build/dev dependencies
RUN apt update -y && \
apt install -y \
build-essential \
protobuf-compiler && \
curl https://sh.rustup.rs -sSf | bash -s -- -y
ENV PATH="/root/.cargo/bin:${PATH}"
# Working directory
WORKDIR /workspace
COPY runtime /workspace/runtime
COPY runtime/rust /workspace/runtime/rust
RUN cd runtime/rust && \
cargo build --release --locked && cargo doc --no-deps
@@ -93,8 +81,11 @@ RUN cd examples/rust && \
cp target/release/http /usr/local/bin/ && \
cp target/release/llmctl /usr/local/bin/
# TODO: Build tio
# COPY applications/...
# Generate C bindings for kv cache routing in vLLM
COPY llm /workspace/llm
COPY llm/rust /workspace/llm/rust
RUN cd llm/rust/ && \
cargo build --release --locked && cargo doc --no-deps
@@ -112,29 +103,59 @@ RUN cp -r llm/rust/libtriton-llm/include /opt/triton/llm_binding/.
# Tell vllm to use the Triton LLM C API for KV Cache Routing
ENV VLLM_KV_CAPI_PATH="/opt/triton/llm_binding/lib/libtriton_llm_capi.so"
# Install patched vllm
ARG VLLM_REF="v0.7.2"
ARG VLLM_PATCH="vllm_${VLLM_REF}-triton-kv-disagg-patch.patch"
RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
bash /tmp/deps/vllm/install.sh --patch /tmp/deps/vllm/${VLLM_PATCH} --ref ${VLLM_REF} --install-cmd "uv pip install --editable" --use-precompiled --installation-dir /opt/vllm
# FIXME: Copy more specific folders in for dev/debug after directory restructure
COPY . /workspace
# Install triton_distributed_rs wheel globally
# RUN pip install runtime/rust/python-wheel/dist/triton_distributed_rs*cp312*.whl
# FIXME: May want a modification with triton-distributed banner on entry
ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
COPY icp /workspace/icp
RUN /workspace/icp/protos/gen_python.sh
CMD []
# Install python packages
ARG PYTHON_PACKAGE_VERSION=0.0.1.dev+unknown
RUN SETUPTOOLS_SCM_PRETEND_VERSION_FOR_TRITON_DISTRIBUTED_ICP=${PYTHON_PACKAGE_VERSION} uv pip install -e /workspace/icp/python
RUN SETUPTOOLS_SCM_PRETEND_VERSION_FOR_TRITON_DISTRIBUTED_RUNTIME=${PYTHON_PACKAGE_VERSION} uv pip install -e /workspace/runtime/python
### Lean Runtime Image Stage ###
# Copy remaining files
COPY . /workspace
# FIXME: Separate build and runtime images
FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS runtime
USER root
# Environment setup
ENV PYTHONPATH="${PYTHONPATH}:/workspace/examples/python:/opt/tritonserver/python/openai/openai_frontend"
# Install tools for interactive convenience
RUN apt update -y && \
apt install -y curl tmux vim && \
echo "set -g mouse on" >> /root/.tmux.conf
# Set environment variables
ENV VIRTUAL_ENV=/opt/triton/venv
ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"
ENV RAPIDS_LIBUCX_PREFER_SYSTEM_LIBRARY=true
ENV VLLM_KV_CAPI_PATH="/opt/triton/llm_binding/lib/libtriton_llm_capi.so"
CMD []
# Copy binaries
COPY --from=dev /usr/local/bin/http /usr/local/bin/http
COPY --from=dev /usr/local/bin/llmctl /usr/local/bin/llmctl
COPY --from=dev /usr/local/bin/etcd/etcd /usr/local/bin/etcd
COPY --from=dev /usr/bin/nats-server /usr/local/bin/nats-server
COPY --from=dev /bin/uv /usr/local/bin/uv
COPY --from=dev /bin/uvx /usr/local/bin/uvx
# Copy venv with installed packages
RUN uv python install 3.12
COPY --from=dev /opt/vllm /opt/vllm
COPY --from=dev ${VIRTUAL_ENV} ${VIRTUAL_ENV}
# Copy minimal set of files for testing. May consider separate stage for testing
# if test dependencies start to negatively impact deployment environment/size.
COPY pyproject.toml /workspace/pyproject.toml
COPY container/deps/vllm /workspace/container/deps/vllm
COPY python-wheel/python /workspace/python-wheel/python
# Add library for KV routing
COPY --from=dev ${VLLM_KV_CAPI_PATH} ${VLLM_KV_CAPI_PATH}
# Copy minimal set of files for deployment/examples
# FIXME: Use a more consolidated path after directory restructure
COPY examples/python_rs/llm/vllm /workspace/examples/python_rs/llm/vllm
WORKDIR /workspace
# FIXME: May want a modification with triton-distributed banner on entry
ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
CMD []
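# A minimal sketch of building just this lean stage directly with docker (the
# Dockerfile path and tag below are assumptions, not part of this commit):
#   docker build -f container/Dockerfile.vllm --target runtime \
#       --tag triton-distributed:runtime .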
@@ -67,11 +67,8 @@ TENSORRTLLM_BACKEND_REPO_TAG=triton-llm/v0.17.0
# trt-llm backend repo branch.
TENSORRTLLM_BACKEND_REBUILD=1
# vllm installation is done later in the Dockerfile so it will overwrite the
# vllm version installed in the base image.
VLLM_BASE_VERSION=25.01
VLLM_BASE_IMAGE=nvcr.io/nvidia/tritonserver
VLLM_BASE_IMAGE_TAG=${VLLM_BASE_VERSION}-py3
VLLM_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
VLLM_BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04"
get_options() {
while :; do
@@ -128,6 +125,14 @@ get_options() {
missing_requirement $1
fi
;;
--target)
if [ "$2" ]; then
TARGET=$2
shift
else
missing_requirement $1
fi
;;
--build-arg)
if [ "$2" ]; then
BUILD_ARGS+="--build-arg $2 "
@@ -212,13 +217,18 @@ get_options() {
if [ -z "$TAG" ]; then
TAG="--tag triton-distributed:${VERSION}-${FRAMEWORK,,}"
if [ ! -z "${TARGET}" ]; then
TAG="${TAG}-${TARGET}"
fi
fi
if [ ! -z "$PLATFORM" ]; then
PLATFORM="--platform ${PLATFORM}"
fi
if [ ! -z "$TARGET" ]; then
TARGET_STR="--target ${TARGET}"
fi
}
@@ -292,15 +302,17 @@ if [ ! -z ${HF_TOKEN} ]; then
fi
LATEST_TAG="--tag triton-distributed:latest-${FRAMEWORK,,}"
if [ ! -z "${TARGET}" ]; then
LATEST_TAG="${LATEST_TAG}-${TARGET}"
fi
show_image_options
if [ -z "$RUN_PREFIX" ]; then
set -x
fi
$RUN_PREFIX docker build -f $DOCKERFILE $PLATFORM $BUILD_ARGS $CACHE_FROM $TAG $LATEST_TAG $BUILD_CONTEXT $NO_CACHE
$RUN_PREFIX docker build -f $DOCKERFILE $TARGET_STR $PLATFORM $BUILD_ARGS $CACHE_FROM $TAG $LATEST_TAG $BUILD_CONTEXT $NO_CACHE
{ set +x; } 2>/dev/null
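# Hypothetical invocation of the new --target flag (the --framework flag and
# script path are assumed from context, not shown in this diff):
#   ./container/build.sh --framework vllm --target runtime
# Per the tagging logic above, this builds the "runtime" stage and tags the
# image triton-distributed:<version>-vllm-runtime.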
@@ -13,8 +13,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
pyright
pytest
pytest-asyncio
pytest-benchmark
pytest-cov
pytest-md-report
pytest-mypy
pytest-timeout
@@ -26,8 +26,6 @@ pre-commit
protobuf==5.27.3
pydantic==2.7.1
pyright
pytest-md-report
pytest-mypy
sentencepiece
transformers
tritonclient==2.53.0
@@ -14,7 +14,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
TAG=
RUN_PREFIX=
# Frameworks
@@ -62,6 +61,14 @@ get_options() {
missing_requirement $1
fi
;;
--target)
if [ "$2" ]; then
TARGET=$2
shift
else
missing_requirement $1
fi
;;
--name)
if [ "$2" ]; then
NAME=$2
@@ -87,9 +94,9 @@ get_options() {
missing_requirement $1
fi
;;
--command)
--entrypoint)
if [ "$2" ]; then
COMMAND=$2
ENTRYPOINT=$2
shift
else
missing_requirement $1
@@ -172,6 +179,9 @@ get_options() {
if [ -z "$IMAGE" ]; then
IMAGE="triton-distributed:latest-${FRAMEWORK,,}"
if [ ! -z "${TARGET}" ]; then
IMAGE="${IMAGE}-${TARGET}"
fi
fi
if [[ ${GPUS^^} == "NONE" ]]; then
@@ -186,6 +196,12 @@ get_options() {
NAME_STRING="--name ${NAME}"
fi
if [ -z "${ENTRYPOINT}" ]; then
ENTRYPOINT_STRING=""
else
ENTRYPOINT_STRING="--entrypoint ${ENTRYPOINT}"
fi
if [ ! -z "$MOUNT_WORKSPACE" ]; then
VOLUME_MOUNTS+=" -v ${SOURCE_DIR}/..:/workspace "
VOLUME_MOUNTS+=" -v /tmp:/tmp "
@@ -273,6 +289,6 @@ if [ -z "$RUN_PREFIX" ]; then
set -x
fi
${RUN_PREFIX} docker run ${GPU_STRING} ${INTERACTIVE} ${RM_STRING} --network host --shm-size=10G --ulimit memlock=-1 --ulimit stack=67108864 ${ENVIRONMENT_VARIABLES} ${VOLUME_MOUNTS} -w /workspace --cap-add CAP_SYS_PTRACE --ipc host ${PRIVILEGED_STRING} ${NAME_STRING} ${IMAGE} "${REMAINING_ARGS[@]}"
${RUN_PREFIX} docker run ${GPU_STRING} ${INTERACTIVE} ${RM_STRING} --network host --shm-size=10G --ulimit memlock=-1 --ulimit stack=67108864 ${ENVIRONMENT_VARIABLES} ${VOLUME_MOUNTS} -w /workspace --cap-add CAP_SYS_PTRACE --ipc host ${PRIVILEGED_STRING} ${NAME_STRING} ${ENTRYPOINT_STRING} ${IMAGE} "${REMAINING_ARGS[@]}"
{ set +x; } 2>/dev/null
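# Hypothetical invocation of the new flags (script path assumed from context):
#   ./container/run.sh --framework vllm --target runtime --entrypoint /bin/bash
# --target selects the triton-distributed:latest-<framework>-runtime image and
# --entrypoint overrides the image's default entrypoint, e.g. for debugging.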
#!/bin/bash -e
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# LIMITATIONS:
# - Must have at least 2 GPUs since CUDA_VISIBLE_DEVICES is hard-coded to 0 and 1
# - Must use a single node
if [ $# -gt 2 ]; then
echo "Usage: $0 [model_name] [endpoint_name]"
echo "Optional: model_name (default: deepseek-ai/DeepSeek-R1-Distill-Llama-8B)"
echo "Optional: endpoint_name (default: triton-init.vllm.generate)"
exit 1
fi
MODEL_NAME=${1:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"}
ENDPOINT_NAME=${2:-"triton-init.vllm.generate"}
SESSION_NAME="vllm_disagg"
WORKDIR="$(dirname $0)/.."
INIT_CMD="cd $WORKDIR"
########################################################
# TMUX SESSION SETUP
########################################################
# Start new session
tmux new-session -d -s "$SESSION_NAME"
# Split into 4 equal panes
tmux split-window -h
tmux split-window -v
tmux select-pane -t 0
tmux split-window -v
########################################################
# HTTP Server
########################################################
HTTP_HOST="localhost"
HTTP_PORT=8080
HTTP_CMD="TRD_LOG=DEBUG http --host ${HTTP_HOST} --port ${HTTP_PORT}"
tmux select-pane -t 0
tmux send-keys "$INIT_CMD && $HTTP_CMD" C-m
########################################################
# LLMCTL
########################################################
LLMCTL_CMD="sleep 5 && llmctl http remove chat-model $MODEL_NAME && \
llmctl http add chat-model $MODEL_NAME $ENDPOINT_NAME && \
llmctl http list chat-model"
tmux select-pane -t 1
tmux send-keys "$INIT_CMD && $LLMCTL_CMD" C-m
CURL_CMD="curl ${HTTP_HOST}:${HTTP_PORT}/v1/chat/completions \
-H \"Content-Type: application/json\" \
-d '{
\"model\": \"$MODEL_NAME\",
\"messages\": [
{\"role\": \"user\", \"content\": \"What is the capital of France?\"}
],
\"stream\": true,
\"max_tokens\": 10
}'"
# Prepare a curl command for a quick test, but don't execute it since the server
# needs to spin up first.
tmux send-keys "$CURL_CMD"
########################################################
# Processor
########################################################
# skip
########################################################
# Router
########################################################
# skip
########################################################
# Prefill
########################################################
PREFILL_CMD="VLLM_WORKER_MULTIPROC_METHOD=spawn CUDA_VISIBLE_DEVICES=0 \
python3 -m disaggregated.prefill_worker \
--model $MODEL_NAME \
--gpu-memory-utilization 0.8 \
--enforce-eager \
--max-model-len 1000 \
--tensor-parallel-size 1 \
--kv-transfer-config \
'{\"kv_connector\":\"TritonNcclConnector\",\"kv_role\":\"kv_producer\",\"kv_rank\":0,\"kv_parallel_size\":2}'"
tmux select-pane -t 2
tmux send-keys "$INIT_CMD && $PREFILL_CMD" C-m
########################################################
# Decode
########################################################
DECODE_CMD="VLLM_WORKER_MULTIPROC_METHOD=spawn CUDA_VISIBLE_DEVICES=1 \
python3 -m disaggregated.decode_worker \
--model $MODEL_NAME \
--gpu-memory-utilization 0.8 \
--enforce-eager \
--max-model-len 1000 \
--tensor-parallel-size 1 \
--kv-transfer-config \
'{\"kv_connector\":\"TritonNcclConnector\",\"kv_role\":\"kv_consumer\",\"kv_rank\":1,\"kv_parallel_size\":2}'"
tmux select-pane -t 3
tmux send-keys "$INIT_CMD && $DECODE_CMD" C-m
tmux attach-session -t "$SESSION_NAME"
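# To tear the demo down after detaching (standard tmux usage):
#   tmux kill-session -t "$SESSION_NAME"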
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
@@ -13,8 +14,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#!/bin/bash
# LIMITATIONS:
# - Must use a single GPU for workers as CUDA_VISIBLE_DEVICES is set to a fixed value
# - Must use a single node