Commit 6e0ccccb authored by Ryan McCormick, committed by GitHub

build: Multi-stage VLLM Build (#225)

parent 15de1807
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
ARG BASE_IMAGE="nvcr.io/nvidia/tritonserver"
ARG BASE_IMAGE_TAG="25.01-py3"
ARG BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
ARG BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04"
FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS triton-distributed
FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS dev
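# This Dockerfile is multi-stage: the "dev" stage below builds the Rust
# binaries and Python environment, and the lean "runtime" stage further down
# copies only the artifacts needed for deployment.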
USER root
# Rust build/dev dependencies
RUN apt-get update && apt-get install -y gdb protobuf-compiler
RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
ENV PATH="/root/.cargo/bin:${PATH}"
# Install utilities
RUN apt update -y && apt install -y git wget curl nvtop tmux vim
# nats
RUN wget https://github.com/nats-io/nats-server/releases/download/v2.10.24/nats-server-v2.10.24-amd64.deb && dpkg -i nats-server-v2.10.24-amd64.deb
# etcd
ENV ETCD_VERSION="v3.5.18"
RUN wget https://github.com/etcd-io/etcd/releases/download/$ETCD_VERSION/etcd-$ETCD_VERSION-linux-amd64.tar.gz -O /tmp/etcd.tar.gz && \
mkdir -p /usr/local/bin/etcd && \
tar -xvf /tmp/etcd.tar.gz -C /usr/local/bin/etcd --strip-components=1
ENV PATH=/usr/local/bin/etcd/:$PATH
### VIRTUAL ENVIRONMENT SETUP ###
# Install uv and create virtualenv
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
@@ -22,66 +31,45 @@ RUN mkdir /opt/triton && \
ENV VIRTUAL_ENV=/opt/triton/venv
ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"
# Install OpenAI-compatible frontend
# TODO: can this be removed now that we have the Rust HTTP server?
ARG OPENAI_SERVER_TAG="r25.01"
RUN mkdir -p /opt/tritonserver/python && \
cd /opt/tritonserver/python && \
rm -rf openai && \
git clone -b ${OPENAI_SERVER_TAG} --single-branch https://github.com/triton-inference-server/server.git && \
cd server && \
git checkout ${SERVER_OPENAI_COMMIT} && \
cd .. && \
mv server/python/openai openai && \
chown -R root:root openai && \
chmod 755 openai && \
chmod -R go-w openai && \
rm -rf server && \
uv pip install -r openai/requirements.txt
# Common dependencies
RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requirements.txt \
uv pip install --requirement /tmp/requirements.txt
RUN --mount=type=bind,source=./container/deps/requirements.nats.txt,target=/tmp/requirements.txt \
uv pip install --requirement /tmp/requirements.txt
RUN --mount=type=bind,source=./container/deps/requirements.test.txt,target=/tmp/requirements.txt \
uv pip install --requirement /tmp/requirements.txt
# Finish pyright install
RUN pyright --help > /dev/null 2>&1
# In Process Python API Install
RUN find /opt/tritonserver/python -maxdepth 1 -type f -name \
"tritonserver-*.whl" | xargs -I {} uv pip install --force-reinstall --upgrade {}[all]
# Install patched vllm - keep this early in Dockerfile to avoid
# rebuilds from unrelated source code changes
ARG VLLM_REF="v0.7.2"
ARG VLLM_PATCH="vllm_${VLLM_REF}-triton-kv-disagg-patch.patch"
RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
bash /tmp/deps/vllm/install.sh --patch /tmp/deps/vllm/${VLLM_PATCH} --ref ${VLLM_REF} --install-cmd "uv pip install --editable" --use-precompiled --installation-dir /opt/vllm
# GENAI Perf Install
# Install genai-perf for benchmarking
ARG GENAI_PERF_TAG="r25.01"
RUN uv pip install "git+https://github.com/triton-inference-server/perf_analyzer.git@${GENAI_PERF_TAG}#subdirectory=genai-perf"
# Install NATS
RUN wget https://github.com/nats-io/nats-server/releases/download/v2.10.24/nats-server-v2.10.24-amd64.deb && dpkg -i nats-server-v2.10.24-amd64.deb
# Install test dependencies
RUN --mount=type=bind,source=./container/deps/requirements.test.txt,target=/tmp/requirements.txt \
uv pip install --requirement /tmp/requirements.txt
# Install etcd
ENV ETCD_VERSION="v3.5.18"
RUN wget https://github.com/etcd-io/etcd/releases/download/$ETCD_VERSION/etcd-$ETCD_VERSION-linux-amd64.tar.gz -O /tmp/etcd.tar.gz && \
mkdir -p /usr/local/bin/etcd && \
tar -xvf /tmp/etcd.tar.gz -C /usr/local/bin/etcd --strip-components=1
ENV PATH=/usr/local/bin/etcd/:$PATH
### MISC UTILITY SETUP ###
# Finish pyright install
RUN pyright --help > /dev/null 2>&1
# Enable Git operations in the /workspace directory
RUN printf "[safe]\n directory=/workspace\n" > /root/.gitconfig
RUN ln -sf /bin/bash /bin/sh
# Install NGINX and utilities
# TODO: can this be removed since we do not use nginx?
RUN apt-get install nginx nvtop tmux -y
RUN rm -rf /etc/nginx/sites-enabled/default
### BUILDS ###
# Rust build/dev dependencies
RUN apt update -y && \
apt install -y \
build-essential \
protobuf-compiler && \
curl https://sh.rustup.rs -sSf | bash -s -- -y
ENV PATH="/root/.cargo/bin:${PATH}"
# Working directory
WORKDIR /workspace
COPY runtime /workspace/runtime
COPY runtime/rust /workspace/runtime/rust
RUN cd runtime/rust && \
cargo build --release --locked && cargo doc --no-deps
@@ -93,8 +81,11 @@ RUN cd examples/rust && \
cp target/release/http /usr/local/bin/ && \
cp target/release/llmctl /usr/local/bin/
# TODO: Build tio
# COPY applications/...
# Generate C bindings for kv cache routing in vLLM
COPY llm /workspace/llm
COPY llm/rust /workspace/llm/rust
RUN cd llm/rust/ && \
cargo build --release --locked && cargo doc --no-deps
@@ -112,29 +103,59 @@ RUN cp -r llm/rust/libtriton-llm/include /opt/triton/llm_binding/.
# Tell vllm to use the Triton LLM C API for KV Cache Routing
ENV VLLM_KV_CAPI_PATH="/opt/triton/llm_binding/lib/libtriton_llm_capi.so"
# Install patched vllm
ARG VLLM_REF="v0.7.2"
ARG VLLM_PATCH="vllm_${VLLM_REF}-triton-kv-disagg-patch.patch"
RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
bash /tmp/deps/vllm/install.sh --patch /tmp/deps/vllm/${VLLM_PATCH} --ref ${VLLM_REF} --install-cmd "uv pip install --editable" --use-precompiled --installation-dir /opt/vllm
# FIXME: Copy more specific folders in for dev/debug after directory restructure
COPY . /workspace
# Install triton_distributed_rs wheel globally
# RUN pip install runtime/rust/python-wheel/dist/triton_distributed_rs*cp312*.whl
# FIXME: May want a modification with triton-distributed banner on entry
ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
COPY icp /workspace/icp
RUN /workspace/icp/protos/gen_python.sh
CMD []
# Install python packages
ARG PYTHON_PACKAGE_VERSION=0.0.1.dev+unknown
RUN SETUPTOOLS_SCM_PRETEND_VERSION_FOR_TRITON_DISTRIBUTED_ICP=${PYTHON_PACKAGE_VERSION} uv pip install -e /workspace/icp/python
RUN SETUPTOOLS_SCM_PRETEND_VERSION_FOR_TRITON_DISTRIBUTED_RUNTIME=${PYTHON_PACKAGE_VERSION} uv pip install -e /workspace/runtime/python
### Lean Runtime Image Stage ###
# Copy remaining files
COPY . /workspace
# FIXME: Separate build and runtime images
FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS runtime
USER root
# Environment setup
ENV PYTHONPATH="${PYTHONPATH}:/workspace/examples/python:/opt/tritonserver/python/openai/openai_frontend"
# Install tools for interactive convenience
RUN apt update -y && \
apt install -y curl tmux vim && \
echo "set -g mouse on" >> /root/.tmux.conf
# Set environment variables
ENV VIRTUAL_ENV=/opt/triton/venv
ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"
ENV RAPIDS_LIBUCX_PREFER_SYSTEM_LIBRARY=true
ENV VLLM_KV_CAPI_PATH="/opt/triton/llm_binding/lib/libtriton_llm_capi.so"
CMD []
# Copy binaries
COPY --from=dev /usr/local/bin/http /usr/local/bin/http
COPY --from=dev /usr/local/bin/llmctl /usr/local/bin/llmctl
COPY --from=dev /usr/local/bin/etcd/etcd /usr/local/bin/etcd
COPY --from=dev /usr/bin/nats-server /usr/local/bin/nats-server
COPY --from=dev /bin/uv /usr/local/bin/uv
COPY --from=dev /bin/uvx /usr/local/bin/uvx
# Copy venv with installed packages
RUN uv python install 3.12
COPY --from=dev /opt/vllm /opt/vllm
COPY --from=dev ${VIRTUAL_ENV} ${VIRTUAL_ENV}
# Copy minimal set of files for testing. May consider separate stage for testing
# if test dependencies start to negatively impact deployment environment/size.
COPY pyproject.toml /workspace/pyproject.toml
COPY container/deps/vllm /workspace/container/deps/vllm
COPY python-wheel/python /workspace/python-wheel/python
# Add library for KV routing
COPY --from=dev ${VLLM_KV_CAPI_PATH} ${VLLM_KV_CAPI_PATH}
# Copy minimal set of files for deployment/examples
# FIXME: Use a more consolidated path after directory restructure
COPY examples/python_rs/llm/vllm /workspace/examples/python_rs/llm/vllm
WORKDIR /workspace
# FIXME: May want a modification with triton-distributed banner on entry
ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
CMD []
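# A minimal sketch of building just this lean stage directly with docker (the
# Dockerfile path and tag below are assumptions, not part of this commit):
#   docker build -f container/Dockerfile.vllm --target runtime \
#       --tag triton-distributed:runtime .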
@@ -67,11 +67,8 @@ TENSORRTLLM_BACKEND_REPO_TAG=triton-llm/v0.17.0
# trt-llm backend repo branch.
TENSORRTLLM_BACKEND_REBUILD=1
# vllm installation is done later in the Dockerfile so it will overwrite the
# vllm version installed in the base image.
VLLM_BASE_VERSION=25.01
VLLM_BASE_IMAGE=nvcr.io/nvidia/tritonserver
VLLM_BASE_IMAGE_TAG=${VLLM_BASE_VERSION}-py3
VLLM_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
VLLM_BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04"
get_options() {
while :; do
@@ -128,6 +125,14 @@ get_options() {
missing_requirement $1
fi
;;
--target)
if [ "$2" ]; then
TARGET=$2
shift
else
missing_requirement $1
fi
;;
--build-arg)
if [ "$2" ]; then
BUILD_ARGS+="--build-arg $2 "
@@ -212,13 +217,18 @@ get_options() {
if [ -z "$TAG" ]; then
TAG="--tag triton-distributed:${VERSION}-${FRAMEWORK,,}"
if [ ! -z "${TARGET}" ]; then
TAG="${TAG}-${TARGET}"
fi
fi
if [ ! -z "$PLATFORM" ]; then
PLATFORM="--platform ${PLATFORM}"
fi
if [ ! -z "$TARGET" ]; then
TARGET_STR="--target ${TARGET}"
fi
}
@@ -292,15 +302,17 @@ if [ ! -z ${HF_TOKEN} ]; then
fi
LATEST_TAG="--tag triton-distributed:latest-${FRAMEWORK,,}"
if [ ! -z "${TARGET}" ]; then
LATEST_TAG="${LATEST_TAG}-${TARGET}"
fi
show_image_options
if [ -z "$RUN_PREFIX" ]; then
set -x
fi
$RUN_PREFIX docker build -f $DOCKERFILE $PLATFORM $BUILD_ARGS $CACHE_FROM $TAG $LATEST_TAG $BUILD_CONTEXT $NO_CACHE
$RUN_PREFIX docker build -f $DOCKERFILE $TARGET_STR $PLATFORM $BUILD_ARGS $CACHE_FROM $TAG $LATEST_TAG $BUILD_CONTEXT $NO_CACHE
{ set +x; } 2>/dev/null
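# Hypothetical invocation of the new --target flag (the --framework flag and
# script path are assumed from context, not shown in this diff):
#   ./container/build.sh --framework vllm --target runtime
# Per the tagging logic above, this builds the "runtime" stage and tags the
# image triton-distributed:<version>-vllm-runtime.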
@@ -13,8 +13,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
pyright
pytest
pytest-asyncio
pytest-benchmark
pytest-cov
pytest-md-report
pytest-mypy
pytest-timeout
@@ -26,8 +26,6 @@ pre-commit
protobuf==5.27.3
pydantic==2.7.1
pyright
pytest-md-report
pytest-mypy
sentencepiece
transformers
tritonclient==2.53.0
@@ -14,7 +14,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
TAG=
RUN_PREFIX=
# Frameworks
@@ -62,6 +61,14 @@ get_options() {
missing_requirement $1
fi
;;
--target)
if [ "$2" ]; then
TARGET=$2
shift
else
missing_requirement $1
fi
;;
--name)
if [ "$2" ]; then
NAME=$2
@@ -87,9 +94,9 @@ get_options() {
missing_requirement $1
fi
;;
--command)
--entrypoint)
if [ "$2" ]; then
COMMAND=$2
ENTRYPOINT=$2
shift
else
missing_requirement $1
@@ -172,6 +179,9 @@ get_options() {
if [ -z "$IMAGE" ]; then
IMAGE="triton-distributed:latest-${FRAMEWORK,,}"
if [ ! -z "${TARGET}" ]; then
IMAGE="${IMAGE}-${TARGET}"
fi
fi
if [[ ${GPUS^^} == "NONE" ]]; then
@@ -186,6 +196,12 @@ get_options() {
NAME_STRING="--name ${NAME}"
fi
if [ -z "${ENTRYPOINT}" ]; then
ENTRYPOINT_STRING=""
else
ENTRYPOINT_STRING="--entrypoint ${ENTRYPOINT}"
fi
if [ ! -z "$MOUNT_WORKSPACE" ]; then
VOLUME_MOUNTS+=" -v ${SOURCE_DIR}/..:/workspace "
VOLUME_MOUNTS+=" -v /tmp:/tmp "
@@ -273,6 +289,6 @@ if [ -z "$RUN_PREFIX" ]; then
set -x
fi
${RUN_PREFIX} docker run ${GPU_STRING} ${INTERACTIVE} ${RM_STRING} --network host --shm-size=10G --ulimit memlock=-1 --ulimit stack=67108864 ${ENVIRONMENT_VARIABLES} ${VOLUME_MOUNTS} -w /workspace --cap-add CAP_SYS_PTRACE --ipc host ${PRIVILEGED_STRING} ${NAME_STRING} ${IMAGE} "${REMAINING_ARGS[@]}"
${RUN_PREFIX} docker run ${GPU_STRING} ${INTERACTIVE} ${RM_STRING} --network host --shm-size=10G --ulimit memlock=-1 --ulimit stack=67108864 ${ENVIRONMENT_VARIABLES} ${VOLUME_MOUNTS} -w /workspace --cap-add CAP_SYS_PTRACE --ipc host ${PRIVILEGED_STRING} ${NAME_STRING} ${ENTRYPOINT_STRING} ${IMAGE} "${REMAINING_ARGS[@]}"
{ set +x; } 2>/dev/null
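# Hypothetical invocation of the new flags (script path assumed from context):
#   ./container/run.sh --framework vllm --target runtime --entrypoint /bin/bash
# --target selects the triton-distributed:latest-<framework>-runtime image and
# --entrypoint overrides the image's default entrypoint, e.g. for debugging.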
#!/bin/bash -e
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# LIMITATIONS:
# - Must have at least 2 GPUs since CUDA_VISIBLE_DEVICES is hard-coded to 0 and 1
# - Must use a single node
if [ $# -gt 2 ]; then
echo "Usage: $0 [model_name] [endpoint_name]"
echo "Optional: model_name (default: deepseek-ai/DeepSeek-R1-Distill-Llama-8B)"
echo "Optional: endpoint_name (default: triton-init.vllm.generate)"
exit 1
fi
MODEL_NAME=${1:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"}
ENDPOINT_NAME=${2:-"triton-init.vllm.generate"}
SESSION_NAME="vllm_disagg"
WORKDIR="$(dirname $0)/.."
INIT_CMD="cd $WORKDIR"
########################################################
# TMUX SESSION SETUP
########################################################
# Start new session
tmux new-session -d -s "$SESSION_NAME"
# Split into 4 equal panes
tmux split-window -h
tmux split-window -v
tmux select-pane -t 0
tmux split-window -v
########################################################
# HTTP Server
########################################################
HTTP_HOST="localhost"
HTTP_PORT=8080
HTTP_CMD="TRD_LOG=DEBUG http --host ${HTTP_HOST} --port ${HTTP_PORT}"
tmux select-pane -t 0
tmux send-keys "$INIT_CMD && $HTTP_CMD" C-m
########################################################
# LLMCTL
########################################################
LLMCTL_CMD="sleep 5 && llmctl http remove chat-model $MODEL_NAME && \
llmctl http add chat-model $MODEL_NAME $ENDPOINT_NAME && \
llmctl http list chat-model"
tmux select-pane -t 1
tmux send-keys "$INIT_CMD && $LLMCTL_CMD" C-m
CURL_CMD="curl ${HTTP_HOST}:${HTTP_PORT}/v1/chat/completions \
-H \"Content-Type: application/json\" \
-d '{
\"model\": \"$MODEL_NAME\",
\"messages\": [
{\"role\": \"user\", \"content\": \"What is the capital of France?\"}
],
\"stream\": true,
\"max_tokens\": 10
}'"
# Prepare a curl command for a quick test, but don't execute it since the server
# needs to spin up first.
tmux send-keys "$CURL_CMD"
########################################################
# Processor
########################################################
# skip
########################################################
# Router
########################################################
# skip
########################################################
# Prefill
########################################################
PREFILL_CMD="VLLM_WORKER_MULTIPROC_METHOD=spawn CUDA_VISIBLE_DEVICES=0 \
python3 -m disaggregated.prefill_worker \
--model $MODEL_NAME \
--gpu-memory-utilization 0.8 \
--enforce-eager \
--max-model-len 1000 \
--tensor-parallel-size 1 \
--kv-transfer-config \
'{\"kv_connector\":\"TritonNcclConnector\",\"kv_role\":\"kv_producer\",\"kv_rank\":0,\"kv_parallel_size\":2}'"
tmux select-pane -t 2
tmux send-keys "$INIT_CMD && $PREFILL_CMD" C-m
########################################################
# Decode
########################################################
DECODE_CMD="VLLM_WORKER_MULTIPROC_METHOD=spawn CUDA_VISIBLE_DEVICES=1 \
python3 -m disaggregated.decode_worker \
--model $MODEL_NAME \
--gpu-memory-utilization 0.8 \
--enforce-eager \
--max-model-len 1000 \
--tensor-parallel-size 1 \
--kv-transfer-config \
'{\"kv_connector\":\"TritonNcclConnector\",\"kv_role\":\"kv_consumer\",\"kv_rank\":1,\"kv_parallel_size\":2}'"
tmux select-pane -t 3
tmux send-keys "$INIT_CMD && $DECODE_CMD" C-m
tmux attach-session -t "$SESSION_NAME"
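# To tear the demo down after detaching (standard tmux usage):
#   tmux kill-session -t "$SESSION_NAME"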
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
@@ -13,8 +14,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#!/bin/bash
# LIMITATIONS:
# - Must use a single GPU for workers as CUDA_VISIBLE_DEVICES is set to a fixed value
# - Must use a single node