Commit d8aada0b authored by ptarasiewiczNV's avatar ptarasiewiczNV Committed by GitHub
Browse files

feat: Add Dockerfile.vllm (#184)

parent 9a7b506c
...@@ -53,7 +53,9 @@ jobs: ...@@ -53,7 +53,9 @@ jobs:
with: with:
filters: | filters: |
vllm: vllm:
- 'container/Dockerfile.vllm'
- 'examples/python/llm/**' - 'examples/python/llm/**'
- 'examples/python_rs/llm/**'
- 'container/deps/requirements.vllm.txt' - 'container/deps/requirements.vllm.txt'
- 'container/deps/vllm/**' - 'container/deps/vllm/**'
- name: Trigger Pipeline - name: Trigger Pipeline
......
...@@ -128,16 +128,6 @@ RUN mkdir /opt/triton && \ ...@@ -128,16 +128,6 @@ RUN mkdir /opt/triton && \
uv build && \ uv build && \
uv pip install dist/triton_distributed_rs*cp312*.whl uv pip install dist/triton_distributed_rs*cp312*.whl
# Install patched vllm
ARG VLLM_REF="v0.7.2"
ARG VLLM_PATCH="vllm_v0.7.2.patch"
RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
if [[ "$FRAMEWORK" == "VLLM" ]]; then \
source /opt/triton/venv/bin/activate && \
bash /tmp/deps/vllm/install.sh --patch /tmp/deps/vllm/${VLLM_PATCH} --ref ${VLLM_REF} --install-cmd "uv pip install --editable" --use-precompiled --installation-dir /opt/vllm ; \
fi
# Install triton_distributed_rs wheel globally in container for tests that # Install triton_distributed_rs wheel globally in container for tests that
# currently run without virtual environment activated. # currently run without virtual environment activated.
# TODO: In future, we may use a virtualenv for everything and remove this. # TODO: In future, we may use a virtualenv for everything and remove this.
......
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Base image is parameterized so CI can build against other Triton releases.
ARG BASE_IMAGE="nvcr.io/nvidia/tritonserver"
ARG BASE_IMAGE_TAG="25.01-py3"

FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS triton-distributed

USER root

# Rust build/dev dependencies.
# NOTE: update and install are chained with '&&' (a ';' would ignore a failed
# 'apt-get update'), and the apt lists are removed in the same layer so the
# stale package index never persists into the image.
RUN apt-get update && \
    apt-get install -y gdb protobuf-compiler && \
    rm -rf /var/lib/apt/lists/*
RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
ENV PATH="/root/.cargo/bin:${PATH}"

# Install uv and create virtualenv
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
RUN mkdir /opt/triton && \
    uv venv /opt/triton/venv --python 3.12

# Activate virtual environment for all subsequent RUN steps and at runtime.
ENV VIRTUAL_ENV=/opt/triton/venv
ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"

# Install OpenAI-compatible frontend
# TODO: can this be removed since we have rust http server?
ARG OPENAI_SERVER_TAG="r25.01"
# Optional commit to pin the server checkout to; when empty (the default) the
# tip of OPENAI_SERVER_TAG is used. Previously this variable was referenced
# without being declared, so the 'git checkout' silently did nothing.
ARG SERVER_OPENAI_COMMIT=""
RUN mkdir -p /opt/tritonserver/python && \
    cd /opt/tritonserver/python && \
    rm -rf openai && \
    git clone -b ${OPENAI_SERVER_TAG} --single-branch https://github.com/triton-inference-server/server.git && \
    cd server && \
    if [ -n "${SERVER_OPENAI_COMMIT}" ]; then git checkout ${SERVER_OPENAI_COMMIT}; fi && \
    cd .. && \
    mv server/python/openai openai && \
    chown -R root:root openai && \
    chmod 755 openai && \
    chmod -R go-w openai && \
    rm -rf server && \
    uv pip install -r openai/requirements.txt

# Common dependencies. Requirements files are bind-mounted so they never
# become image layers.
RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requirements.txt \
    uv pip install --requirement /tmp/requirements.txt
RUN --mount=type=bind,source=./container/deps/requirements.nats.txt,target=/tmp/requirements.txt \
    uv pip install --requirement /tmp/requirements.txt
RUN --mount=type=bind,source=./container/deps/requirements.test.txt,target=/tmp/requirements.txt \
    uv pip install --requirement /tmp/requirements.txt

# Finish pyright install (first invocation downloads its node runtime; doing
# it here keeps later CI runs offline-friendly).
RUN pyright --help > /dev/null 2>&1

# In Process Python API Install
RUN find /opt/tritonserver/python -maxdepth 1 -type f -name \
    "tritonserver-*.whl" | xargs -I {} uv pip install --force-reinstall --upgrade {}[all]

# GENAI Perf Install
ARG GENAI_PERF_TAG="r25.01"
RUN uv pip install "git+https://github.com/triton-inference-server/perf_analyzer.git@${GENAI_PERF_TAG}#subdirectory=genai-perf"

# Install NATS
RUN wget https://github.com/nats-io/nats-server/releases/download/v2.10.24/nats-server-v2.10.24-amd64.deb && \
    dpkg -i nats-server-v2.10.24-amd64.deb && \
    rm -f nats-server-v2.10.24-amd64.deb

# Enable Git operations in the /workspace directory
RUN printf "[safe]\n    directory=/workspace\n" > /root/.gitconfig

RUN ln -sf /bin/bash /bin/sh

# Install NGINX and utilities
# TODO: can this be removed since we do not use the nginx?
# NOTE: 'apt-get update' is repeated here because the earlier layer cleaned
# its lists; installing from a stale (or absent) index would fail or pull
# outdated packages.
RUN apt-get update && \
    apt-get install -y nginx nvtop tmux && \
    rm -rf /var/lib/apt/lists/*
RUN rm -rf /etc/nginx/sites-enabled/default

# Working directory
WORKDIR /workspace

COPY runtime /workspace/runtime
RUN cd runtime/rust && \
    cargo build --release --locked && cargo doc --no-deps

# Build triton_distributed_rs wheel
RUN cd runtime/rust/python-wheel && \
    uv build && \
    uv pip install dist/triton_distributed_rs*cp312*.whl

# Install patched vllm. The patch filename is derived from VLLM_REF so the
# two ARGs cannot drift apart when only the ref is overridden.
ARG VLLM_REF="v0.7.2"
ARG VLLM_PATCH="vllm_${VLLM_REF}.patch"
RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
    bash /tmp/deps/vllm/install.sh --patch /tmp/deps/vllm/${VLLM_PATCH} --ref ${VLLM_REF} --install-cmd "uv pip install --editable" --use-precompiled --installation-dir /opt/vllm

# Install triton_distributed_rs wheel globally
# RUN pip install runtime/rust/python-wheel/dist/triton_distributed_rs*cp312*.whl

COPY icp /workspace/icp
RUN /workspace/icp/protos/gen_python.sh

# Install python packages. The pretend-version lets setuptools-scm work
# without a git checkout inside the image.
ARG PYTHON_PACKAGE_VERSION=0.0.1.dev+unknown
RUN SETUPTOOLS_SCM_PRETEND_VERSION_FOR_TRITON_DISTRIBUTED_ICP=${PYTHON_PACKAGE_VERSION} uv pip install -e /workspace/icp/python
RUN SETUPTOOLS_SCM_PRETEND_VERSION_FOR_TRITON_DISTRIBUTED_RUNTIME=${PYTHON_PACKAGE_VERSION} uv pip install -e /workspace/runtime/python

# Copy remaining files last so source-only changes do not invalidate the
# dependency layers above.
COPY . /workspace

# Environment setup
ENV PYTHONPATH="${PYTHONPATH}:/workspace/examples/python:/opt/tritonserver/python/openai/openai_frontend"
ENV RAPIDS_LIBUCX_PREFER_SYSTEM_LIBRARY=true

CMD []
ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
...@@ -265,6 +265,11 @@ error() { ...@@ -265,6 +265,11 @@ error() {
get_options "$@" get_options "$@"
# Update DOCKERFILE if framework is VLLM
if [[ $FRAMEWORK == "VLLM" ]]; then
DOCKERFILE=${SOURCE_DIR}/Dockerfile.vllm
fi
# BUILD DEV IMAGE # BUILD DEV IMAGE
BUILD_ARGS+=" --build-arg BASE_IMAGE=$BASE_IMAGE --build-arg BASE_IMAGE_TAG=$BASE_IMAGE_TAG --build-arg FRAMEWORK=$FRAMEWORK --build-arg ${FRAMEWORK}_FRAMEWORK=1 --build-arg VERSION=$VERSION --build-arg PYTHON_PACKAGE_VERSION=$PYTHON_PACKAGE_VERSION" BUILD_ARGS+=" --build-arg BASE_IMAGE=$BASE_IMAGE --build-arg BASE_IMAGE_TAG=$BASE_IMAGE_TAG --build-arg FRAMEWORK=$FRAMEWORK --build-arg ${FRAMEWORK}_FRAMEWORK=1 --build-arg VERSION=$VERSION --build-arg PYTHON_PACKAGE_VERSION=$PYTHON_PACKAGE_VERSION"
......
...@@ -59,9 +59,6 @@ Run the server and client components in separate terminal sessions: ...@@ -59,9 +59,6 @@ Run the server and client components in separate terminal sessions:
**Terminal 1 - Server:** **Terminal 1 - Server:**
```bash ```bash
# Activate virtual environment
source /opt/triton/venv/bin/activate
# Launch worker # Launch worker
cd /workspace/examples/python_rs/llm/vllm cd /workspace/examples/python_rs/llm/vllm
python3 -m monolith.worker \ python3 -m monolith.worker \
...@@ -72,9 +69,6 @@ python3 -m monolith.worker \ ...@@ -72,9 +69,6 @@ python3 -m monolith.worker \
**Terminal 2 - Client:** **Terminal 2 - Client:**
```bash ```bash
# Activate virtual environment
source /opt/triton/venv/bin/activate
# Run client # Run client
cd /workspace/examples/python_rs/llm/vllm cd /workspace/examples/python_rs/llm/vllm
python3 -m common.client \ python3 -m common.client \
...@@ -104,9 +98,6 @@ This deployment option splits the model serving across prefill and decode worker ...@@ -104,9 +98,6 @@ This deployment option splits the model serving across prefill and decode worker
**Terminal 1 - Prefill Worker:** **Terminal 1 - Prefill Worker:**
```bash ```bash
# Activate virtual environment
source /opt/triton/venv/bin/activate
# Launch prefill worker # Launch prefill worker
cd /workspace/examples/python_rs/llm/vllm cd /workspace/examples/python_rs/llm/vllm
VLLM_WORKER_MULTIPROC_METHOD=spawn CUDA_VISIBLE_DEVICES=0 python3 -m disaggregated.prefill_worker \ VLLM_WORKER_MULTIPROC_METHOD=spawn CUDA_VISIBLE_DEVICES=0 python3 -m disaggregated.prefill_worker \
...@@ -121,9 +112,6 @@ VLLM_WORKER_MULTIPROC_METHOD=spawn CUDA_VISIBLE_DEVICES=0 python3 -m disaggregat ...@@ -121,9 +112,6 @@ VLLM_WORKER_MULTIPROC_METHOD=spawn CUDA_VISIBLE_DEVICES=0 python3 -m disaggregat
**Terminal 2 - Decode Worker:** **Terminal 2 - Decode Worker:**
```bash ```bash
# Activate virtual environment
source /opt/triton/venv/bin/activate
# Launch decode worker # Launch decode worker
cd /workspace/examples/python_rs/llm/vllm cd /workspace/examples/python_rs/llm/vllm
VLLM_WORKER_MULTIPROC_METHOD=spawn CUDA_VISIBLE_DEVICES=1,2 python3 -m disaggregated.decode_worker \ VLLM_WORKER_MULTIPROC_METHOD=spawn CUDA_VISIBLE_DEVICES=1,2 python3 -m disaggregated.decode_worker \
...@@ -138,9 +126,6 @@ VLLM_WORKER_MULTIPROC_METHOD=spawn CUDA_VISIBLE_DEVICES=1,2 python3 -m disaggreg ...@@ -138,9 +126,6 @@ VLLM_WORKER_MULTIPROC_METHOD=spawn CUDA_VISIBLE_DEVICES=1,2 python3 -m disaggreg
**Terminal 3 - Client:** **Terminal 3 - Client:**
```bash ```bash
# Activate virtual environment
source /opt/triton/venv/bin/activate
# Run client # Run client
cd /workspace/examples/python_rs/llm/vllm cd /workspace/examples/python_rs/llm/vllm
python3 -m common.client \ python3 -m common.client \
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment