Commit d8aada0b authored by ptarasiewiczNV's avatar ptarasiewiczNV Committed by GitHub
Browse files

feat: Add Dockerfile.vllm (#184)

parent 9a7b506c
...@@ -53,7 +53,9 @@ jobs: ...@@ -53,7 +53,9 @@ jobs:
with: with:
filters: | filters: |
vllm: vllm:
- 'container/Dockerfile.vllm'
- 'examples/python/llm/**' - 'examples/python/llm/**'
- 'examples/python_rs/llm/**'
- 'container/deps/requirements.vllm.txt' - 'container/deps/requirements.vllm.txt'
- 'container/deps/vllm/**' - 'container/deps/vllm/**'
- name: Trigger Pipeline - name: Trigger Pipeline
......
...@@ -128,16 +128,6 @@ RUN mkdir /opt/triton && \ ...@@ -128,16 +128,6 @@ RUN mkdir /opt/triton && \
uv build && \ uv build && \
uv pip install dist/triton_distributed_rs*cp312*.whl uv pip install dist/triton_distributed_rs*cp312*.whl
# Install patched vllm
ARG VLLM_REF="v0.7.2"
ARG VLLM_PATCH="vllm_v0.7.2.patch"
RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
if [[ "$FRAMEWORK" == "VLLM" ]]; then \
source /opt/triton/venv/bin/activate && \
bash /tmp/deps/vllm/install.sh --patch /tmp/deps/vllm/${VLLM_PATCH} --ref ${VLLM_REF} --install-cmd "uv pip install --editable" --use-precompiled --installation-dir /opt/vllm ; \
fi
# Install triton_distributed_rs wheel globally in container for tests that # Install triton_distributed_rs wheel globally in container for tests that
# currently run without virtual environment activated. # currently run without virtual environment activated.
# TODO: In future, we may use a virtualenv for everything and remove this. # TODO: In future, we may use a virtualenv for everything and remove this.
......
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Base image is parameterized so CI can build against other Triton releases.
ARG BASE_IMAGE="nvcr.io/nvidia/tritonserver"
ARG BASE_IMAGE_TAG="25.01-py3"

FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS triton-distributed

USER root

# Rust build/dev dependencies.
# NOTE: update and install are chained with '&&' (a ';' would ignore a failed
# 'apt-get update'), and the apt lists are removed in the same layer so the
# stale package index never persists into the image.
RUN apt-get update && \
    apt-get install -y gdb protobuf-compiler && \
    rm -rf /var/lib/apt/lists/*
RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
ENV PATH="/root/.cargo/bin:${PATH}"

# Install uv and create virtualenv
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
RUN mkdir /opt/triton && \
    uv venv /opt/triton/venv --python 3.12

# Activate virtual environment for all subsequent RUN steps and at runtime.
ENV VIRTUAL_ENV=/opt/triton/venv
ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"

# Install OpenAI-compatible frontend
# TODO: can this be removed since we have rust http server?
ARG OPENAI_SERVER_TAG="r25.01"
# Optional commit to pin the server checkout to; when empty (the default) the
# tip of OPENAI_SERVER_TAG is used. Previously this variable was referenced
# without being declared, so the 'git checkout' silently did nothing.
ARG SERVER_OPENAI_COMMIT=""
RUN mkdir -p /opt/tritonserver/python && \
    cd /opt/tritonserver/python && \
    rm -rf openai && \
    git clone -b ${OPENAI_SERVER_TAG} --single-branch https://github.com/triton-inference-server/server.git && \
    cd server && \
    if [ -n "${SERVER_OPENAI_COMMIT}" ]; then git checkout ${SERVER_OPENAI_COMMIT}; fi && \
    cd .. && \
    mv server/python/openai openai && \
    chown -R root:root openai && \
    chmod 755 openai && \
    chmod -R go-w openai && \
    rm -rf server && \
    uv pip install -r openai/requirements.txt

# Common dependencies. Requirements files are bind-mounted so they never
# become image layers.
RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requirements.txt \
    uv pip install --requirement /tmp/requirements.txt
RUN --mount=type=bind,source=./container/deps/requirements.nats.txt,target=/tmp/requirements.txt \
    uv pip install --requirement /tmp/requirements.txt
RUN --mount=type=bind,source=./container/deps/requirements.test.txt,target=/tmp/requirements.txt \
    uv pip install --requirement /tmp/requirements.txt

# Finish pyright install (first invocation downloads its node runtime; doing
# it here keeps later CI runs offline-friendly).
RUN pyright --help > /dev/null 2>&1

# In Process Python API Install
RUN find /opt/tritonserver/python -maxdepth 1 -type f -name \
    "tritonserver-*.whl" | xargs -I {} uv pip install --force-reinstall --upgrade {}[all]

# GENAI Perf Install
ARG GENAI_PERF_TAG="r25.01"
RUN uv pip install "git+https://github.com/triton-inference-server/perf_analyzer.git@${GENAI_PERF_TAG}#subdirectory=genai-perf"

# Install NATS
RUN wget https://github.com/nats-io/nats-server/releases/download/v2.10.24/nats-server-v2.10.24-amd64.deb && \
    dpkg -i nats-server-v2.10.24-amd64.deb && \
    rm -f nats-server-v2.10.24-amd64.deb

# Enable Git operations in the /workspace directory
RUN printf "[safe]\n    directory=/workspace\n" > /root/.gitconfig

RUN ln -sf /bin/bash /bin/sh

# Install NGINX and utilities
# TODO: can this be removed since we do not use the nginx?
# NOTE: 'apt-get update' is repeated here because the earlier layer cleaned
# its lists; installing from a stale (or absent) index would fail or pull
# outdated packages.
RUN apt-get update && \
    apt-get install -y nginx nvtop tmux && \
    rm -rf /var/lib/apt/lists/*
RUN rm -rf /etc/nginx/sites-enabled/default

# Working directory
WORKDIR /workspace

COPY runtime /workspace/runtime
RUN cd runtime/rust && \
    cargo build --release --locked && cargo doc --no-deps

# Build triton_distributed_rs wheel
RUN cd runtime/rust/python-wheel && \
    uv build && \
    uv pip install dist/triton_distributed_rs*cp312*.whl

# Install patched vllm. The patch filename is derived from VLLM_REF so the
# two ARGs cannot drift apart when only the ref is overridden.
ARG VLLM_REF="v0.7.2"
ARG VLLM_PATCH="vllm_${VLLM_REF}.patch"
RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
    bash /tmp/deps/vllm/install.sh --patch /tmp/deps/vllm/${VLLM_PATCH} --ref ${VLLM_REF} --install-cmd "uv pip install --editable" --use-precompiled --installation-dir /opt/vllm

# Install triton_distributed_rs wheel globally
# RUN pip install runtime/rust/python-wheel/dist/triton_distributed_rs*cp312*.whl

COPY icp /workspace/icp
RUN /workspace/icp/protos/gen_python.sh

# Install python packages. The pretend-version lets setuptools-scm work
# without a git checkout inside the image.
ARG PYTHON_PACKAGE_VERSION=0.0.1.dev+unknown
RUN SETUPTOOLS_SCM_PRETEND_VERSION_FOR_TRITON_DISTRIBUTED_ICP=${PYTHON_PACKAGE_VERSION} uv pip install -e /workspace/icp/python
RUN SETUPTOOLS_SCM_PRETEND_VERSION_FOR_TRITON_DISTRIBUTED_RUNTIME=${PYTHON_PACKAGE_VERSION} uv pip install -e /workspace/runtime/python

# Copy remaining files last so source-only changes do not invalidate the
# dependency layers above.
COPY . /workspace

# Environment setup
ENV PYTHONPATH="${PYTHONPATH}:/workspace/examples/python:/opt/tritonserver/python/openai/openai_frontend"
ENV RAPIDS_LIBUCX_PREFER_SYSTEM_LIBRARY=true

CMD []
ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
...@@ -265,6 +265,11 @@ error() { ...@@ -265,6 +265,11 @@ error() {
get_options "$@" get_options "$@"
# Update DOCKERFILE if framework is VLLM
if [[ $FRAMEWORK == "VLLM" ]]; then
DOCKERFILE=${SOURCE_DIR}/Dockerfile.vllm
fi
# BUILD DEV IMAGE # BUILD DEV IMAGE
BUILD_ARGS+=" --build-arg BASE_IMAGE=$BASE_IMAGE --build-arg BASE_IMAGE_TAG=$BASE_IMAGE_TAG --build-arg FRAMEWORK=$FRAMEWORK --build-arg ${FRAMEWORK}_FRAMEWORK=1 --build-arg VERSION=$VERSION --build-arg PYTHON_PACKAGE_VERSION=$PYTHON_PACKAGE_VERSION" BUILD_ARGS+=" --build-arg BASE_IMAGE=$BASE_IMAGE --build-arg BASE_IMAGE_TAG=$BASE_IMAGE_TAG --build-arg FRAMEWORK=$FRAMEWORK --build-arg ${FRAMEWORK}_FRAMEWORK=1 --build-arg VERSION=$VERSION --build-arg PYTHON_PACKAGE_VERSION=$PYTHON_PACKAGE_VERSION"
......
...@@ -59,9 +59,6 @@ Run the server and client components in separate terminal sessions: ...@@ -59,9 +59,6 @@ Run the server and client components in separate terminal sessions:
**Terminal 1 - Server:** **Terminal 1 - Server:**
```bash ```bash
# Activate virtual environment
source /opt/triton/venv/bin/activate
# Launch worker # Launch worker
cd /workspace/examples/python_rs/llm/vllm cd /workspace/examples/python_rs/llm/vllm
python3 -m monolith.worker \ python3 -m monolith.worker \
...@@ -72,9 +69,6 @@ python3 -m monolith.worker \ ...@@ -72,9 +69,6 @@ python3 -m monolith.worker \
**Terminal 2 - Client:** **Terminal 2 - Client:**
```bash ```bash
# Activate virtual environment
source /opt/triton/venv/bin/activate
# Run client # Run client
cd /workspace/examples/python_rs/llm/vllm cd /workspace/examples/python_rs/llm/vllm
python3 -m common.client \ python3 -m common.client \
...@@ -104,9 +98,6 @@ This deployment option splits the model serving across prefill and decode worker ...@@ -104,9 +98,6 @@ This deployment option splits the model serving across prefill and decode worker
**Terminal 1 - Prefill Worker:** **Terminal 1 - Prefill Worker:**
```bash ```bash
# Activate virtual environment
source /opt/triton/venv/bin/activate
# Launch prefill worker # Launch prefill worker
cd /workspace/examples/python_rs/llm/vllm cd /workspace/examples/python_rs/llm/vllm
VLLM_WORKER_MULTIPROC_METHOD=spawn CUDA_VISIBLE_DEVICES=0 python3 -m disaggregated.prefill_worker \ VLLM_WORKER_MULTIPROC_METHOD=spawn CUDA_VISIBLE_DEVICES=0 python3 -m disaggregated.prefill_worker \
...@@ -121,9 +112,6 @@ VLLM_WORKER_MULTIPROC_METHOD=spawn CUDA_VISIBLE_DEVICES=0 python3 -m disaggregat ...@@ -121,9 +112,6 @@ VLLM_WORKER_MULTIPROC_METHOD=spawn CUDA_VISIBLE_DEVICES=0 python3 -m disaggregat
**Terminal 2 - Decode Worker:** **Terminal 2 - Decode Worker:**
```bash ```bash
# Activate virtual environment
source /opt/triton/venv/bin/activate
# Launch decode worker # Launch decode worker
cd /workspace/examples/python_rs/llm/vllm cd /workspace/examples/python_rs/llm/vllm
VLLM_WORKER_MULTIPROC_METHOD=spawn CUDA_VISIBLE_DEVICES=1,2 python3 -m disaggregated.decode_worker \ VLLM_WORKER_MULTIPROC_METHOD=spawn CUDA_VISIBLE_DEVICES=1,2 python3 -m disaggregated.decode_worker \
...@@ -138,9 +126,6 @@ VLLM_WORKER_MULTIPROC_METHOD=spawn CUDA_VISIBLE_DEVICES=1,2 python3 -m disaggreg ...@@ -138,9 +126,6 @@ VLLM_WORKER_MULTIPROC_METHOD=spawn CUDA_VISIBLE_DEVICES=1,2 python3 -m disaggreg
**Terminal 3 - Client:** **Terminal 3 - Client:**
```bash ```bash
# Activate virtual environment
source /opt/triton/venv/bin/activate
# Run client # Run client
cd /workspace/examples/python_rs/llm/vllm cd /workspace/examples/python_rs/llm/vllm
python3 -m common.client \ python3 -m common.client \
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment