OpenDAS / dynamo

Commit 5bcdb734
authored Mar 11, 2025 by Neelay Shah, committed by GitHub on Mar 11, 2025
refactor: rename vllm_nixl to vllm and make default (#100)
Parent: a7c35dcf
Changes: 51
Showing 20 changed files with 341 additions and 1810 deletions (+341, -1810)
.github/workflows/pre-merge-python.yml                        +1    -2
container/Dockerfile.vllm                                     +133  -53
container/Dockerfile.vllm_nixl                                +0    -264
container/build.sh                                            +6    -8
container/run.sh                                              +5    -1
examples/python_rs/llm/vllm/README.md                         +196  -300
examples/python_rs/llm/vllm/__init__.py                       +0    -0
examples/python_rs/llm/vllm/common/base_engine.py             +0    -73
examples/python_rs/llm/vllm/common/chat_processor.py          +0    -240
examples/python_rs/llm/vllm/common/client.py                  +0    -77
examples/python_rs/llm/vllm/common/parser.py                  +0    -25
examples/python_rs/llm/vllm/common/protocol.py                +0    -115
examples/python_rs/llm/vllm/disagg_router.py                  +0    -0
examples/python_rs/llm/vllm/disaggregated/__init__.py         +0    -0
examples/python_rs/llm/vllm/disaggregated/decode_worker.py    +0    -129
examples/python_rs/llm/vllm/disaggregated/prefill_worker.py   +0    -82
examples/python_rs/llm/vllm/kv_router.py                      +0    -0
examples/python_rs/llm/vllm/kv_router/__init__.py             +0    -0
examples/python_rs/llm/vllm/kv_router/processor.py            +0    -208
examples/python_rs/llm/vllm/kv_router/router.py               +0    -233
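Taken together, the change set folds the vllm_nixl example tree and Dockerfile into the plain vllm name and makes vLLM the default framework in container/build.sh and container/run.sh. A minimal sketch of what invocations look like after this change; the --framework flag name is an assumption used for illustration and is not shown in this diff:

# Hypothetical invocations; only the DEFAULT_FRAMEWORK and normalization behavior below are from this commit.
./container/build.sh                         # DEFAULT_FRAMEWORK is now VLLM, so this builds the vLLM image
./container/build.sh --framework vllm        # explicit selection; uses ${SOURCE_DIR}/Dockerfile.vllm
./container/build.sh --framework vllm_nixl   # legacy name; get_options() normalizes VLLM_NIXL to VLLM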
.github/workflows/pre-merge-python.yml
@@ -30,8 +30,7 @@ jobs:
     strategy:
       matrix:
         framework:
-          - standard
-          - vllm_nixl
+          - vllm
     name: Build and Test - ${{ matrix.framework }}
     env:
       CONTAINER_ID: test_${{ github.run_id }}_${{ github.run_attempt }}_${{ github.job }}_${{ matrix.framework }}
container/Dockerfile.vllm
@@ -8,6 +8,133 @@ FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS dev
 USER root
### NIXL SETUP ###
ARG MOFED_VERSION=24.10-1.1.4.0
ARG PYTHON_VERSION=3.12
ARG NSYS_URL=https://developer.nvidia.com/downloads/assets/tools/secure/nsight-systems/2025_1/
ARG NSYS_PKG=NsightSystems-linux-cli-public-2025.1.1.131-3554042.deb
RUN apt-get update -y && apt-get -y install curl \
git \
libnuma-dev \
numactl \
wget \
autotools-dev \
automake \
libtool \
libz-dev \
libiberty-dev \
flex \
build-essential \
cmake \
libibverbs-dev \
libgoogle-glog-dev \
libgtest-dev \
libjsoncpp-dev \
libpython3-dev \
libboost-all-dev \
libssl-dev \
libgrpc-dev \
libgrpc++-dev \
libprotobuf-dev \
protobuf-compiler-grpc \
pybind11-dev \
python3-full \
python3-pip \
python3-numpy \
etcd-server \
net-tools \
pciutils \
libpci-dev \
vim \
tmux \
screen \
ibverbs-utils \
libibmad-dev
RUN apt-get install -y linux-tools-common linux-tools-generic ethtool iproute2
RUN apt-get install -y dkms linux-headers-generic
RUN apt-get install -y meson ninja-build uuid-dev gdb
RUN apt-get update && apt install -y wget libglib2.0-0
RUN wget ${NSYS_URL}${NSYS_PKG} && dpkg -i $NSYS_PKG && rm $NSYS_PKG
RUN cd /usr/local/src && \
curl -fSsL "https://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VERSION}/MLNX_OFED_LINUX-${MOFED_VERSION}-ubuntu24.04-x86_64.tgz" -o mofed.tgz && \
tar -xf /usr/local/src/mofed.tgz && \
cd MLNX_OFED_LINUX-* && \
apt-get update && apt-get install -y --no-install-recommends \
./DEBS/libibverbs* ./DEBS/ibverbs-providers* ./DEBS/librdmacm* ./DEBS/libibumad* && \
rm -rf /var/lib/apt/lists/* /usr/local/src/*
ENV LIBRARY_PATH=$LIBRARY_PATH:/usr/local/cuda/lib64 \
LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64
ENV LIBRARY_PATH=$LIBRARY_PATH:/usr/local/lib \
LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib
WORKDIR /workspace
RUN git clone https://github.com/NVIDIA/gdrcopy.git
RUN PREFIX=/usr/local DESTLIB=/usr/local/lib make -C /workspace/gdrcopy lib_install
RUN cp gdrcopy/src/libgdrapi.so.2.* /usr/lib/x86_64-linux-gnu/
RUN ldconfig
ARG UCX_VERSION=v1.18.0
RUN cd /usr/local/src && \
curl -fSsL "https://github.com/openucx/ucx/tarball/${UCX_VERSION}" | tar xz && \
cd openucx-ucx* && \
./autogen.sh && ./configure \
--enable-shared \
--disable-static \
--disable-doxygen-doc \
--enable-optimizations \
--enable-cma \
--enable-devel-headers \
--with-cuda=/usr/local/cuda \
--with-verbs \
--with-dm \
--with-gdrcopy=/usr/local \
--enable-mt \
--with-mlx5-dv && \
make -j && \
make -j install-strip && \
ldconfig
ENV LD_LIBRARY_PATH=/usr/lib:$LD_LIBRARY_PATH
ENV CPATH=/usr/include:$CPATH
ENV PATH=/usr/bin:$PATH
ENV PKG_CONFIG_PATH=/usr/lib/pkgconfig:$PKG_CONFIG_PATH
SHELL ["/bin/bash", "-c"]
WORKDIR /workspace
ENV LD_LIBRARY_PATH=/usr/local/ompi/lib:$LD_LIBRARY_PATH
ENV CPATH=/usr/local/ompi/include:$CPATH
ENV PATH=/usr/local/ompi/bin:$PATH
ENV PKG_CONFIG_PATH=/usr/local/ompi/lib/pkgconfig:$PKG_CONFIG_PATH
COPY --from=nixl . /opt/nixl
RUN cd /opt/nixl && \
mkdir build && \
meson setup build/ --prefix=/usr/local/nixl && \
cd build/ && \
ninja && \
ninja install
ENV LD_LIBRARY_PATH=/usr/local/nixl/lib/x86_64-linux-gnu/:$LD_LIBRARY_PATH
ENV PYTHONPATH=/usr/local/nixl/lib/python3/dist-packages/:/opt/nixl/test/python/:$PYTHONPATH
ENV UCX_TLS=^cuda_ipc
ENV NIXL_PLUGIN_DIR=/usr/local/nixl/lib/x86_64-linux-gnu/plugins
RUN ls -l /usr/local/nixl/
RUN ls -l /usr/local/nixl/include/
RUN ls -l /usr/local/nixl/include/internal/
RUN ls /opt/nixl
 # Install utilities
 RUN apt update -y && apt install -y git wget curl nvtop tmux vim
 # nats
@@ -31,6 +158,10 @@ RUN mkdir /opt/dynamo && \
 ENV VIRTUAL_ENV=/opt/dynamo/venv
 ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"
+# Common dependencies
+RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requirements.txt \
+    uv pip install --requirement /tmp/requirements.txt
 # Install patched vllm - keep this early in Dockerfile to avoid
 # rebuilds from unrelated source code changes
 ARG VLLM_REF="v0.7.2"
@@ -39,7 +170,6 @@ RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
     bash /tmp/deps/vllm/install.sh --patch /tmp/deps/vllm/${VLLM_PATCH} --ref ${VLLM_REF} --install-cmd "uv pip install --editable" --use-precompiled --installation-dir /opt/vllm
 # Install genai-perf for benchmarking
-# TODO: Move to tag when fix for genai-perf will be released
 ARG GENAI_PERF_TAG="25d0188713adc47868d6b3f22426375237a90529"
 RUN uv pip install "git+https://github.com/triton-inference-server/perf_analyzer.git@${GENAI_PERF_TAG}#subdirectory=genai-perf"
@@ -47,7 +177,7 @@ RUN uv pip install "git+https://github.com/triton-inference-server/perf_analyzer
 RUN --mount=type=bind,source=./container/deps/requirements.test.txt,target=/tmp/requirements.txt \
     uv pip install --requirement /tmp/requirements.txt
-### MISC UTILITY SETUP ###
+# ### MISC UTILITY SETUP ###
 # Finish pyright install
 RUN pyright --help > /dev/null 2>&1
@@ -103,11 +233,6 @@ COPY lib/bindings /workspace/lib/bindings
 RUN cd lib/bindings/c && \
     cargo build --release --locked && cargo doc --no-deps
-# Generate C bindings for kv cache routing in vLLM
-COPY lib/bindings /workspace/lib/bindings
-RUN cd lib/bindings/c && \
-    cargo build --release --locked && cargo doc --no-deps
 COPY deploy/dynamo/sdk /workspace/deploy/dynamo/sdk
 # Build dynamo wheel
 RUN source /opt/dynamo/venv/bin/activate && \
@@ -135,50 +260,5 @@ ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
 CMD []
-### Lean Runtime Image Stage ###
-# FIXME: Separate build and runtime images
-FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS runtime
-USER root
-# Install tools for interactive convenience
-RUN apt update -y && \
-    apt install -y curl tmux vim && \
-    echo "set -g mouse on" >> /root/.tmux.conf
-# Set environment variables
-ENV VIRTUAL_ENV=/opt/dynamo/venv
-ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"
-ENV RAPIDS_LIBUCX_PREFER_SYSTEM_LIBRARY=true
-ENV VLLM_KV_CAPI_PATH="/opt/dynamo/bindings/lib/libdynamo_llm_capi.so"
-# Copy binaries
-COPY --from=dev /usr/local/bin/http /usr/local/bin/http
-COPY --from=dev /usr/local/bin/llmctl /usr/local/bin/llmctl
-COPY --from=dev /usr/local/bin/etcd/etcd /usr/local/bin/etcd
-COPY --from=dev /usr/bin/nats-server /usr/local/bin/nats-server
-COPY --from=dev /bin/uv /usr/local/bin/uv
-COPY --from=dev /bin/uvx /usr/local/bin/uvx
-# Copy venv with installed packages
-RUN uv python install 3.12
-COPY --from=dev /opt/vllm /opt/vllm
-COPY --from=dev ${VIRTUAL_ENV} ${VIRTUAL_ENV}
-# Copy minimal set of files for testing. May consider separate stage for testing
-# if test dependencies start to negatively impact deployment environment/size.
-COPY pyproject.toml /workspace/pyproject.toml
-COPY container/deps/vllm /workspace/container/deps/vllm
-# Add library for KV routing
-COPY --from=dev ${VLLM_KV_CAPI_PATH} ${VLLM_KV_CAPI_PATH}
-# Copy minimal set of files for deployment/examples
-# FIXME: Use a more consolidated path after directory restructure
-COPY examples/python_rs/llm/vllm /workspace/examples/python_rs/llm/vllm
-WORKDIR /workspace
-# FIXME: May want a modification with dynamo banner on entry
-ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
-CMD []
+### TODO Lean Runtime Image Stage ###
container/Dockerfile.vllm_nixl
deleted 100644 → 0
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
ARG BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
ARG BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04"
FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS dev
USER root
### NIXL SETUP ###
ARG MOFED_VERSION=24.10-1.1.4.0
ARG PYTHON_VERSION=3.12
ARG NSYS_URL=https://developer.nvidia.com/downloads/assets/tools/secure/nsight-systems/2025_1/
ARG NSYS_PKG=NsightSystems-linux-cli-public-2025.1.1.131-3554042.deb
RUN apt-get update -y && apt-get -y install curl \
git \
libnuma-dev \
numactl \
wget \
autotools-dev \
automake \
libtool \
libz-dev \
libiberty-dev \
flex \
build-essential \
cmake \
libibverbs-dev \
libgoogle-glog-dev \
libgtest-dev \
libjsoncpp-dev \
libpython3-dev \
libboost-all-dev \
libssl-dev \
libgrpc-dev \
libgrpc++-dev \
libprotobuf-dev \
protobuf-compiler-grpc \
pybind11-dev \
python3-full \
python3-pip \
python3-numpy \
etcd-server \
net-tools \
pciutils \
libpci-dev \
vim \
tmux \
screen \
ibverbs-utils \
libibmad-dev
RUN apt-get install -y linux-tools-common linux-tools-generic ethtool iproute2
RUN apt-get install -y dkms linux-headers-generic
RUN apt-get install -y meson ninja-build uuid-dev gdb
RUN apt-get update && apt install -y wget libglib2.0-0
RUN wget ${NSYS_URL}${NSYS_PKG} && dpkg -i $NSYS_PKG && rm $NSYS_PKG
RUN cd /usr/local/src && \
curl -fSsL "https://content.mellanox.com/ofed/MLNX_OFED-${MOFED_VERSION}/MLNX_OFED_LINUX-${MOFED_VERSION}-ubuntu24.04-x86_64.tgz" -o mofed.tgz && \
tar -xf /usr/local/src/mofed.tgz && \
cd MLNX_OFED_LINUX-* && \
apt-get update && apt-get install -y --no-install-recommends \
./DEBS/libibverbs* ./DEBS/ibverbs-providers* ./DEBS/librdmacm* ./DEBS/libibumad* && \
rm -rf /var/lib/apt/lists/* /usr/local/src/*
ENV LIBRARY_PATH=$LIBRARY_PATH:/usr/local/cuda/lib64 \
LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64
ENV LIBRARY_PATH=$LIBRARY_PATH:/usr/local/lib \
LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib
WORKDIR /workspace
RUN git clone https://github.com/NVIDIA/gdrcopy.git
RUN PREFIX=/usr/local DESTLIB=/usr/local/lib make -C /workspace/gdrcopy lib_install
RUN cp gdrcopy/src/libgdrapi.so.2.* /usr/lib/x86_64-linux-gnu/
RUN ldconfig
ARG UCX_VERSION=v1.18.0
RUN cd /usr/local/src && \
curl -fSsL "https://github.com/openucx/ucx/tarball/${UCX_VERSION}" | tar xz && \
cd openucx-ucx* && \
./autogen.sh && ./configure \
--enable-shared \
--disable-static \
--disable-doxygen-doc \
--enable-optimizations \
--enable-cma \
--enable-devel-headers \
--with-cuda=/usr/local/cuda \
--with-verbs \
--with-dm \
--with-gdrcopy=/usr/local \
--enable-mt \
--with-mlx5-dv && \
make -j && \
make -j install-strip && \
ldconfig
ENV LD_LIBRARY_PATH=/usr/lib:$LD_LIBRARY_PATH
ENV CPATH=/usr/include:$CPATH
ENV PATH=/usr/bin:$PATH
ENV PKG_CONFIG_PATH=/usr/lib/pkgconfig:$PKG_CONFIG_PATH
SHELL ["/bin/bash", "-c"]
WORKDIR /workspace
ENV LD_LIBRARY_PATH=/usr/local/ompi/lib:$LD_LIBRARY_PATH
ENV CPATH=/usr/local/ompi/include:$CPATH
ENV PATH=/usr/local/ompi/bin:$PATH
ENV PKG_CONFIG_PATH=/usr/local/ompi/lib/pkgconfig:$PKG_CONFIG_PATH
COPY --from=nixl . /opt/nixl
RUN cd /opt/nixl && \
mkdir build && \
meson setup build/ --prefix=/usr/local/nixl && \
cd build/ && \
ninja && \
ninja install
ENV LD_LIBRARY_PATH=/usr/local/nixl/lib/x86_64-linux-gnu/:$LD_LIBRARY_PATH
ENV PYTHONPATH=/usr/local/nixl/lib/python3/dist-packages/:/opt/nixl/test/python/:$PYTHONPATH
ENV UCX_TLS=^cuda_ipc
ENV NIXL_PLUGIN_DIR=/usr/local/nixl/lib/x86_64-linux-gnu/plugins
RUN ls -l /usr/local/nixl/
RUN ls -l /usr/local/nixl/include/
RUN ls -l /usr/local/nixl/include/internal/
RUN ls /opt/nixl
# Install utilities
RUN apt update -y && apt install -y git wget curl nvtop tmux vim
# nats
RUN wget https://github.com/nats-io/nats-server/releases/download/v2.10.24/nats-server-v2.10.24-amd64.deb && dpkg -i nats-server-v2.10.24-amd64.deb
# etcd
ENV ETCD_VERSION="v3.5.18"
RUN wget https://github.com/etcd-io/etcd/releases/download/$ETCD_VERSION/etcd-$ETCD_VERSION-linux-amd64.tar.gz -O /tmp/etcd.tar.gz && \
mkdir -p /usr/local/bin/etcd && \
tar -xvf /tmp/etcd.tar.gz -C /usr/local/bin/etcd --strip-components=1
ENV PATH=/usr/local/bin/etcd/:$PATH
### VIRTUAL ENVIRONMENT SETUP ###
# Install uv and create virtualenv
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
RUN mkdir /opt/dynamo && \
uv venv /opt/dynamo/venv --python 3.12
# Activate virtual environment
ENV VIRTUAL_ENV=/opt/dynamo/venv
ENV PATH="${VIRTUAL_ENV}/bin:${PATH}"
# Common dependencies
RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requirements.txt \
uv pip install --requirement /tmp/requirements.txt
# Install patched vllm - keep this early in Dockerfile to avoid
# rebuilds from unrelated source code changes
ARG VLLM_REF="v0.7.2"
ARG VLLM_PATCH="vllm_${VLLM_REF}-dynamo-kv-disagg-patch.patch"
RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
bash /tmp/deps/vllm/install.sh --patch /tmp/deps/vllm/${VLLM_PATCH} --ref ${VLLM_REF} --install-cmd "uv pip install --editable" --use-precompiled --installation-dir /opt/vllm
# Install genai-perf for benchmarking
ARG GENAI_PERF_TAG="25d0188713adc47868d6b3f22426375237a90529"
RUN uv pip install "git+https://github.com/triton-inference-server/perf_analyzer.git@${GENAI_PERF_TAG}#subdirectory=genai-perf"
# Install test dependencies
RUN --mount=type=bind,source=./container/deps/requirements.test.txt,target=/tmp/requirements.txt \
uv pip install --requirement /tmp/requirements.txt
# ### MISC UTILITY SETUP ###
# Finish pyright install
RUN pyright --help > /dev/null 2>&1
# Enable Git operations in the /workspace directory
RUN printf "[safe]\n directory=/workspace\n" > /root/.gitconfig
RUN ln -sf /bin/bash /bin/sh
### BUILDS ###
# Rust build/dev dependencies
RUN apt update -y && \
apt install -y \
build-essential \
protobuf-compiler \
cmake \
libssl-dev \
pkg-config && \
curl https://sh.rustup.rs -sSf | bash -s -- -y
ENV PATH="/root/.cargo/bin:${PATH}"
RUN rustup toolchain install 1.85.0-x86_64-unknown-linux-gnu
# Working directory
WORKDIR /workspace
# Copy Python wheel configuration files
COPY pyproject.toml /workspace/
COPY README.md /workspace/
COPY LICENSE /workspace/
# Build Rust runtime
COPY lib/runtime /workspace/lib/runtime
RUN cd lib/runtime && \
cargo build --release --locked && cargo doc --no-deps
# Build OpenAI HTTP Service binaries
COPY lib/llm /workspace/lib/llm
COPY components /workspace/components
RUN cd components && \
cargo build --release && \
cp target/release/http /usr/local/bin/
# Build Dynamo Run binaries
COPY launch /workspace/launch
RUN cd launch && \
cargo build --release --features mistralrs,sglang,vllm,python && \
cp target/release/dynamo-run /usr/local/bin/ && \
cp target/release/llmctl /usr/local/bin/
# Generate C bindings for kv cache routing in vLLM
COPY lib/bindings /workspace/lib/bindings
RUN cd lib/bindings/c && \
cargo build --release --locked && cargo doc --no-deps
COPY deploy/dynamo/sdk /workspace/deploy/dynamo/sdk
# Build dynamo wheel
RUN source /opt/dynamo/venv/bin/activate && \
uv build --wheel --out-dir /workspace/dist && \
uv pip install /workspace/dist/ai_dynamo*cp312*.whl && \
cd /workspace/deploy/dynamo/sdk && \
uv build --wheel --out-dir /workspace/dist && \
uv pip install /workspace/dist/ai_dynamo_sdk*any.whl
# Package the bindings
RUN mkdir -p /opt/dynamo/bindings/wheels && \
mkdir /opt/dynamo/bindings/lib && \
cp dist/ai_dynamo*cp312*.whl /opt/dynamo/bindings/wheels/. && \
cp lib/bindings/c/target/release/libdynamo_llm_capi.so /opt/dynamo/bindings/lib/. && \
cp -r lib/bindings/c/include /opt/dynamo/bindings/.
# Tell vllm to use the Dynamo LLM C API for KV Cache Routing
ENV VLLM_KV_CAPI_PATH="/opt/dynamo/bindings/lib/libdynamo_llm_capi.so"
# FIXME: Copy more specific folders in for dev/debug after directory restructure
COPY . /workspace
# FIXME: May want a modification with dynamo banner on entry
ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
CMD []
### TODO Lean Runtime Image Stage ###
container/build.sh
@@ -44,7 +44,7 @@ PYTHON_PACKAGE_VERSION=${current_tag:-$latest_tag.dev+$commit_id}
 # installed within framework specific sections of the Dockerfile.
 declare -A FRAMEWORKS=(["STANDARD"]=1 ["TENSORRTLLM"]=2 ["VLLM"]=3 ["VLLM_NIXL"]=4)
-DEFAULT_FRAMEWORK=STANDARD
+DEFAULT_FRAMEWORK=VLLM
 SOURCE_DIR=$(dirname "$(readlink -f "$0")")
 DOCKERFILE=${SOURCE_DIR}/Dockerfile

@@ -64,9 +64,6 @@ TENSORRTLLM_PIP_WHEEL_PATH=""
 VLLM_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
 VLLM_BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04"
-VLLM_NIXL_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
-VLLM_NIXL_BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04"
 NIXL_COMMIT=3ce6a673b266b4f293909ceb17ca7975f1ba5cd7
 NIXL_REPO=ai-dynamo/nixl.git

@@ -197,6 +194,10 @@ get_options() {
         FRAMEWORK=$DEFAULT_FRAMEWORK
     fi
+    if [[ ${FRAMEWORK^^} == "VLLM_NIXL" ]]; then
+        FRAMEWORK="VLLM"
+    fi
     if [ ! -z "$FRAMEWORK" ]; then
         FRAMEWORK=${FRAMEWORK^^}

@@ -283,17 +284,14 @@ error() {
 get_options "$@"

 # Update DOCKERFILE if framework is VLLM
 if [[ $FRAMEWORK == "VLLM" ]]; then
     DOCKERFILE=${SOURCE_DIR}/Dockerfile.vllm
-elif [[ $FRAMEWORK == "VLLM_NIXL" ]]; then
-    DOCKERFILE=${SOURCE_DIR}/Dockerfile.vllm_nixl
 elif [[ $FRAMEWORK == "TENSORRTLLM" ]]; then
     DOCKERFILE=${SOURCE_DIR}/Dockerfile.tensorrt_llm
 fi

-if [[ $FRAMEWORK == "VLLM_NIXL" ]]; then
+if [[ $FRAMEWORK == "VLLM" ]]; then
     TEMP_DIR=$(mktemp -d)
     # Clean up temp directory on script exit
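The backward-compatibility behavior added to get_options() above can be illustrated with a small standalone sketch (illustration only, not code from the repository):

#!/usr/bin/env bash
# Mirrors the normalization added in get_options(): the legacy VLLM_NIXL name
# now selects the merged VLLM framework, which is also the new default.
FRAMEWORK=${1:-VLLM}
if [[ ${FRAMEWORK^^} == "VLLM_NIXL" ]]; then
    FRAMEWORK="VLLM"
fi
FRAMEWORK=${FRAMEWORK^^}
echo "Selected framework: ${FRAMEWORK}"   # prints VLLM for both "vllm" and "vllm_nixl"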
container/run.sh
@@ -23,7 +23,7 @@ RUN_PREFIX=
 # installed within framework specific sections of the Dockerfile.
 declare -A FRAMEWORKS=(["STANDARD"]=1 ["TENSORRTLLM"]=2 ["VLLM"]=3 ["VLLM_NIXL"]=4)
-DEFAULT_FRAMEWORK=STANDARD
+DEFAULT_FRAMEWORK=VLLM
 SOURCE_DIR=$(dirname "$(readlink -f "$0")")

@@ -170,6 +170,10 @@ get_options() {
         FRAMEWORK=$DEFAULT_FRAMEWORK
     fi
+    if [[ ${FRAMEWORK^^} == "VLLM_NIXL" ]]; then
+        FRAMEWORK="VLLM"
+    fi
     if [ ! -z "$FRAMEWORK" ]; then
         FRAMEWORK=${FRAMEWORK^^}
         if [[ ! -n "${FRAMEWORKS[$FRAMEWORK]}" ]]; then
examples/python_rs/llm/vllm/README.md
This diff is collapsed.
examples/python_rs/llm/vllm/sdk_kv_router/__init__.py → examples/python_rs/llm/vllm/__init__.py
File moved
examples/python_rs/llm/vllm/common/base_engine.py
deleted 100644 → 0
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import abc
import logging

from common.chat_processor import ChatProcessor
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.entrypoints.openai.api_server import (
    build_async_engine_client_from_engine_args,
)

logger = logging.getLogger("vllm")


class BaseVllmEngine:
    """
    Request handler for the generate endpoint
    """

    def __init__(self, engine_args: AsyncEngineArgs):
        self.engine_args = engine_args
        self.model_config = self.engine_args.create_model_config()
        self.engine_client = None
        self.chat_processor: ChatProcessor | None = None
        self._engine_context = None

    async def initialize(self):
        """Initialize the engine client and related components."""
        logger.info("Initializing engine client")
        self._engine_context = build_async_engine_client_from_engine_args(self.engine_args)
        if self._engine_context is not None:
            self.engine_client = await self._engine_context.__aenter__()
            self.tokenizer = await self.engine_client.get_tokenizer()
            self.chat_processor = ChatProcessor(self.tokenizer, self.model_config)
        else:
            raise RuntimeError("Failed to initialize engine client")

    async def cleanup(self):
        """Cleanup resources."""
        print("Cleaning up engine client")
        if self._engine_context is not None:
            await self._engine_context.__aexit__(None, None, None)
            self._engine_context = None
            self.engine_client = None
            self.chat_processor = None

    async def __aenter__(self):
        await self.initialize()
        """Initialize with context manager syntax."""
        return self

    async def __aexit__(self, exc_type, exc_value, traceback):
        await self.cleanup()

    @abc.abstractmethod
    async def generate(self, raw_request):
        pass
examples/python_rs/llm/vllm/common/chat_processor.py
deleted 100644 → 0
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import time
from typing import AsyncIterator, List, Optional, Protocol, Union, runtime_checkable

from vllm.config import ModelConfig
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.entrypoints.chat_utils import ConversationMessage
from vllm.entrypoints.openai.protocol import (
    ChatCompletionRequest,
    CompletionRequest,
    RequestResponseMetadata,
)
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
from vllm.entrypoints.openai.serving_engine import RequestPrompt
from vllm.inputs.data import TokensPrompt
from vllm.transformers_utils.tokenizer import AnyTokenizer


@runtime_checkable
class ProcessMixInRequired(Protocol):
    engine_args: AsyncEngineArgs
    chat_processor: "ChatProcessor | None"
    completions_processor: "CompletionsProcessor | None"
    model_config: ModelConfig


class ProcessMixIn(ProcessMixInRequired):
    """
    Mixin for pre and post processing for vLLM
    Requires engine_args, engine_client, processor, model_config to be initialized
    """

    engine_args: AsyncEngineArgs
    chat_processor: "ChatProcessor | None"
    completions_processor: "CompletionsProcessor | None"
    model_config: ModelConfig

    def __init__(self):
        pass

    def _get_processor(self, raw_request: Union[CompletionRequest, ChatCompletionRequest]):
        # Determine the processor type based on the request structure
        return (
            self.chat_processor
            if isinstance(raw_request, ChatCompletionRequest)
            else self.completions_processor
        )

    async def _parse_raw_request(self, raw_request: Union[CompletionRequest, ChatCompletionRequest]):
        processor = self._get_processor(raw_request)
        if processor is None:
            raise RuntimeError("Processor has not been initialized")
        request = processor.parse_raw_request(raw_request)
        preprocess_result = await processor.preprocess(raw_request)
        default_max_tokens = self.model_config.max_model_len - len(
            preprocess_result.engine_prompt["prompt_token_ids"]
        )
        default_sampling_params = self.model_config.get_diff_sampling_param()
        sampling_params = request.to_sampling_params(
            default_max_tokens,
            self.model_config.logits_processor_pattern,
            default_sampling_params,
        )
        return (
            request,
            preprocess_result.conversation,
            preprocess_result.request_prompt,
            preprocess_result.engine_prompt,
            sampling_params,
        )

    async def _stream_response(self, request, generator, request_id, conversation):
        processor = self._get_processor(request)
        if processor is None:
            raise RuntimeError("processor has not been initialized")
        return processor.stream_response(
            request,
            generator,
            request_id,
            conversation,
        )


class PreprocessResult:
    def __init__(
        self,
        conversation: Optional[ConversationMessage],
        request_prompt: RequestPrompt,
        engine_prompt: TokensPrompt,
    ):
        self.conversation = conversation
        self.request_prompt = request_prompt
        self.engine_prompt = engine_prompt


class ChatProcessor:
    def __init__(self, tokenizer: AnyTokenizer, model_config: ModelConfig):
        self.tokenizer = tokenizer
        self.model_config = model_config
        self.openai_serving = OpenAIServingChat(
            engine_client=None,
            model_config=model_config,
            models=None,
            request_logger=None,
            response_role="assistant",
            chat_template=None,
            chat_template_content_format="auto",
        )

    def parse_raw_request(self, raw_request: ChatCompletionRequest) -> ChatCompletionRequest:
        return ChatCompletionRequest.parse_obj(raw_request)

    async def preprocess(self, raw_request: ChatCompletionRequest) -> PreprocessResult:
        request = self.parse_raw_request(raw_request)
        (
            conversation,
            request_prompts,
            engine_prompts,
        ) = await self.openai_serving._preprocess_chat(
            request,
            self.tokenizer,
            request.messages,
            chat_template=request.chat_template or self.tokenizer.chat_template,
            chat_template_content_format=self.openai_serving.chat_template_content_format,
            add_generation_prompt=request.add_generation_prompt,
            continue_final_message=request.continue_final_message,
            tool_dicts=None,
            documents=request.documents,
            chat_template_kwargs=request.chat_template_kwargs,
            tool_parser=self.openai_serving.tool_parser,
            truncate_prompt_tokens=request.truncate_prompt_tokens,
            add_special_tokens=request.add_special_tokens,
        )
        return PreprocessResult(conversation[0], request_prompts[0], engine_prompts[0])

    async def stream_response(
        self,
        request: ChatCompletionRequest,
        result_generator: AsyncIterator,
        request_id: str,
        conversation: List,
    ):
        request_metadata = RequestResponseMetadata(request_id=request_id)
        if not request.stream:
            raise ValueError("Only streaming responses are supported")

        async for raw_response in self.openai_serving.chat_completion_stream_generator(
            request,
            result_generator,
            request_id,
            request.model,
            conversation,
            self.tokenizer,
            request_metadata,
        ):
            if raw_response.startswith("data: [DONE]"):
                break
            response = json.loads(raw_response.lstrip("data: "))
            yield response


class CompletionsProcessor:
    def __init__(self, tokenizer: AnyTokenizer, model_config: ModelConfig):
        self.tokenizer = tokenizer
        self.model_config = model_config
        self.openai_serving = OpenAIServingCompletion(
            engine_client=None,
            model_config=model_config,
            models=None,
            request_logger=None,
        )

    def parse_raw_request(self, raw_request: CompletionRequest) -> CompletionRequest:
        return CompletionRequest.parse_obj(raw_request)

    async def preprocess(self, raw_request: CompletionRequest) -> PreprocessResult:
        request = self.parse_raw_request(raw_request)
        (
            request_prompts,
            engine_prompts,
        ) = await self.openai_serving._preprocess_completion(
            request,
            self.tokenizer,
            input_or_inputs=request.prompt,
            truncate_prompt_tokens=request.truncate_prompt_tokens,
            add_special_tokens=request.add_special_tokens,
        )
        return PreprocessResult(None, request_prompts[0], engine_prompts[0])

    async def stream_response(
        self,
        request: CompletionRequest,
        result_generator: AsyncIterator,
        request_id: str,
        conversation: Optional[List[ConversationMessage]] = None,
    ):
        request_metadata = RequestResponseMetadata(request_id=request_id)
        if not request.stream:
            raise ValueError("Only streaming responses are supported")

        async for raw_response in self.openai_serving.completion_stream_generator(
            request,
            result_generator,
            request_id,
            int(time.time()),  # created_time
            request.model,
            1,  # num_prompts
            self.tokenizer,
            request_metadata,
        ):
            if raw_response.startswith("data: [DONE]"):
                break
            response = json.loads(raw_response.lstrip("data: "))
            yield response
examples/python_rs/llm/vllm/common/client.py
deleted 100644 → 0
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import asyncio

import uvloop
from dynamo.runtime import DistributedRuntime, dynamo_worker

from .protocol import Request


@dynamo_worker()
async def worker(
    runtime: DistributedRuntime,
    component: str,
    prompt: str,
    max_tokens: int,
    temperature: float,
):
    """
    Instantiate a `backend` client and call the `generate` endpoint
    """
    # get endpoint
    endpoint = runtime.namespace("dynamo").component(component).endpoint("generate")

    # create client
    client = await endpoint.client()

    # issue request
    tasks = []
    for _ in range(1):
        tasks.append(
            client.generate(
                Request(
                    prompt=prompt,
                    sampling_params={
                        "temperature": temperature,
                        "max_tokens": max_tokens,
                    },
                ).model_dump_json()
            )
        )
    streams = await asyncio.gather(*tasks)

    # process response
    for stream in streams:
        async for resp in stream:
            print(resp)


if __name__ == "__main__":
    uvloop.install()
    parser = argparse.ArgumentParser()
    parser.add_argument("--prompt", type=str, default="what is the capital of france?")
    parser.add_argument("--component", type=str, default="vllm")
    parser.add_argument("--max-tokens", type=int, default=10)
    parser.add_argument("--temperature", type=float, default=0.5)
    args = parser.parse_args()

    asyncio.run(worker(args.component, args.prompt, args.max_tokens, args.temperature))
examples/python_rs/llm/vllm/common/parser.py
deleted 100644 → 0
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.utils import FlexibleArgumentParser


def parse_vllm_args() -> AsyncEngineArgs:
    parser = FlexibleArgumentParser()
    parser = AsyncEngineArgs.add_cli_args(parser)
    args = parser.parse_args()
    return AsyncEngineArgs.from_cli_args(args)
examples/python_rs/llm/vllm/common/protocol.py
deleted 100644 → 0
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
from typing import Any, List, Optional

import msgspec
from pydantic import BaseModel, ConfigDict, field_validator
from pydantic_core import core_schema
from typing_extensions import NotRequired
from vllm.inputs.data import TokensPrompt
from vllm.outputs import CompletionOutput
from vllm.sampling_params import SamplingParams
from vllm.sequence import PromptLogprobs, RequestMetrics


class Request(BaseModel):
    prompt: str
    sampling_params: dict


class Tokens(BaseModel):
    tokens: list[int]


class PrefillRequest(Request):
    request_id: str


class Response(BaseModel):
    text: str


class PrefillResponse(BaseModel):
    prefilled: bool


# Hack to override the type of multi_modal_data in TokensPrompt
# as pydantic doesn't understand generic types
# TokensPrompt is defined here: https://github.com/vllm-project/vllm/blob/a4c402a756fa3213caf9d2cde0e4ceb2d57727f2/vllm/inputs/data.py#L38
# multi_modal_data is defined here: https://github.com/vllm-project/vllm/blob/main/vllm/multimodal/inputs.py#L103
# ModalityData is defined here: https://github.com/vllm-project/vllm/blob/main/vllm/multimodal/inputs.py#L80
class PatchedTokensPrompt(TokensPrompt):
    multi_modal_data: NotRequired[Optional[Any]]  # type: ignore


# Monkey-patch the SamplingParams type to add a dummy core schema so pydantic can validate it
# Sampling params is a mspspec struct
# SamplingParams is defined here: https://github.com/vllm-project/vllm/blob/a4c402a756fa3213caf9d2cde0e4ceb2d57727f2/vllm/sampling_params.py#L88
SamplingParams.__get_pydantic_core_schema__ = classmethod(
    lambda cls, source, handler: core_schema.any_schema()
)


class vLLMGenerateRequest(BaseModel):
    """
    Serializable class of all the fields vLLM engine requires for inference
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    engine_prompt: PatchedTokensPrompt
    sampling_params: SamplingParams
    request_id: str

    @field_validator("sampling_params", mode="before")
    @classmethod
    def parse_sampling_params(cls, v: Any) -> SamplingParams:
        if isinstance(v, str):
            v = json.loads(v)
        if isinstance(v, dict):
            return SamplingParams(**v)
        return v

    model_config = ConfigDict(
        json_encoders={SamplingParams: lambda v: msgspec.json.encode(v)}
    )


class MyRequestOutput(BaseModel):
    """
    RequestOutput from vLLM is not serializable by default
    https://github.com/vllm-project/vllm/blob/a4c402a756fa3213caf9d2cde0e4ceb2d57727f2/vllm/outputs.py#L85
    This class is used to serialize the RequestOutput and any recursively defined types
    We can do this because PromptLogprobs, RequestMetrics, and CompletionOutput are all serializable dataclasses
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)

    request_id: str
    prompt: Optional[str] = None
    prompt_token_ids: Optional[List[int]] = None
    prompt_logprobs: Optional[PromptLogprobs] = None
    outputs: List[CompletionOutput]
    finished: bool
    metrics: Optional[RequestMetrics] = None
    # lora_request: Optional[LoRARequest] = None
    # encoder_prompt: Optional[str] = None
    # encoder_prompt_token_ids: Optional[List[int]] = None
    # num_cached_tokens: Optional[int] = None
    # multi_modal_placeholders: Optional[MultiModalPlaceholderDict] = None
examples/python_rs/llm/vllm_nixl/disagg_router.py → examples/python_rs/llm/vllm/disagg_router.py
File moved
examples/python_rs/llm/vllm/disaggregated/__init__.py
deleted 100644 → 0
examples/python_rs/llm/vllm/disaggregated/decode_worker.py
deleted 100644 → 0
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import asyncio
import socket
import uuid

import msgspec
import uvloop
from common.base_engine import BaseVllmEngine
from common.chat_processor import ProcessMixIn
from common.parser import parse_vllm_args
from common.protocol import PrefillRequest
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.entrypoints.openai.protocol import (
    ChatCompletionRequest,
    ChatCompletionStreamResponse,
)
from vllm.logger import logger as vllm_logger

from dynamo.runtime import DistributedRuntime, dynamo_endpoint, dynamo_worker


class VllmDecodeEngine(BaseVllmEngine, ProcessMixIn):
    """
    Request handler for the generate endpoint
    """

    def __init__(self, engine_args: AsyncEngineArgs, prefill):
        assert (
            engine_args.kv_transfer_config.is_kv_consumer
        ), "Decode worker must be a KV consumer"
        if engine_args.enable_chunked_prefill is not False:
            vllm_logger.info(
                "Chunked prefill is not supported in disaggregated mode, disabling it"
            )
            engine_args.enable_chunked_prefill = False
        super().__init__(engine_args)
        self.prefill = prefill
        self.kv_transfer_config = engine_args.create_engine_config().kv_transfer_config
        self.kv_rank = self.kv_transfer_config.kv_rank

    @dynamo_endpoint(ChatCompletionRequest, ChatCompletionStreamResponse)
    async def generate(self, raw_request):
        if self.engine_client is None:
            await self.initialize()

        vllm_logger.debug(f"Got raw request: {raw_request}")
        (
            request,
            conversation,
            request_prompt,
            engine_prompt,
            sampling_params,
        ) = await self._parse_raw_request(raw_request)

        # TODO: pass decode info through a separate request param
        request_id = f"{uuid.uuid4()}___decode_hostname_{socket.gethostname()}___decode_kv_rank_{self.kv_rank}"

        prefill_sampling_params = {**msgspec.to_builtins(sampling_params)}
        prefill_sampling_params["max_tokens"] = 1
        prefill_sampling_params["min_tokens"] = 1
        prefill_request = PrefillRequest(
            prompt=request_prompt,
            # TODO: we should use engine prompt to avoid extra tokenization
            sampling_params=prefill_sampling_params,
            request_id=request_id,
        )
        vllm_logger.debug(f"Prefill request: {prefill_request}")
        prefill_output = self.prefill.generate(
            prefill_request.model_dump_json(),
        )

        vllm_logger.debug(
            f"Running generate with engine_prompt: {engine_prompt}, sampling_params: {sampling_params}, request_id: {request_id}"
        )
        if self.engine_client is None:
            raise RuntimeError("Engine client not initialized")
        else:
            generator = self.engine_client.generate(engine_prompt, sampling_params, request_id)

        async for response in await self._stream_response(
            request, generator, request_id, conversation
        ):
            vllm_logger.debug(f"Generated response: {response}")
            yield response

        await prefill_output


@dynamo_worker()
async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs):
    """
    Instantiate a `backend` component and serve the `generate` endpoint
    A `Component` can serve multiple endpoints
    """
    component = runtime.namespace("dynamo").component("vllm")
    await component.create_service()

    prefill = (
        await runtime.namespace("dynamo")
        .component("prefill")
        .endpoint("generate")
        .client()
    )

    async with VllmDecodeEngine(engine_args, prefill) as decode_engine:
        endpoint = component.endpoint("generate")
        await endpoint.serve_endpoint(decode_engine.generate)


if __name__ == "__main__":
    uvloop.install()
    engine_args = parse_vllm_args()
    asyncio.run(worker(engine_args))
examples/python_rs/llm/vllm/disaggregated/prefill_worker.py
deleted 100644 → 0
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import asyncio

import uvloop
import vllm
from common.base_engine import BaseVllmEngine
from common.parser import parse_vllm_args
from common.protocol import PrefillRequest, PrefillResponse
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.logger import logger as vllm_logger

from dynamo.runtime import DistributedRuntime, dynamo_endpoint, dynamo_worker


class VllmPrefillEngine(BaseVllmEngine):
    """
    Request handler for the generate endpoint
    """

    def __init__(self, engine_args: AsyncEngineArgs):
        assert (
            engine_args.kv_transfer_config.is_kv_producer
        ), "Prefill worker must be a KV producer"
        if engine_args.enable_chunked_prefill is not False:
            vllm_logger.info(
                "Chunked prefill is not supported in disaggregated mode, disabling it"
            )
            engine_args.enable_chunked_prefill = False
        super().__init__(engine_args)
        self.kv_transfer_config = engine_args.create_engine_config().kv_transfer_config
        self.kv_rank = self.kv_transfer_config.kv_rank

    @dynamo_endpoint(PrefillRequest, PrefillResponse)
    async def generate(self, request):
        if self.engine_client is None:
            await self.initialize()

        vllm_logger.debug(f"Received prefill request: {request}")
        sampling_params = vllm.sampling_params.SamplingParams(**request.sampling_params)
        if self.engine_client is None:
            raise RuntimeError("Engine client not initialized")
        else:
            async for response in self.engine_client.generate(
                request.prompt, sampling_params, request.request_id
            ):
                vllm_logger.debug(f"Generated response: {response}")
                yield True


@dynamo_worker()
async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs):
    """
    Instantiate a `backend` component and serve the `generate` endpoint
    A `Component` can serve multiple endpoints
    """
    component = runtime.namespace("dynamo").component("prefill")
    await component.create_service()

    async with VllmPrefillEngine(engine_args) as prefill_engine:
        endpoint = component.endpoint("generate")
        await endpoint.serve_endpoint(prefill_engine.generate)


if __name__ == "__main__":
    uvloop.install()
    engine_args = parse_vllm_args()
    asyncio.run(worker(engine_args))
examples/python_rs/llm/vllm_nixl/kv_router.py → examples/python_rs/llm/vllm/kv_router.py
File moved
examples/python_rs/llm/vllm/kv_router/__init__.py
deleted 100644 → 0
examples/python_rs/llm/vllm/kv_router/processor.py
deleted 100644 → 0
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import asyncio
import uuid
from enum import Enum
from typing import AsyncIterator, Tuple, Union

import uvloop
from common.chat_processor import ChatProcessor, CompletionsProcessor, ProcessMixIn
from common.parser import parse_vllm_args
from common.protocol import MyRequestOutput, Tokens, vLLMGenerateRequest
from transformers import AutoTokenizer
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.entrypoints.openai.protocol import (
    ChatCompletionRequest,
    ChatCompletionStreamResponse,
    CompletionRequest,
    CompletionStreamResponse,
)
from vllm.logger import logger as vllm_logger
from vllm.outputs import RequestOutput
from vllm.transformers_utils.tokenizer import AnyTokenizer

from dynamo.runtime import Client, DistributedRuntime, dynamo_endpoint, dynamo_worker


class RequestType(Enum):
    CHAT = "chat"
    COMPLETION = "completion"


class Processor(ProcessMixIn):
    """
    vLLM pre and post processing
    """

    def __init__(
        self,
        engine_args: AsyncEngineArgs,
        router_client: Client,
        workers_client: Client,
    ):
        self.engine_args = engine_args
        self.model_config = self.engine_args.create_model_config()
        self.tokenizer = self._create_tokenizer(engine_args)
        self.chat_processor = ChatProcessor(self.tokenizer, self.model_config)
        self.completions_processor = CompletionsProcessor(self.tokenizer, self.model_config)
        self.router_client = router_client
        self.workers_client = workers_client

    def _create_tokenizer(self, engine_args: AsyncEngineArgs) -> AnyTokenizer:
        """Create a TokenizerGroup using engine arguments similar to VLLM's approach"""
        model_path = engine_args.model

        # Create the base tokenizer with VLLM's typical settings
        base_tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            trust_remote_code=True,
            padding_side="left",
            truncation_side="left",
            use_fast=True,  # VLLM might use the fast tokenizer for efficiency
        )
        return base_tokenizer

    async def _generate(
        self,
        raw_request: Union[CompletionRequest, ChatCompletionRequest],
        request_type: RequestType,
    ):
        request_id = str(uuid.uuid4())
        vllm_logger.debug(f"Got raw request: {raw_request}")
        (
            request,
            conversation,
            prompt,
            engine_prompt,
            sampling_params,
        ) = await self._parse_raw_request(raw_request)

        worker_id_generator: AsyncIterator = await self.router_client.generate(
            Tokens(tokens=engine_prompt["prompt_token_ids"]).model_dump_json()
        )
        worker_id = await worker_id_generator.__anext__()  # only one worker id is returned
        worker_id = worker_id.data()
        vllm_logger.info(f"Worker ID: {worker_id}")

        if worker_id == "":
            engine_generator = await self.workers_client.random(
                vLLMGenerateRequest(
                    engine_prompt=engine_prompt,
                    sampling_params=sampling_params,
                    request_id=request_id,
                ).model_dump_json()
            )
        else:
            engine_generator = await self.workers_client.direct(
                vLLMGenerateRequest(
                    engine_prompt=engine_prompt,
                    sampling_params=sampling_params,
                    request_id=request_id,
                ).model_dump_json(),
                int(worker_id),
            )

        output = self._generate_responses(engine_generator, request_type)

        async for response in await self._stream_response(
            request, output, request_id, conversation
        ):
            yield response

    async def _generate_responses(
        self,
        engine_generator: AsyncIterator[RequestOutput],
        request_type: RequestType,
    ) -> AsyncIterator[Union[RequestOutput, Tuple[int, RequestOutput]]]:
        prompt_idx = 0
        async for resp in engine_generator:
            # Deserialize the response from the engine
            # Creates correct vLLM objects for each field
            output = MyRequestOutput.model_validate_json(resp.data())

            # OpenAIServingChat.chat_completion_stream_generator() method expects a RequestOutput object
            request_output = RequestOutput(
                request_id=output.request_id,
                prompt=output.prompt,
                prompt_token_ids=output.prompt_token_ids,
                prompt_logprobs=output.prompt_logprobs,
                outputs=output.outputs,
                finished=output.finished,
                metrics=output.metrics,
            )

            if request_type == RequestType.CHAT:
                # For chat requests, yield the request_output directly.
                yield request_output
            elif request_type == RequestType.COMPLETION:
                # Completion requests can have multiple prompts and stream generator requires the prompt index
                yield (prompt_idx, request_output)
            else:
                raise NotImplementedError(f"Request type {request_type} not implemented")

    @dynamo_endpoint(ChatCompletionRequest, ChatCompletionStreamResponse)
    async def generate_chat(self, raw_request):
        async for response in self._generate(raw_request, RequestType.CHAT):
            yield response

    @dynamo_endpoint(CompletionRequest, CompletionStreamResponse)
    async def generate_completions(self, raw_request):
        async for response in self._generate(raw_request, RequestType.COMPLETION):
            yield response


@dynamo_worker()
async def worker(runtime: DistributedRuntime, engine_args: AsyncEngineArgs):
    """
    Set up clients to the router and workers.
    Serve the dynamo.process.chat/completions endpoint.
    """
    workers_client = (
        await runtime.namespace("dynamo")
        .component("vllm")
        .endpoint("generate")
        .client()
    )
    router_client = (
        await runtime.namespace("dynamo")
        .component("router")
        .endpoint("generate")
        .client()
    )

    preprocess_component = runtime.namespace("dynamo").component("process")
    await preprocess_component.create_service()

    chat_endpoint = preprocess_component.endpoint("chat/completions")
    completions_endpoint = preprocess_component.endpoint("completions")
    processor = Processor(engine_args, router_client, workers_client)
    await asyncio.gather(
        chat_endpoint.serve_endpoint(processor.generate_chat),
        completions_endpoint.serve_endpoint(processor.generate_completions),
    )


if __name__ == "__main__":
    uvloop.install()
    engine_args = parse_vllm_args()
    asyncio.run(worker(engine_args))
examples/python_rs/llm/vllm/kv_router/router.py
deleted 100644 → 0
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import asyncio
import random
from argparse import Namespace
from typing import AsyncIterator

import uvloop
from common.protocol import Tokens
from vllm.logger import logger as vllm_logger

from dynamo.llm import AggregatedMetrics, KvIndexer, KvMetricsAggregator, OverlapScores
from dynamo.runtime import DistributedRuntime, dynamo_endpoint, dynamo_worker

WorkerId = str


class CustomRouter:
    """
    Request handler for the generate endpoint
    """

    def __init__(
        self,
        workers_client,
        indexer: KvIndexer,
        metrics_aggregator: KvMetricsAggregator,
    ):
        vllm_logger.info("Initializing Custom Router")
        self.indexer = indexer
        self.metrics_aggregator = metrics_aggregator
        self.workers_client = workers_client

    def _cost_function(
        self,
        scores: OverlapScores | None,
        metrics: AggregatedMetrics | None,
        token_length: int,
    ):
        worker_scores = {}
        if scores:
            for worker_id, score in scores.scores.items():
                # score is number of matching blocks we multiply by block_size to get tokens
                # and compare to token_length. The larger the cache hit the better
                worker_scores[worker_id] = score * self.indexer.block_size() / token_length

        worker_metrics = {}
        # pull metrics for each worker
        max_waiting = 0.0
        if metrics:
            for endpoint in metrics.endpoints:
                worker_id = endpoint.worker_id
                worker_metrics[worker_id] = {
                    "gpu_cache_usage_perc": endpoint.gpu_cache_usage_perc
                    if hasattr(endpoint, "gpu_cache_usage_perc")
                    else 0.0,
                    "num_requests_waiting": endpoint.num_requests_waiting
                    if hasattr(endpoint, "num_requests_waiting")
                    else 0.0,
                    "gpu_prefix_cache_hit_rate": endpoint.gpu_prefix_cache_hit_rate
                    if hasattr(endpoint, "gpu_prefix_cache_hit_rate")
                    else 0.0,
                }
                max_waiting = max(max_waiting, worker_metrics[worker_id]["num_requests_waiting"])

        # Get all worker IDs from the client. This is needed because scores / metrics may not have values for all workers
        # and we want all workers to be considered in the logit calculation
        worker_ids = self.workers_client.endpoint_ids()

        worker_logits = {}
        for worker_id in worker_ids:
            # Use default values if worker not in scores or metrics
            score = worker_scores.get(worker_id, 0.0)
            metrics_dict = worker_metrics.get(
                worker_id,
                {
                    "gpu_cache_usage_perc": 0.0,
                    "num_requests_waiting": 0.0,
                    "gpu_prefix_cache_hit_rate": 0.0,
                },
            )
            normalized_waiting = (
                metrics_dict["num_requests_waiting"] / max_waiting if max_waiting > 0 else 0.0
            )

            # Have 1 metric that weights towards cache hit
            # 2 metrics that penalize overloaded worker and queuing
            worker_logits[worker_id] = (
                2 * score - metrics_dict["gpu_cache_usage_perc"] - normalized_waiting
            )
            vllm_logger.info(
                f"Formula for {worker_id}: {worker_logits[worker_id]:.3f} = 2.0 * {score:.3f} - {metrics_dict['gpu_cache_usage_perc']:.3f} - {normalized_waiting:.3f}"
            )

        if not worker_logits or all(logit == 0 for logit in worker_logits.values()):
            return ""

        # Select the worker with the highest logit
        if worker_logits:
            max_logit = max(worker_logits.values())
            best_workers = [wid for wid, logit in worker_logits.items() if logit == max_logit]
            best_worker_id = random.choice(best_workers)
        else:
            best_worker_id = ""

        # Log the metrics for the selected worker
        if best_worker_id:
            vllm_logger.info(
                f"Selected worker: {best_worker_id}, logit: {worker_logits[best_worker_id]:.3f}"
            )
            vllm_logger.info(
                f"Score: {scores.scores.get(best_worker_id, 0.0) if scores else 0.0:.3f}"
            )
            metrics_dict = worker_metrics.get(best_worker_id, {})
            vllm_logger.info(
                f"GPU Cache Hit Rate: {metrics_dict.get('gpu_prefix_cache_hit_rate', 0.0):.3f}"
            )
            vllm_logger.info(
                f"GPU Cache Usage: {metrics_dict.get('gpu_cache_usage_perc', 0.0):.3f}"
            )
            vllm_logger.info(
                f"Requests Waiting: {metrics_dict.get('num_requests_waiting', 0.0) / max_waiting if max_waiting > 0 else 0.0:.3f}"
            )

        return best_worker_id

    @dynamo_endpoint(Tokens, WorkerId)
    async def generate(self, request) -> AsyncIterator[WorkerId]:
        lora_id = 0
        worker_id = ""
        try:
            scores = await self.indexer.find_matches_for_request(request.tokens, lora_id)
        except Exception as e:
            scores = {}
            vllm_logger.exception(f"Error finding matches: {e}")

        token_length = len(request.tokens)
        metrics = await self.metrics_aggregator.get_metrics()
        worker_id = self._cost_function(scores, metrics, token_length)

        vllm_logger.info(f"Scheduling to worker_id: {worker_id}")
        vllm_logger.info("########")

        yield str(worker_id)


@dynamo_worker()
async def worker(runtime: DistributedRuntime, args: Namespace):
    """
    Set up the worker clients.
    Serve the dynamo.router.generate endpoint.
    """
    workers_client = (
        await runtime.namespace("dynamo")
        .component("vllm")
        .endpoint("generate")
        .client()
    )

    while len(workers_client.endpoint_ids()) < args.min_workers:
        vllm_logger.info(
            f"Waiting for more workers... Current: {len(workers_client.endpoint_ids())}, Required: {args.min_workers}"
        )
        await asyncio.sleep(5)

    vllm_logger.info(
        f"Required number of workers ({args.min_workers}) are ready:\n"
        + "\n".join(f"id: {id}" for id in workers_client.endpoint_ids())
    )

    kv_listener = runtime.namespace("dynamo").component("vllm")
    await kv_listener.create_service()

    router_component = runtime.namespace("dynamo").component("router")
    await router_component.create_service()

    endpoint = router_component.endpoint("generate")
    indexer = KvIndexer(kv_listener, args.block_size)
    metrics_aggregator = KvMetricsAggregator(kv_listener)
    await endpoint.serve_endpoint(
        CustomRouter(workers_client, indexer, metrics_aggregator).generate
    )


if __name__ == "__main__":
    uvloop.install()

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--min-workers",
        type=int,
        default=1,
        help="Minimum number of workers required before proceeding",
    )
    # TODO: Read block size
    parser.add_argument(
        "--block-size",
        type=int,
        default=64,
        help="Block size for the KV Indexer",
    )
    args = parser.parse_args()
    asyncio.run(worker(args))