Commit 8588e33a authored by GuanLuo, committed by GitHub

feat: Add KV publisher and receiver. Add KV aware routing example.


Signed-off-by: Neelay Shah <neelays@nvidia.com>
Co-authored-by: aflowers <aflowers@nvidia.com>
Co-authored-by: Ryan McCormick <rmccormick@nvidia.com>
Co-authored-by: hongkuanz <hongkuanz@nvidia.com>
Co-authored-by: Neelay Shah <neelays@nvidia.com>
parent d8aada0b
@@ -5145,6 +5145,7 @@ dependencies = [
"unicode-segmentation",
"uuid 1.13.1",
"validator",
+"xxhash-rust",
]

[[package]]
...
@@ -29,6 +29,10 @@ RUN apt-get update; apt-get install -y gdb protobuf-compiler
RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
ENV PATH="/root/.cargo/bin:${PATH}"

+# 'etcd' is a runtime dependency
+RUN wget https://github.com/etcd-io/etcd/releases/download/v3.5.18/etcd-v3.5.18-linux-amd64.tar.gz && tar -xzf etcd-v3.5.18-linux-amd64.tar.gz
+RUN cp ./etcd-v3.5.18-linux-amd64/etcd* /usr/local/bin/.
+
# Install OpenAI-compatible frontend and its dependencies from triton server
# repository. These are used to have a consistent interface, schema, and FastAPI
# app between Triton Core and Triton Distributed implementations.
@@ -66,6 +70,34 @@ RUN find /opt/tritonserver/python -maxdepth 1 -type f -name \
ARG GENAI_PERF_TAG="r25.01"
RUN pip install "git+https://github.com/triton-inference-server/perf_analyzer.git@${GENAI_PERF_TAG}#subdirectory=genai-perf"
# Working directory
WORKDIR /workspace
COPY runtime /workspace/runtime
RUN cd runtime/rust && \
cargo build --release --locked && cargo doc --no-deps
# Generate C bindings. Note that this is required for TRTLLM backend re-build
COPY llm /workspace/llm
RUN cd llm/rust/ && \
cargo build --release --locked && cargo doc --no-deps
# Install uv and create virtualenv for general use
COPY python-wheel /workspace/python-wheel
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
RUN mkdir /opt/triton && \
uv venv /opt/triton/venv --python 3.12 && \
source /opt/triton/venv/bin/activate && \
cd python-wheel && \
uv build && \
uv pip install dist/triton_distributed_rs*cp312*.whl
# Package the bindings
RUN mkdir -p /opt/triton/llm_binding/wheels && mkdir /opt/triton/llm_binding/lib
RUN cp python-wheel/dist/triton_distributed_rs*cp312*.whl /opt/triton/llm_binding/wheels/.
RUN cp llm/rust/target/release/libtriton_llm_capi.so /opt/triton/llm_binding/lib/.
RUN cp -r llm/rust/libtriton-llm/include /opt/triton/llm_binding/.
# Backend & Framework Specific Installation
ARG FRAMEWORK="STANDARD"
ARG TENSORRTLLM_BACKEND_REPO_TAG=
@@ -73,7 +105,7 @@ ARG TENSORRTLLM_BACKEND_REBUILD=
ENV FRAMEWORK=${FRAMEWORK}
RUN --mount=type=bind,source=./container/deps/requirements.tensorrtllm.txt,target=/tmp/requirements.txt \
    --mount=type=bind,source=./container/deps/clone_tensorrtllm.sh,target=/tmp/clone_tensorrtllm.sh \
-    if [[ "$FRAMEWORK" == "TENSORRTLLM" ]] ; then pip install --timeout=2000 -r /tmp/requirements.txt; /tmp/clone_tensorrtllm.sh --tensorrtllm-backend-repo-tag ${TENSORRTLLM_BACKEND_REPO_TAG} --tensorrtllm-backend-rebuild ${TENSORRTLLM_BACKEND_REBUILD} ; fi
+    if [[ "$FRAMEWORK" == "TENSORRTLLM" ]] ; then pip install --timeout=2000 -r /tmp/requirements.txt; /tmp/clone_tensorrtllm.sh --tensorrtllm-backend-repo-tag ${TENSORRTLLM_BACKEND_REPO_TAG} --tensorrtllm-backend-rebuild ${TENSORRTLLM_BACKEND_REBUILD} --triton-llm-path /opt/triton/llm_binding ; fi

RUN --mount=type=bind,source=./container/deps/requirements.standard.txt,target=/tmp/requirements.txt \
@@ -86,6 +118,24 @@ ENV LD_LIBRARY_PATH=${FRAMEWORK_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}
ENV TENSORRTLLM_BACKEND_REPO_TAG=$TENSORRTLLM_BACKEND_REPO_TAG
ENV TRTLLM_USE_MPI_KVCACHE=${TENSORRTLLM_FRAMEWORK:+"1"}
# TODO set VLLM Version
# ENV VLLM_VERSION
ARG VLLM_FRAMEWORK
# DEFAULT VLLM VARIABLES
# ENV VLLM_ATTENTION_BACKEND=${VLLM_FRAMEWORK:+FLASHINFER}
ENV VLLM_WORKER_MULTIPROC_METHOD=${VLLM_FRAMEWORK:+spawn}
ENV VLLM_TORCH_HOST=${VLLM_FRAMEWORK:+localhost}
ENV VLLM_TORCH_PORT=${VLLM_FRAMEWORK:+36183}
ENV VLLM_DATA_PLANE_BACKEND=${VLLM_FRAMEWORK:+nccl}
ENV VLLM_BASELINE_WORKERS=${VLLM_FRAMEWORK:+0}
ENV VLLM_CONTEXT_WORKERS=${VLLM_FRAMEWORK:+1}
ENV VLLM_GENERATE_WORKERS=${VLLM_FRAMEWORK:+1}
ENV VLLM_BASELINE_TP_SIZE=${VLLM_FRAMEWORK:+1}
ENV VLLM_CONTEXT_TP_SIZE=${VLLM_FRAMEWORK:+1}
ENV VLLM_GENERATE_TP_SIZE=${VLLM_FRAMEWORK:+1}
ENV VLLM_KV_CAPI_PATH="/opt/triton/llm_binding/lib/libtriton_llm_capi.so"
ENV PYTHONUNBUFFERED=1
# Install NATS - pointing toward NATS github instead of binaries.nats.dev due to server instability
RUN wget https://github.com/nats-io/nats-server/releases/download/v2.10.24/nats-server-v2.10.24-amd64.deb && dpkg -i nats-server-v2.10.24-amd64.deb
@@ -131,7 +181,7 @@ RUN mkdir /opt/triton && \
# Install triton_distributed_rs wheel globally in container for tests that
# currently run without virtual environment activated.
# TODO: In future, we may use a virtualenv for everything and remove this.
-RUN pip install runtime/rust/python-wheel/dist/triton_distributed_rs*cp312*.whl
+RUN pip install /opt/triton/llm_binding/wheels/triton_distributed_rs*cp312*.whl

COPY icp /workspace/icp
RUN /workspace/icp/protos/gen_python.sh
...
@@ -61,11 +61,11 @@ TENSORRTLLM_BASE_IMAGE=nvcr.io/nvidia/tritonserver
TENSORRTLLM_BASE_IMAGE_TAG=${TENSORRTLLM_BASE_VERSION}-trtllm-python-py3
# IMPORTANT NOTE: Ensure the repo tag complies with the TRTLLM backend version
# used in the base image above.
-TENSORRTLLM_BACKEND_REPO_TAG=v0.17.0
+TENSORRTLLM_BACKEND_REPO_TAG=triton-llm/v0.17.0
# Set this as 1 to rebuild and replace trtllm backend bits in the container.
# This will allow building triton distributed container image with custom
# trt-llm backend repo branch.
-TENSORRTLLM_BACKEND_REBUILD=0
+TENSORRTLLM_BACKEND_REBUILD=1
# vllm installation is done later in the Dockerfile so it will overwrite the
# vllm version installed in the base image.
...
@@ -16,6 +16,7 @@
TENSORRTLLM_BACKEND_REPO_TAG=
TENSORRTLLM_BACKEND_REBUILD=
+TRITON_LLM_PATH=
GIT_TOKEN=
GIT_REPO=
@@ -42,6 +43,14 @@ get_options() {
                missing_requirement $1
            fi
            ;;
+        --triton-llm-path)
+            if [ "$2" ]; then
+                TRITON_LLM_PATH=$2
+                shift
+            else
+                missing_requirement $1
+            fi
+            ;;
        --git-token)
            if [ "$2" ]; then
                GIT_TOKEN=$2
@@ -138,7 +147,7 @@ if [ ! -z ${TENSORRTLLM_BACKEND_REBUILD} ]; then
    # Build the backend
    (cd inflight_batcher_llm/src \
-        && cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install -DUSE_CXX11_ABI=1 .. \
+        && cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install -DUSE_CXX11_ABI=1 -DTRITON_LLM_PATH=$TRITON_LLM_PATH .. \
        && make install \
        && cp libtriton_tensorrtllm.so /opt/tritonserver/backends/tensorrtllm/ \
        && cp trtllmExecutorWorker /opt/tritonserver/backends/tensorrtllm/ \
...
@@ -53,6 +53,308 @@ index 9ba49757..7e871521 100644
class CompilationLevel:
    # constants for the levels of the compilation process
diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py
index 359b5b26..d52ee050 100644
--- a/vllm/core/block/cpu_gpu_block_allocator.py
+++ b/vllm/core/block/cpu_gpu_block_allocator.py
@@ -6,6 +6,7 @@ from vllm.core.block.interfaces import (Block, BlockAllocator, BlockId,
DeviceAwareBlockAllocator)
from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator
from vllm.core.block.prefix_caching_block import PrefixCachingBlockAllocator
+from vllm.core.event_manager import KVCacheEventManager
from vllm.platforms import current_platform
from vllm.utils import Device
@@ -28,6 +29,7 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
num_gpu_blocks: int,
num_cpu_blocks: int,
block_size: int,
+ event_manager: Optional[KVCacheEventManager] = None,
) -> DeviceAwareBlockAllocator:
"""Creates a CpuGpuBlockAllocator instance with the specified
configuration.
@@ -64,6 +66,7 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
cpu_block_ids = block_ids[num_gpu_blocks:]
if allocator_type == "naive":
+ assert event_manager is None, "Event API not supported with naive allocator."
gpu_allocator: BlockAllocator = NaiveBlockAllocator(
create_block=NaiveBlock, # type: ignore
num_blocks=num_gpu_blocks,
@@ -82,12 +85,14 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
num_blocks=num_gpu_blocks,
block_size=block_size,
block_ids=gpu_block_ids,
+ event_manager=event_manager,
)
cpu_allocator = PrefixCachingBlockAllocator(
num_blocks=num_cpu_blocks,
block_size=block_size,
block_ids=cpu_block_ids,
+ event_manager=event_manager,
)
else:
raise ValueError(f"Unknown allocator type {allocator_type=}")
@@ -95,10 +100,12 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
return CpuGpuBlockAllocator(
cpu_block_allocator=cpu_allocator,
gpu_block_allocator=gpu_allocator,
+ event_manager=event_manager,
)
def __init__(self, cpu_block_allocator: BlockAllocator,
- gpu_block_allocator: BlockAllocator):
+ gpu_block_allocator: BlockAllocator,
+ event_manager: Optional[KVCacheEventManager] = None,):
assert not (
cpu_block_allocator.all_block_ids
& gpu_block_allocator.all_block_ids
@@ -108,6 +115,7 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
Device.CPU: cpu_block_allocator,
Device.GPU: gpu_block_allocator,
}
+ self.event_manager = event_manager
self._swap_mapping: Dict[int, int] = {}
self._null_block: Optional[Block] = None
diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py
index 1ca9e49d..b1591c0c 100644
--- a/vllm/core/block/prefix_caching_block.py
+++ b/vllm/core/block/prefix_caching_block.py
@@ -4,7 +4,7 @@ import sys
from bisect import bisect_left
from os.path import commonprefix
from typing import (Callable, Dict, FrozenSet, Iterable, List, Optional, Set,
- Tuple)
+ Tuple, TYPE_CHECKING)
from vllm.core.block.common import (CacheMetricData, CopyOnWriteTracker,
get_all_blocks_recursively)
@@ -23,6 +23,9 @@ PrefixHash = int
# then we know this block hasn't been accessed yet.
_DEFAULT_LAST_ACCESSED_TIME = -1
+if TYPE_CHECKING:
+ from vllm.core.event_manager import KVCacheEventManager
+
logger = init_logger(__name__)
@@ -80,6 +83,7 @@ class PrefixCachingBlockAllocator(BlockAllocator):
block_size: int,
block_ids: Optional[Iterable[int]] = None,
eviction_policy: EvictionPolicy = EvictionPolicy.LRU,
+ event_manager: Optional["KVCacheEventManager"] = None,
):
if block_ids is None:
block_ids = range(num_blocks)
@@ -131,6 +135,9 @@ class PrefixCachingBlockAllocator(BlockAllocator):
self.metric_data = CacheMetricData()
+ self.event_manager = event_manager
+
+ # Implements Block.Factory.
def _create_block(
self,
prev_block: Optional[Block],
@@ -337,6 +344,9 @@ class PrefixCachingBlockAllocator(BlockAllocator):
assert self._refcounter.get(_block_id) == 0
assert _block_id == block_id
+ if self.event_manager:
+ self.event_manager.enqueue_removed_event(content_hash_to_evict)
+
self._cached_blocks.pop(content_hash_to_evict)
self._refcounter.incr(block_id)
@@ -513,6 +523,10 @@ class PrefixCachingBlockAllocator(BlockAllocator):
# Mark this block as touched so that it can be marked as
# computed after the entire batch of sequences are scheduled.
self._touched_blocks.add(block.block_id)
+
+ if self.event_manager:
+ self.event_manager.enqueue_stored_event(block.prev_block, block)
+
return block.block_id
# Reuse the cached content hash
diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py
index c5b3b04f..8a483aa2 100644
--- a/vllm/core/block_manager.py
+++ b/vllm/core/block_manager.py
@@ -9,10 +9,12 @@ from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator
from vllm.core.block.interfaces import Block
from vllm.core.block.prefix_caching_block import (ComputedBlocksTracker,
LastAccessBlocksTracker)
+from vllm.core.event_manager import KVCacheEventManager
from vllm.core.block.utils import check_no_caching_or_swa_for_blockmgr_encdec
from vllm.core.interfaces import AllocStatus, BlockSpaceManager
from vllm.sequence import Sequence, SequenceGroup, SequenceStatus
from vllm.utils import Device
+from vllm.envs import VLLM_WORKER_ID, VLLM_KV_CAPI_PATH
SeqId = int
EncoderSeqId = str
@@ -60,6 +62,7 @@ class SelfAttnBlockSpaceManager(BlockSpaceManager):
def __init__(
self,
+ model_name: str,
block_size: int,
num_gpu_blocks: int,
num_cpu_blocks: int,
@@ -91,11 +94,17 @@ class SelfAttnBlockSpaceManager(BlockSpaceManager):
self.watermark_blocks = int(watermark * num_gpu_blocks)
+ if VLLM_WORKER_ID is not None and VLLM_KV_CAPI_PATH is not None:
+ self.event_manager = KVCacheEventManager(model_name, worker_id=str(VLLM_WORKER_ID).encode(), lib_path=VLLM_KV_CAPI_PATH)
+ else:
+ self.event_manager = None
+
self.block_allocator = CpuGpuBlockAllocator.create(
allocator_type="prefix_caching" if enable_caching else "naive",
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=num_cpu_blocks,
block_size=block_size,
+ event_manager=self.event_manager,
)
self.block_tables: Dict[SeqId, BlockTable] = {}
diff --git a/vllm/core/event_manager.py b/vllm/core/event_manager.py
new file mode 100644
index 00000000..4aa90a4a
--- /dev/null
+++ b/vllm/core/event_manager.py
@@ -0,0 +1,89 @@
+from typing import Optional
+import logging
+from vllm.core.block.prefix_caching_block import PrefixCachingBlock, PrefixHash
+
+import ctypes
+from ctypes import c_char_p, c_uint32, c_void_p, c_size_t
+import uuid
+
+logger = logging.getLogger(__name__)
+
+class TritonResult:
+ OK = 0
+ ERR = 1
+
+class KVCacheEventManager:
+ def __init__(self, model_name: str, worker_id: bytes, lib_path: str):
+ self.lib = None
+
+ try:
+ self.lib = ctypes.CDLL(lib_path)
+ self.lib.triton_llm_init.argtypes = [c_char_p, c_char_p]
+ self.lib.triton_llm_init.restype = c_uint32
+
+ result = self.lib.triton_llm_init(model_name.encode(), worker_id)
+ if result == TritonResult.OK:
+ logger.info("KVCacheEventManager initialized successfully. Ready to publish KV Cache Events")
+ else:
+ logger.info("KVCacheEventManager initialization failed!")
+
+ except Exception as e:
+ print(f"Failed to load {lib_path}")
+ raise e
+
+ self.lib.triton_kv_event_publish_stored.argtypes = [
+ ctypes.c_uint64, # event_id
+ ctypes.POINTER(ctypes.c_uint32), # token_ids
+ ctypes.POINTER(ctypes.c_size_t), # num_block_tokens
+ ctypes.POINTER(ctypes.c_uint64), # block_ids
+ ctypes.c_size_t, # num_blocks
+ ctypes.POINTER(ctypes.c_uint64), # parent_hash
+ ctypes.c_uint64, # lora_id
+ ]
+ self.lib.triton_kv_event_publish_stored.restype = ctypes.c_uint32 # triton_llm_result_t
+
+ self.lib.triton_kv_event_publish_removed.argtypes = [
+ ctypes.c_uint64, # event_id
+ ctypes.POINTER(ctypes.c_uint64), # block_ids
+ ctypes.c_size_t, # num_blocks
+ ]
+ self.lib.triton_kv_event_publish_removed.restype = ctypes.c_uint32 # triton_llm_result_t
+
+ self.event_id_counter = 0
+
+ def enqueue_stored_event(self, parent: Optional[PrefixCachingBlock], block: PrefixCachingBlock):
+ token_ids_arr = (ctypes.c_uint32 * len(block.token_ids))(*block.token_ids)
+ num_block_tokens = (ctypes.c_size_t * 1)(len(block.token_ids))
+ block_hash = (ctypes.c_uint64 * 1)(block.content_hash)
+ parent_hash = ((ctypes.c_uint64 * 1)(parent.content_hash) if parent is not None else None)
+
+ # Publish the event
+ result = self.lib.triton_kv_event_publish_stored(
+ self.event_id_counter, # uint64_t event_id
+ token_ids_arr, # const uint32_t *token_ids
+ num_block_tokens, # const uintptr_t *num_block_tokens
+ block_hash, # const uint64_t *block_ids
+ 1, # uintptr_t num_blocks
+ parent_hash, # const uint64_t *parent_hash
+ 0, # uint64_t lora_id
+ )
+
+ if result == TritonResult.OK:
+ logger.debug(f"Store - Published KV Event: {block.content_hash}")
+ else:
+ logger.debug(f"Store - Failed to Publish KV Event: {block.content_hash}")
+
+ self.event_id_counter += 1
+
+ def enqueue_removed_event(self, block_hash: PrefixHash):
+ result = self.lib.triton_kv_event_publish_removed(
+ self.event_id_counter,
+ (ctypes.c_uint64 * 1)(block_hash),
+ 1,)
+
+ if result == TritonResult.OK:
+ logger.debug(f"Remove - Published KV Event: {block_hash}")
+ else:
+ logger.debug(f"Remove - Failed to Publish KV Event: {block_hash}")
+
+ self.event_id_counter += 1
\ No newline at end of file
diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index f507847a..6af77646 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -10,7 +10,7 @@ from typing import Callable, Deque, Dict, Iterable, List, Optional
from typing import Sequence as GenericSequence
from typing import Set, Tuple, Union
-from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
+from vllm.config import ModelConfig, CacheConfig, LoRAConfig, SchedulerConfig
from vllm.core.interfaces import AllocStatus, BlockSpaceManager
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
@@ -325,12 +325,14 @@ class Scheduler:
def __init__(
self,
+ model_config: ModelConfig,
scheduler_config: SchedulerConfig,
cache_config: CacheConfig,
lora_config: Optional[LoRAConfig],
pipeline_parallel_size: int = 1,
output_proc_callback: Optional[Callable] = None,
) -> None:
+ self.model_config = model_config
self.scheduler_config = scheduler_config
self.cache_config = cache_config
# Note for LoRA scheduling: the current policy is extremely
@@ -356,6 +358,7 @@ class Scheduler:
# Create the block space manager.
self.block_manager = BlockSpaceManagerImpl(
+ model_name=self.model_config.served_model_name,
block_size=self.cache_config.block_size,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=num_cpu_blocks,
diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py
index fe480533..b768e03c 100644
--- a/vllm/distributed/kv_transfer/kv_connector/factory.py
...
@@ -889,6 +1191,47 @@ index 321902d1..b8937ef8 100644
def ensure_model_parallel_initialized(
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index d82d9ad9..542ccfe8 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -348,7 +348,7 @@ class LLMEngine:
# GPU and CPU blocks, which are profiled in the distributed executor.
self.scheduler = [
Scheduler(
- self.scheduler_config, self.cache_config, self.lora_config,
+ self.model_config, self.scheduler_config, self.cache_config, self.lora_config,
self.parallel_config.pipeline_parallel_size,
self.async_callbacks[v_id]
if self.model_config.use_async_output_proc else None)
diff --git a/vllm/envs.py b/vllm/envs.py
index 745b068b..438142e3 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -87,6 +87,8 @@ if TYPE_CHECKING:
VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = False
VLLM_RAY_PER_WORKER_GPUS: float = 1.0
VLLM_RAY_BUNDLE_INDICES: str = ""
+ VLLM_KV_CAPI_PATH: Optional[str] = None
+ VLLM_WORKER_ID: Optional[str] = None
def get_default_cache_root():
@@ -572,6 +574,14 @@ environment_variables: Dict[str, Callable[[], Any]] = {
# models the alignment is already naturally aligned to 256 bytes.
"VLLM_CUDA_MEM_ALIGN_KV_CACHE":
lambda: bool(int(os.getenv("VLLM_CUDA_MEM_ALIGN_KV_CACHE", "1"))),
+
+ # Path to the C API Library
+ "VLLM_KV_CAPI_PATH":
+ lambda: os.environ.get("VLLM_KV_CAPI_PATH", None),
+
+ # Worker ID used for identifying workers in distributed settings
+ "VLLM_WORKER_ID":
+ lambda: os.getenv("VLLM_WORKER_ID", None),
}
# end-env-vars-definition
diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index 773f5abe..3eefd266 100644
--- a/vllm/model_executor/models/deepseek_v2.py
...
@@ -243,3 +243,45 @@ Zhang. Distserve: Disaggregating prefill and decoding for goodput-optimized larg
model serving. *arXiv:2401.09670v3 [cs.DC]*, 2024.

For more details on Triton Distributed, see the [Hello World example](../../hello_world/) and [Triton Inference Server documentation](https://github.com/triton-inference-server/server).
# KV Aware Routing with TensorRT-LLM
This example also showcases smart routing based on per-worker KV cache usage in the aggregated scenario.
To start a KV aware deployment with 2 decode workers:
```bash
export HOSTNAME=localhost
export MODEL_NAME="llama-3.1-8b-instruct"
python3 /workspace/examples/python/llm/tensorrtllm/deploy/launch_workers.py \
--generate-worker-count 2 \
--model ${MODEL_NAME} \
--initialize-request-plane \
--kv-aware-routing \
--request-plane-uri ${HOSTNAME}:4222 &
```
Next, start the API server:

```bash
python3 -m llm.api_server \
--tokenizer meta-llama/Llama-3.1-8B-Instruct \
--request-plane-uri ${HOSTNAME}:4222 \
--api-server-host ${HOSTNAME} \
--model-name ${MODEL_NAME} &
```
Finally, send a test request:

```bash
curl ${HOSTNAME}:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "llama-3.1-8b-instruct",
"messages": [
{"role": "user", "content": "Why is Roger Federer the greatest tennis player of all time? Roger Federer is widely regarded as one of the greatest tennis players of all time, and many consider him the greatest."}
],
"temperature": 0,
"top_p": 0.95,
"max_tokens": 25,
"stream": true,
"n": 1,
"frequency_penalty": 0.0,
"stop": []
}'
```
@@ -19,6 +19,7 @@ import time
from pathlib import Path

from llm.tensorrtllm.operators.disaggregated_serving import DisaggregatedServingOperator
+from llm.tensorrtllm.operators.kv_aware_routing import KvAwareRoutingOperator
from llm.tensorrtllm.scripts.gpu_info import get_gpu_product_name

from triton_distributed.runtime import (
@@ -62,6 +63,18 @@ def _create_disaggregated_serving_op(name, args, max_inflight_requests):
    )
def _create_kv_aware_routing_op(name, args, max_inflight_requests):
model_repository = str(
Path(args.operator_repository) / "triton_core_models"
) # stores our simple pre/post processing
return OperatorConfig(
name=name,
implementation=KvAwareRoutingOperator,
max_inflight_requests=int(max_inflight_requests),
repository=model_repository,
)
def _create_triton_core_op(
    name,
    max_inflight_requests,
@@ -86,6 +99,7 @@ def _create_triton_core_op(
            "parameters": {
                "participant_ids": {"string_value": f"{args.gpu_device_id}"},
                "gpu_device_ids": {"string_value": f"{args.gpu_device_id}"},
+                "event_buffer_max_size": {"string_value": "1024"},
            }
        },
    },
@@ -159,6 +173,22 @@ def main(args):
            request_plane_args=([], {"request_plane_uri": args.request_plane_uri}),
        )
        worker_configs.append(prefill_decode)
elif args.worker_type == "kv-aware-routing":
print("Creating KvAwareRouting Operator")
router_op = _create_kv_aware_routing_op(
name=args.model,
max_inflight_requests=1000,
args=args,
)
router = WorkerConfig(
operators=[router_op],
name=args.worker_name,
log_level=args.log_level,
metrics_port=args.metrics_port,
request_plane_args=([], {"request_plane_uri": args.request_plane_uri}),
)
worker_configs.append(router)
print("Starting Worker") print("Starting Worker")
for worker_config in worker_configs: for worker_config in worker_configs:
......
@@ -43,50 +43,51 @@ for sig in signals:
def _launch_mpi_workers(args):
-    if (
-        args.context_worker_count == 1
-        or args.generate_worker_count == 1
-        or args.aggregate_worker_count == 1
-    ):
-        command = [
-            "mpiexec",
-            "--allow-run-as-root",
-            "--oversubscribe",
-            "--display-map",
-            "--verbose",
-        ]
-
-        if args.log_dir:
-            WORKER_LOG_DIR = str(Path(args.log_dir) / "workers")
-            command += ["--output-filename", WORKER_LOG_DIR]
-
-        aggregate_gpus = args.context_worker_count + args.generate_worker_count
-        for index in range(args.context_worker_count):
-            starting_gpu = index * aggregate_gpus
-            command.extend(_context_cmd(args, starting_gpu))
-            command.append(":")
-
-        for index in range(args.generate_worker_count):
-            starting_gpu = index * aggregate_gpus + args.context_worker_count
-            command.extend(_generate_cmd(args, starting_gpu))
-            command.append(":")
-
-        for index in range(args.aggregate_worker_count):
-            starting_gpu = index * aggregate_gpus + args.context_worker_count
-            command.extend(_aggregate_cmd(args, starting_gpu))
-            command.append(":")
-
-        command = command[0:-1]
-        print(" ".join(command))
-        if args.dry_run:
-            return
-        env = os.environ.copy()
-        return subprocess.Popen(command, env=env, stdin=subprocess.DEVNULL)
-    else:
-        raise ValueError("Only supporting 1 worker each for now")
+    command = [
+        "mpiexec",
+        "--allow-run-as-root",
+        "--oversubscribe",
+        "--display-map",
+        "--verbose",
+    ]
+
+    if args.log_dir:
+        WORKER_LOG_DIR = str(Path(args.log_dir) / "workers")
+        command += ["--output-filename", WORKER_LOG_DIR]
+
+    aggregate_gpus = 0
+
+    # [TODO] below placements assume model to be TP/PP 1
+    gpu_count_per_context_worker = 1
+    gpu_count_per_generate_worker = 1
+    gpu_count_per_aggreate_worker = 1
+
+    for index in range(args.context_worker_count):
+        starting_gpu = aggregate_gpus
+        command.extend(_context_cmd(args, index, starting_gpu))
+        command.append(":")
+        aggregate_gpus += gpu_count_per_context_worker
+
+    for index in range(args.generate_worker_count):
+        starting_gpu = aggregate_gpus
+        command.extend(_generate_cmd(args, index, starting_gpu))
+        command.append(":")
+        aggregate_gpus += gpu_count_per_generate_worker
+
+    for index in range(args.aggregate_worker_count):
+        starting_gpu = aggregate_gpus
+        command.extend(_aggregate_cmd(args, index, starting_gpu))
+        command.append(":")
+        aggregate_gpus += gpu_count_per_aggreate_worker
+
+    command = command[0:-1]
+    print(" ".join(command))
+    if args.dry_run:
+        return
+
+    env = os.environ.copy()
+    return subprocess.Popen(command, env=env, stdin=subprocess.DEVNULL)
def _launch_disagg_model(args):
@@ -104,21 +105,43 @@ def _launch_disagg_model(args):
    return subprocess.Popen(command, env=env, stdin=subprocess.DEVNULL)
def _launch_kv_aware_model(args):
if not args.kv_aware_routing:
return
starting_gpu = 0
env = os.environ.copy()
command = _kv_aware_routing_cmd(args, starting_gpu)
print(" ".join(command))
if args.dry_run:
return
return subprocess.Popen(command, env=env, stdin=subprocess.DEVNULL)
def _launch_workers(args):
    # Launch nats-server if requested by user for convenience, otherwise
    # it can be started separately beforehand.
    if args.initialize_request_plane:
        _launch_nats_server(args)

+    # [FIXME] not really related to request plane
+    _launch_etcd(args)
+
    # Launch TRT-LLM models via mpiexec in the same MPI WORLD
    _launch_mpi_workers(args)

+    # [FIXME] below should be "one of" or merged together
    # Launch disaggregated serving "workflow" model to interface
    # client-facing requests with Triton Distributed deployment.
    _launch_disagg_model(args)
# Launch KV aware routing "workflow" model to interface
# client-facing requests with Triton Distributed deployment.
_launch_kv_aware_model(args)
-def _context_cmd(args, starting_gpu):
+def _context_cmd(args, index, starting_gpu):
    # Hard-coded worker name for internal communication,
    # see tensorrtllm.deploy script
    worker_name = "context"
@@ -141,7 +164,7 @@ def _context_cmd(args, starting_gpu):
        "--gpu-device-id",
        f"{starting_gpu}",
        "--metrics-port",
-        "50000",
+        str(50100 + index),
        "--initialize-request-plane",
        "--request-plane-uri",
        f"{os.getenv('HOSTNAME')}:{args.nats_port}",
@@ -150,7 +173,7 @@
    return command

-def _generate_cmd(args, starting_gpu):
+def _generate_cmd(args, index, starting_gpu):
    # Hard-coded worker name for internal communication
    # see tensorrtllm.deploy script
    worker_name = "generate"
@@ -173,7 +196,7 @@
        "--gpu-device-id",
        f"{starting_gpu}",
        "--metrics-port",
-        "50001",
+        str(50200 + index),
        "--request-plane-uri",
        f"{os.getenv('HOSTNAME')}:{args.nats_port}",
    ]
@@ -181,7 +204,7 @@
    return command

-def _aggregate_cmd(args, starting_gpu):
+def _aggregate_cmd(args, index, starting_gpu):
    # Hard-coded worker name for internal communication
    # see tensorrtllm.deploy script
    worker_name = "aggregate"
@@ -204,7 +227,7 @@
        "--gpu-device-id",
        f"{starting_gpu}",
        "--metrics-port",
-        "50001",
+        str(50300 + index),
        "--request-plane-uri",
        f"{os.getenv('HOSTNAME')}:{args.nats_port}",
    ]
@@ -239,6 +262,33 @@ def _disaggregated_serving_cmd(args, starting_gpu):
    return command
def _kv_aware_routing_cmd(args, starting_gpu):
# NOTE: This worker gets the args --worker-name because it will
# receive the API-serving facing requests, and internally handle
# the disaggregation. So this worker name should match the one
# registered to the API Server.
command = [
# FIXME: Does this model need a GPU assigned to it?
# "-x",
# f"CUDA_VISIBLE_DEVICES={starting_gpu}",
"python3",
"-m",
"llm.tensorrtllm.deploy",
"--worker-type",
"kv-aware-routing",
"--metrics-port",
"50002",
"--model",
args.model,
"--worker-name",
args.worker_name,
"--request-plane-uri",
f"{os.getenv('HOSTNAME')}:{args.nats_port}",
]
return command
def _launch_nats_server(args, clear_store=True):
    # FIXME: Use NatsServer object defined in icp package
    store_dir = "/tmp/nats_store"
@@ -262,6 +312,19 @@ def _launch_nats_server(args, clear_store=True):
    return subprocess.Popen(command, env=env, stdin=subprocess.DEVNULL)
def _launch_etcd(args):
command = [
"/usr/local/bin/etcd",
]
print(" ".join(command))
if args.dry_run:
return
env = os.environ.copy()
return subprocess.Popen(command, env=env, stdin=subprocess.DEVNULL)
if __name__ == "__main__":
    args = parse_args()
    _launch_workers(args)
@@ -72,7 +72,13 @@ def parse_args():
        type=str,
        default="aggregate",
        help="Type of worker",
-        choices=["aggregate", "context", "generate", "disaggregated-serving"],
+        choices=[
+            "aggregate",
+            "context",
+            "generate",
+            "disaggregated-serving",
+            "kv-aware-routing",
+        ],
    )
    parser.add_argument("--gpu-device-id", type=int, default=0, help="gpu id")
@@ -155,4 +161,12 @@ def parse_args():
        help="Enable disaggregated serving",
    )
parser.add_argument(
"--kv-aware-routing",
action=argparse.BooleanOptionalAction,
required=False,
default=False,
help="Enable KV aware routing",
)
    return parser.parse_args()
@@ -14,5 +14,6 @@
# limitations under the License.

from llm.tensorrtllm.operators.disaggregated_serving import DisaggregatedServingOperator
+from llm.tensorrtllm.operators.kv_aware_routing import KvAwareRoutingOperator

-__all__ = ["DisaggregatedServingOperator"]
+__all__ = ["DisaggregatedServingOperator", "KvAwareRoutingOperator"]
@@ -30,12 +30,12 @@ class DisaggregatedServingOperator(TritonCoreOperator):
        self,
        name,
        version,
-        triton_core,
        request_plane,
        data_plane,
        parameters,
        repository,
        logger,
+        triton_core,
    ):
        self._prefill = RemoteOperator("context", request_plane, data_plane)
        self._decode = RemoteOperator("generate", request_plane, data_plane)
...
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import asyncio
import json
import numpy
from triton_distributed_rs import DistributedRuntime, KvRouter
from triton_distributed.runtime import (
RemoteInferenceRequest,
RemoteOperator,
TritonCoreOperator,
)
class KvAwareRoutingOperator(TritonCoreOperator):
def __init__(
self,
name,
version,
request_plane,
data_plane,
parameters,
repository,
logger,
triton_core,
):
loop = asyncio.get_running_loop()
self._runtime = DistributedRuntime(loop)
backend = self._runtime.namespace("router").component("generate")
self._router = KvRouter(self._runtime, backend)
self._generate = RemoteOperator("generate", request_plane, data_plane)
self._repository = repository
self._triton_core = triton_core
self._triton_core.register_model_repository(repository)
self._preprocess_model = self._triton_core.load("simple_preprocessing")
self._postprocess_model = self._triton_core.load("simple_postprocessing")
self._logger = logger
self._store_outputs_in_response = True
async def execute(self, requests: list[RemoteInferenceRequest]):
self._logger.debug("Executing KvAwareRouting Request")
background_tasks = []
for request in requests:
task = asyncio.create_task(self._execute_request(request))
background_tasks.append(task)
try:
results = await asyncio.gather(*background_tasks, return_exceptions=True)
for result in results:
if isinstance(result, Exception):
self._logger.exception(
f"Running request execution failed: {result}"
)
else:
self._logger.debug(
f"Request execution completed with result: {result}"
)
except Exception as e:
self._logger.exception(f"Error during request execution: {e}")
async def _execute_request(self, request: RemoteInferenceRequest):
background_tasks = []
sampling_params = {}
response_sender = request.response_sender()
"""Preprocessing"""
self._logger.debug(request)
if "text_input" in request.inputs:
query = request.inputs["text_input"].to_bytes_array()
elif "prompt" in request.inputs:
query = request.inputs["prompt"].to_bytes_array()
elif "prompt" in request.parameters:
query = request.parameters["prompt"]
else:
await response_sender.send(error=f"invalid request {request}", final=True)
return
if "sampling_params" in request.parameters:
sampling_params = json.loads(
request.parameters["sampling_params"].removeprefix("JSON:")
)
if "max_tokens" in request.inputs:
request_output_len = request.inputs["max_tokens"]
elif "max_tokens" in sampling_params:
request_output_len = numpy.array(
[[sampling_params["max_tokens"]]], dtype=numpy.int32
)
input_ids, input_lengths = await self._preprocess(query)
self._logger.debug(input_ids, input_lengths)
# [FIXME] not rate limiting due to metric polling is not supported
# KV aware routing
lora_id = 0
try:
self._generate.component_id = await self._router.schedule(
input_ids[0], lora_id
)
self._logger.debug(f"worker selected: {self._generate.component_id}")
except Exception as e:
if "No worker found" in str(e):
self._generate.component_id = None
self._logger.debug("no eligible worker")
else:
self._logger.exception(f"Error during selecting worker: {e}")
# [TODO] add disaggregated example
"""llm"""
llm_inputs = {}
llm_inputs["input_ids"] = input_ids
llm_inputs["input_lengths"] = input_lengths
llm_inputs["request_output_len"] = request_output_len
async for llm_response in await self._generate.async_infer(
inputs=llm_inputs,
):
self._logger.debug(f"llm response completed: {llm_response}")
background_tasks.append(
asyncio.create_task(
self._send_llm_response(
llm_response,
response_sender,
final=llm_response.final,
)
)
)
try:
results = await asyncio.gather(*background_tasks, return_exceptions=True)
for result in results:
if isinstance(result, Exception):
self._logger.exception(
f"Sending response failed with exception: {result}"
)
else:
self._logger.debug(f"Response sent successfully: {result}")
except Exception as e:
self._logger.exception(f"Error during response sending: {e}")
for output in llm_response.outputs:
del output
async def _preprocess(self, query):
start_ids = None
start_lengths = None
if isinstance(query, str):
query = [[query]]
async for preprocess_response in self._preprocess_model.async_infer(
inputs={"query": query}
):
self._logger.debug(f"Preprocess response completed: {preprocess_response}")
start_ids = numpy.from_dlpack(preprocess_response.outputs["start_ids"])
start_lengths = numpy.from_dlpack(
preprocess_response.outputs["start_lengths"]
)
return start_ids, start_lengths
async def _postprocessing(self, tokens_batch, sequence_lengths):
outputs = []
async for postprocess_response in self._postprocess_model.async_infer(
inputs={"tokens_batch": tokens_batch, "sequence_lengths": sequence_lengths}
):
self._logger.debug(f"Received postprocess response: {postprocess_response}")
output = postprocess_response.outputs["output"].to_string_array()
outputs.append(output)
return outputs
async def _send_llm_response(self, llm_response, response_sender, final):
tokens_batch = numpy.from_dlpack(llm_response.outputs["output_ids"])
self._logger.debug(f"Output ids length: {tokens_batch}")
sequence_length = numpy.from_dlpack(llm_response.outputs["sequence_length"])
output = await self._postprocessing(tokens_batch, sequence_length)
store_outputs_in_response = set()
if self._store_outputs_in_response:
store_outputs_in_response.add("text_output")
await response_sender.send(
outputs={"text_output": output[0]},
final=final,
store_outputs_in_response=store_outputs_in_response,
)
@@ -22,7 +22,7 @@ dynamic_batching {}
parameters {
  key: "tokenizer_dir"
  value: {
-    string_value: "/workspace/examples/llm/tensorrtllm/operators/hf_downloads/llama-3.1-8b-instruct"
+    string_value: "/workspace/examples/python/llm/tensorrtllm/operators/hf_downloads/llama-3.1-8b-instruct"
  }
}
...
@@ -20,7 +20,7 @@ max_batch_size: 1
parameters {
  key: "tokenizer_dir"
  value: {
-    string_value: "/workspace/examples/llm/tensorrtllm/operators/hf_downloads/llama-3.1-8b-instruct"
+    string_value: "/workspace/examples/python/llm/tensorrtllm/operators/hf_downloads/llama-3.1-8b-instruct"
  }
}
...
@@ -25,15 +25,15 @@ KNOWN_MODELS = {
"postprocessing",
"ensemble",
(
-"/workspace/examples/llm/tensorrtllm/operators/triton_core_models/mock",
+"/workspace/examples/python/llm/tensorrtllm/operators/triton_core_models/mock",
"context",
),
(
-"/workspace/examples/llm/tensorrtllm/operators/triton_core_models/mock",
+"/workspace/examples/python/llm/tensorrtllm/operators/triton_core_models/mock",
"generate",
),
(
-"/workspace/examples/llm/tensorrtllm/operators/triton_core_models/mock",
+"/workspace/examples/python/llm/tensorrtllm/operators/triton_core_models/mock",
"tensorrt_llm",
),
],
@@ -94,7 +94,7 @@ KNOWN_MODELS = {
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
-"enable_kv_cache_reuse": "False",
+"enable_kv_cache_reuse": "True",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
@@ -118,6 +118,8 @@ KNOWN_MODELS = {
"float16",
"--paged_kv_cache",
"enable",
+"--use_paged_context_fmha",
+"enable",
],
"max_num_tokens": 16384,
"max_batch_size": 64,
@@ -138,7 +140,7 @@ KNOWN_MODELS = {
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
-"enable_kv_cache_reuse": "False",
+"enable_kv_cache_reuse": "True",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
@@ -174,6 +176,8 @@ KNOWN_MODELS = {
"enable",
"--multiple_profiles",
"enable",
+"--use_paged_context_fmha",
+"enable",
],
"max_num_tokens": 256,
"templates": [
@@ -189,7 +193,7 @@ KNOWN_MODELS = {
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
-"enable_kv_cache_reuse": "False",
+"enable_kv_cache_reuse": "True",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
@@ -225,6 +229,8 @@ KNOWN_MODELS = {
"enable",
"--multiple_profiles",
"enable",
+"--use_paged_context_fmha",
+"enable",
],
"max_num_tokens": 8192,
"templates": [
@@ -240,7 +246,7 @@ KNOWN_MODELS = {
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
-"enable_kv_cache_reuse": "False",
+"enable_kv_cache_reuse": "True",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
@@ -275,6 +281,8 @@ KNOWN_MODELS = {
"enable",
"--reduce_fusion",
"{args.reduce_fusion}",
+"--use_paged_context_fmha",
+"enable",
],
"max_num_tokens": 16384,
"max_batch_size": 512,
@@ -295,7 +303,7 @@ KNOWN_MODELS = {
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
-"enable_kv_cache_reuse": "False",
+"enable_kv_cache_reuse": "True",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
@@ -319,6 +327,8 @@ KNOWN_MODELS = {
"float16",
"--paged_kv_cache",
"enable",
+"--use_paged_context_fmha",
+"enable",
],
"max_batch_size": 64,
"templates": [
@@ -338,7 +348,7 @@ KNOWN_MODELS = {
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
-"enable_kv_cache_reuse": "False",
+"enable_kv_cache_reuse": "True",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
@@ -376,6 +386,8 @@ KNOWN_MODELS = {
"{args.reduce_fusion}",
"--multiple_profiles",
"enable",
+"--use_paged_context_fmha",
+"enable",
],
"max_num_tokens": 8192,
"templates": [
@@ -391,7 +403,7 @@ KNOWN_MODELS = {
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
-"enable_kv_cache_reuse": "False",
+"enable_kv_cache_reuse": "True",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
@@ -429,6 +441,8 @@ KNOWN_MODELS = {
"{args.reduce_fusion}",
"--multiple_profiles",
"enable",
+"--use_paged_context_fmha",
+"enable",
],
"max_num_tokens": 128,
"templates": [
@@ -444,7 +458,7 @@ KNOWN_MODELS = {
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
-"enable_kv_cache_reuse": "False",
+"enable_kv_cache_reuse": "True",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
@@ -480,6 +494,8 @@ KNOWN_MODELS = {
"{args.reduce_fusion}",
"--multiple_profiles",
"enable",
+"--use_paged_context_fmha",
+"enable",
],
"max_num_tokens": 16384,
"templates": [
@@ -497,7 +513,7 @@ KNOWN_MODELS = {
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
-"enable_kv_cache_reuse": "False",
+"enable_kv_cache_reuse": "True",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
...
@@ -23,7 +23,7 @@ from gpu_info import get_gpu_product_name
from huggingface_hub import snapshot_download
from known_models import KNOWN_MODELS

-TARGET_DIR = "/workspace/examples/llm/tensorrtllm/operators"
+TARGET_DIR = "/workspace/examples/python/llm/tensorrtllm/operators"
TENSORRTLLM_EXAMPLE_DIR = "/tensorrtllm_backend/tensorrt_llm/examples"
...
@@ -157,7 +157,87 @@ For disaggregated deployment, you will also need to pass the `kv_ip` and `kv_por
'{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":<rank>,"kv_parallel_size":2,"kv_ip":<master_node_ip>,"kv_port":<kv_port>}'
```

-### 4. Known Issues and Limitations
+### 4. KV Router Deployment
The KV Router is a component that aggregates KV Events from all the workers and maintains a prefix tree of the cached tokens. It makes decisions on which worker to route requests to based on the length of the prefix match and the load on the workers.
You can run the router and workers in separate terminal sessions or use the `kv-router-run.sh` script to launch them all at once in their own tmux sessions.
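
Conceptually, the `prefix` strategy walks a prefix tree keyed by KV block hashes and scores each worker by matched prefix length against its current load. The sketch below is only a mental model of that decision, not the actual Rust `KvRouter` implementation; `ToyKvRouter`, `on_stored`, and `best_worker` are hypothetical names:

```python
# Illustrative sketch of prefix-match routing; NOT the real KvRouter.
from collections import defaultdict


class RadixNode:
    def __init__(self):
        self.children: dict[int, "RadixNode"] = {}
        self.workers: set[str] = set()  # workers that hold this cached block


class ToyKvRouter:
    def __init__(self, workers: list[str]):
        self.root = RadixNode()
        self.load = {w: 0 for w in workers}  # e.g. in-flight requests per worker

    def on_stored(self, worker: str, block_hashes: list[int]) -> None:
        """Apply a 'stored' KV event: extend the tree along the block chain."""
        node = self.root
        for h in block_hashes:
            node = node.children.setdefault(h, RadixNode())
            node.workers.add(worker)

    def best_worker(self, block_hashes: list[int]) -> str:
        """Score each worker = matched prefix length minus current load."""
        matched: dict[str, int] = defaultdict(int)
        node = self.root
        for depth, h in enumerate(block_hashes, start=1):
            if h not in node.children:
                break
            node = node.children[h]
            for w in node.workers:
                matched[w] = depth
        return max(self.load, key=lambda w: matched[w] - self.load[w])


router = ToyKvRouter(["worker-0", "worker-1"])
router.on_stored("worker-0", [101, 202, 303])
print(router.best_worker([101, 202, 999]))  # -> worker-0 (2 matched blocks)
```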
#### Deploying using tmux
The helper script `kv-router-run.sh` will launch the router and workers in their own tmux sessions.
`kv-router-run.sh <number_of_workers> <routing_strategy> [model_name]`
Example:
```bash
# Launch 8 workers with prefix routing strategy and use deepseek-ai/DeepSeek-R1-Distill-Llama-8B as the model
/workspace/examples/python_rs/llm/vllm/kv-router-run.sh 8 prefix deepseek-ai/DeepSeek-R1-Distill-Llama-8B
# List tmux sessions
tmux ls
# Attach to the tmux sessions
tmux a -t v-1 # worker 1 - use ctrl + b, then d to detach
tmux a -t v-router # kv router - use ctrl + b, then d to detach
# Close the tmux sessions
tmux ls | grep 'v-' | cut -d: -f1 | xargs -I{} tmux kill-session -t {}
```
#### Deploying using separate terminals
**Terminal 1 - Router:**
```bash
# Activate virtual environment
source /opt/triton/venv/bin/activate
# Launch the KV router
cd /workspace/examples/python_rs/llm/vllm
RUST_LOG=info python3 -m kv_router.router \
--routing-strategy prefix
```
You can choose between different routing strategies (a small dispatch sketch follows the list):
- `prefix`: Route requests to the worker that has the longest prefix match.
- `round_robin`: Route requests to the worker in a round-robin manner.
- `random`: Route requests to a random worker.
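
For intuition, the strategies could be dispatched roughly as below; this is a hedged sketch, and `pick_worker`/`prefix_scores` are hypothetical names, not the actual `kv_router.router` API:

```python
import itertools
import random

_rr = itertools.count()  # shared round-robin counter


def pick_worker(strategy: str, workers: list[str], prefix_scores: dict[str, int]) -> str:
    """prefix_scores: matched-prefix length per worker, maintained from KV events."""
    if strategy == "prefix":
        # longest prefix match wins (ties broken arbitrarily)
        return max(workers, key=lambda w: prefix_scores.get(w, 0))
    if strategy == "round_robin":
        return workers[next(_rr) % len(workers)]
    if strategy == "random":
        return random.choice(workers)
    raise ValueError(f"unknown routing strategy: {strategy}")
```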
**Terminal 2 and 3 - Workers:**
```bash
# Activate virtual environment
source /opt/triton/venv/bin/activate
# Launch Worker 1 and Worker 2 with the same command
cd /workspace/examples/python_rs/llm/vllm
RUST_LOG=info python3 -m kv_router.worker \
--model deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--tokenizer deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--enable-prefix-caching \
--block-size 64 \
--max-model-len 16384
```
Note: Prefix caching must be enabled for the KV Router to work.

Note: `--block-size` must be 64, otherwise the router won't work (it currently only accepts 64-token blocks).
**Terminal 4 - Client:**
```bash
# Activate virtual environment
source /opt/triton/venv/bin/activate
# Run client
# We use a long prompt to populate a few KV Blocks (64 tokens each)
# Try running it a few times to see where the router is sending the request
cd /workspace/examples/python_rs/llm/vllm
python3 -m common.client \
--prompt "In the heart of Eldoria, an ancient land of boundless magic and mysterious creatures, lies the long-forgotten city of Aeloria. Once a beacon of knowledge and power, Aeloria was buried beneath the shifting sands of time, lost to the world for centuries. You are an intrepid explorer, known for your unparalleled curiosity and courage, who has stumbled upon an ancient map hinting at ests that Aeloria holds a secret so profound that it has the potential to reshape the very fabric of reality. Your journey will take you through treacherous deserts, enchanted forests, and across perilous mountain ranges. Your Task: Character Background: Develop a detailed background for your character. Describe their motivations for seeking out Aeloria, their skills and weaknesses, and any personal connections to the ancient city or its legends. Are they driven by a quest for knowledge, a search for lost familt clue is hidden." \
--component preprocess \
--max-tokens 10 \
--temperature 0.5
```
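
To see why this prompt exercises the router, you can estimate how many full 64-token KV blocks it occupies; this is a back-of-the-envelope sketch, and using the model's Hugging Face tokenizer here is an assumption:

```python
# Rough estimate of the KV blocks a prompt occupies; illustrative only.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Llama-8B")
prompt = "In the heart of Eldoria, an ancient land of boundless magic..."  # the long prompt above
n_tokens = len(tok.encode(prompt))
block_size = 64  # must match --block-size passed to the workers
print(f"{n_tokens} tokens -> {n_tokens // block_size} full KV blocks")
```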
### 5. Known Issues and Limitations
- vLLM does not work well with the `fork` multiprocessing method when TP > 1. This is a known issue; the workaround is to use the `spawn` method instead. See [vLLM issue](https://github.com/vllm-project/vllm/issues/6152).
- The `kv_rank` of a `kv_producer` must be smaller than that of a `kv_consumer`.
...
@@ -25,20 +25,23 @@ from .protocol import Request
@triton_worker()
async def worker(
-    runtime: DistributedRuntime, prompt: str, max_tokens: int, temperature: float
+    runtime: DistributedRuntime,
+    component: str,
+    prompt: str,
+    max_tokens: int,
+    temperature: float,
):
    """
    Instantiate a `backend` client and call the `generate` endpoint
    """
    # get endpoint
-    endpoint = runtime.namespace("triton-init").component("vllm").endpoint("generate")
+    endpoint = (
+        runtime.namespace("triton-init").component(component).endpoint("generate")
+    )

    # create client
    client = await endpoint.client()

+    # list the endpoints
+    print(client.endpoint_ids())
+
    # issue request
    tasks = []
    for _ in range(1):
@@ -66,9 +69,10 @@ if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--prompt", type=str, default="what is the capital of france?")
+    parser.add_argument("--component", type=str, default="vllm")
    parser.add_argument("--max-tokens", type=int, default=10)
    parser.add_argument("--temperature", type=float, default=0.5)

    args = parser.parse_args()
-    asyncio.run(worker(args.prompt, args.max_tokens, args.temperature))
+    asyncio.run(worker(args.component, args.prompt, args.max_tokens, args.temperature))
@@ -22,6 +22,14 @@ class Request(BaseModel):
    sampling_params: dict
class Tokens(BaseModel):
tokens: list[int]
class TokenizedRequest(Request, Tokens):
pass
class PrefillRequest(Request):
    request_id: str
...
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#!/bin/bash
if [ $# -lt 2 ]; then
echo "Usage: $0 <number_of_workers> <routing_strategy> [model_name]"
echo "Error: Must specify at least number of workers and routing strategy"
exit 1
fi
NUM_WORKERS=$1
ROUTING_STRATEGY=$2
MODEL_NAME=${3:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"}
VALID_STRATEGIES=("prefix" "round_robin" "random")
if [[ ! " ${VALID_STRATEGIES[@]} " =~ " ${ROUTING_STRATEGY} " ]]; then
echo "Error: Invalid routing strategy. Must be one of: ${VALID_STRATEGIES[*]}"
exit 1
fi
SESSION_NAME="v"
WORKDIR="/workspace/examples/python_rs/llm/vllm"
INIT_CMD="source /opt/triton/venv/bin/activate && cd $WORKDIR"
ROUTER_CMD="RUST_LOG=info python3 -m kv_router.router \
--routing-strategy $ROUTING_STRATEGY \
--min-workers $NUM_WORKERS "
tmux new-session -d -s "$SESSION_NAME-router"
tmux send-keys -t "$SESSION_NAME-router" "$INIT_CMD && $ROUTER_CMD" C-m
WORKER_CMD="RUST_LOG=info python3 -m kv_router.worker \
--model $MODEL_NAME \
--tokenizer $MODEL_NAME \
--enable-prefix-caching \
--block-size 64 \
--max-model-len 16384 "
for i in $(seq 1 $NUM_WORKERS); do
tmux new-session -d -s "$SESSION_NAME-$i"
done
for i in $(seq 1 $NUM_WORKERS); do
tmux send-keys -t "$SESSION_NAME-$i" "$INIT_CMD && CUDA_VISIBLE_DEVICES=$((i-1)) $WORKER_CMD" C-m
done