Commit 8588e33a authored by GuanLuo, committed by GitHub

feat: Add KV publisher and receiver. Add KV aware routing example.


Signed-off-by: Neelay Shah <neelays@nvidia.com>
Co-authored-by: aflowers <aflowers@nvidia.com>
Co-authored-by: Ryan McCormick <rmccormick@nvidia.com>
Co-authored-by: hongkuanz <hongkuanz@nvidia.com>
Co-authored-by: Neelay Shah <neelays@nvidia.com>
parent d8aada0b
@@ -5145,6 +5145,7 @@ dependencies = [
"unicode-segmentation",
"uuid 1.13.1",
"validator",
+"xxhash-rust",
]

[[package]]
...
@@ -29,6 +29,10 @@ RUN apt-get update; apt-get install -y gdb protobuf-compiler
RUN curl https://sh.rustup.rs -sSf | bash -s -- -y
ENV PATH="/root/.cargo/bin:${PATH}"

+# 'etcd' is a runtime dependency
+RUN wget https://github.com/etcd-io/etcd/releases/download/v3.5.18/etcd-v3.5.18-linux-amd64.tar.gz && tar -xzf etcd-v3.5.18-linux-amd64.tar.gz
+RUN cp ./etcd-v3.5.18-linux-amd64/etcd* /usr/local/bin/.
+
# Install OpenAI-compatible frontend and its dependencies from triton server
# repository. These are used to have a consistent interface, schema, and FastAPI
# app between Triton Core and Triton Distributed implementations.
@@ -66,6 +70,34 @@ RUN find /opt/tritonserver/python -maxdepth 1 -type f -name \
ARG GENAI_PERF_TAG="r25.01"
RUN pip install "git+https://github.com/triton-inference-server/perf_analyzer.git@${GENAI_PERF_TAG}#subdirectory=genai-perf"
# Working directory
WORKDIR /workspace
COPY runtime /workspace/runtime
RUN cd runtime/rust && \
cargo build --release --locked && cargo doc --no-deps
# Generate C bindings. Note that this is required for TRTLLM backend re-build
COPY llm /workspace/llm
RUN cd llm/rust/ && \
cargo build --release --locked && cargo doc --no-deps
# Install uv and create virtualenv for general use
COPY python-wheel /workspace/python-wheel
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
RUN mkdir /opt/triton && \
uv venv /opt/triton/venv --python 3.12 && \
source /opt/triton/venv/bin/activate && \
cd python-wheel && \
uv build && \
uv pip install dist/triton_distributed_rs*cp312*.whl
# Package the bindings
RUN mkdir -p /opt/triton/llm_binding/wheels && mkdir /opt/triton/llm_binding/lib
RUN cp python-wheel/dist/triton_distributed_rs*cp312*.whl /opt/triton/llm_binding/wheels/.
RUN cp llm/rust/target/release/libtriton_llm_capi.so /opt/triton/llm_binding/lib/.
RUN cp -r llm/rust/libtriton-llm/include /opt/triton/llm_binding/.
# Backend & Framework Specific Installation
ARG FRAMEWORK="STANDARD"
ARG TENSORRTLLM_BACKEND_REPO_TAG=
@@ -73,7 +105,7 @@ ARG TENSORRTLLM_BACKEND_REBUILD=
ENV FRAMEWORK=${FRAMEWORK}
RUN --mount=type=bind,source=./container/deps/requirements.tensorrtllm.txt,target=/tmp/requirements.txt \
    --mount=type=bind,source=./container/deps/clone_tensorrtllm.sh,target=/tmp/clone_tensorrtllm.sh \
-    if [[ "$FRAMEWORK" == "TENSORRTLLM" ]] ; then pip install --timeout=2000 -r /tmp/requirements.txt; /tmp/clone_tensorrtllm.sh --tensorrtllm-backend-repo-tag ${TENSORRTLLM_BACKEND_REPO_TAG} --tensorrtllm-backend-rebuild ${TENSORRTLLM_BACKEND_REBUILD} ; fi
+    if [[ "$FRAMEWORK" == "TENSORRTLLM" ]] ; then pip install --timeout=2000 -r /tmp/requirements.txt; /tmp/clone_tensorrtllm.sh --tensorrtllm-backend-repo-tag ${TENSORRTLLM_BACKEND_REPO_TAG} --tensorrtllm-backend-rebuild ${TENSORRTLLM_BACKEND_REBUILD} --triton-llm-path /opt/triton/llm_binding ; fi

RUN --mount=type=bind,source=./container/deps/requirements.standard.txt,target=/tmp/requirements.txt \
@@ -86,6 +118,24 @@ ENV LD_LIBRARY_PATH=${FRAMEWORK_LD_LIBRARY_PATH}:${LD_LIBRARY_PATH}
ENV TENSORRTLLM_BACKEND_REPO_TAG=$TENSORRTLLM_BACKEND_REPO_TAG
ENV TRTLLM_USE_MPI_KVCACHE=${TENSORRTLLM_FRAMEWORK:+"1"}
# TODO set VLLM Version
# ENV VLLM_VERSION
ARG VLLM_FRAMEWORK
# DEFAULT VLLM VARIABLES
# ENV VLLM_ATTENTION_BACKEND=${VLLM_FRAMEWORK:+FLASHINFER}
ENV VLLM_WORKER_MULTIPROC_METHOD=${VLLM_FRAMEWORK:+spawn}
ENV VLLM_TORCH_HOST=${VLLM_FRAMEWORK:+localhost}
ENV VLLM_TORCH_PORT=${VLLM_FRAMEWORK:+36183}
ENV VLLM_DATA_PLANE_BACKEND=${VLLM_FRAMEWORK:+nccl}
ENV VLLM_BASELINE_WORKERS=${VLLM_FRAMEWORK:+0}
ENV VLLM_CONTEXT_WORKERS=${VLLM_FRAMEWORK:+1}
ENV VLLM_GENERATE_WORKERS=${VLLM_FRAMEWORK:+1}
ENV VLLM_BASELINE_TP_SIZE=${VLLM_FRAMEWORK:+1}
ENV VLLM_CONTEXT_TP_SIZE=${VLLM_FRAMEWORK:+1}
ENV VLLM_GENERATE_TP_SIZE=${VLLM_FRAMEWORK:+1}
ENV VLLM_KV_CAPI_PATH="/opt/triton/llm_binding/lib/libtriton_llm_capi.so"
ENV PYTHONUNBUFFERED=1
# Install NATS - pointing toward NATS github instead of binaries.nats.dev due to server instability
RUN wget https://github.com/nats-io/nats-server/releases/download/v2.10.24/nats-server-v2.10.24-amd64.deb && dpkg -i nats-server-v2.10.24-amd64.deb
@@ -131,7 +181,7 @@ RUN mkdir /opt/triton && \
# Install triton_distributed_rs wheel globally in container for tests that
# currently run without virtual environment activated.
# TODO: In future, we may use a virtualenv for everything and remove this.
-RUN pip install runtime/rust/python-wheel/dist/triton_distributed_rs*cp312*.whl
+RUN pip install /opt/triton/llm_binding/wheels/triton_distributed_rs*cp312*.whl

COPY icp /workspace/icp
RUN /workspace/icp/protos/gen_python.sh
...
@@ -61,11 +61,11 @@ TENSORRTLLM_BASE_IMAGE=nvcr.io/nvidia/tritonserver
TENSORRTLLM_BASE_IMAGE_TAG=${TENSORRTLLM_BASE_VERSION}-trtllm-python-py3
# IMPORTANT NOTE: Ensure the repo tag complies with the TRTLLM backend version
# used in the base image above.
-TENSORRTLLM_BACKEND_REPO_TAG=v0.17.0
+TENSORRTLLM_BACKEND_REPO_TAG=triton-llm/v0.17.0
# Set this as 1 to rebuild and replace trtllm backend bits in the container.
# This will allow building triton distributed container image with custom
# trt-llm backend repo branch.
-TENSORRTLLM_BACKEND_REBUILD=0
+TENSORRTLLM_BACKEND_REBUILD=1
# vllm installation is done later in the Dockerfile so it will overwrite the
# vllm version installed in the base image.
...
@@ -16,6 +16,7 @@
TENSORRTLLM_BACKEND_REPO_TAG=
TENSORRTLLM_BACKEND_REBUILD=
+TRITON_LLM_PATH=
GIT_TOKEN=
GIT_REPO=
@@ -42,6 +43,14 @@ get_options() {
                missing_requirement $1
            fi
            ;;
+        --triton-llm-path)
+            if [ "$2" ]; then
+                TRITON_LLM_PATH=$2
+                shift
+            else
+                missing_requirement $1
+            fi
+            ;;
        --git-token)
            if [ "$2" ]; then
                GIT_TOKEN=$2
@@ -138,7 +147,7 @@ if [ ! -z ${TENSORRTLLM_BACKEND_REBUILD} ]; then
    # Build the backend
    (cd inflight_batcher_llm/src \
-        && cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install -DUSE_CXX11_ABI=1 .. \
+        && cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install -DUSE_CXX11_ABI=1 -DTRITON_LLM_PATH=$TRITON_LLM_PATH .. \
        && make install \
        && cp libtriton_tensorrtllm.so /opt/tritonserver/backends/tensorrtllm/ \
        && cp trtllmExecutorWorker /opt/tritonserver/backends/tensorrtllm/ \
...
@@ -53,6 +53,308 @@ index 9ba49757..7e871521 100644
class CompilationLevel:
    # constants for the levels of the compilation process
diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py
index 359b5b26..d52ee050 100644
--- a/vllm/core/block/cpu_gpu_block_allocator.py
+++ b/vllm/core/block/cpu_gpu_block_allocator.py
@@ -6,6 +6,7 @@ from vllm.core.block.interfaces import (Block, BlockAllocator, BlockId,
DeviceAwareBlockAllocator)
from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator
from vllm.core.block.prefix_caching_block import PrefixCachingBlockAllocator
+from vllm.core.event_manager import KVCacheEventManager
from vllm.platforms import current_platform
from vllm.utils import Device
@@ -28,6 +29,7 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
num_gpu_blocks: int,
num_cpu_blocks: int,
block_size: int,
+ event_manager: Optional[KVCacheEventManager] = None,
) -> DeviceAwareBlockAllocator:
"""Creates a CpuGpuBlockAllocator instance with the specified
configuration.
@@ -64,6 +66,7 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
cpu_block_ids = block_ids[num_gpu_blocks:]
if allocator_type == "naive":
+ assert event_manager is None, "Event API not supported with naive allocator."
gpu_allocator: BlockAllocator = NaiveBlockAllocator(
create_block=NaiveBlock, # type: ignore
num_blocks=num_gpu_blocks,
@@ -82,12 +85,14 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
num_blocks=num_gpu_blocks,
block_size=block_size,
block_ids=gpu_block_ids,
+ event_manager=event_manager,
)
cpu_allocator = PrefixCachingBlockAllocator(
num_blocks=num_cpu_blocks,
block_size=block_size,
block_ids=cpu_block_ids,
+ event_manager=event_manager,
)
else:
raise ValueError(f"Unknown allocator type {allocator_type=}")
@@ -95,10 +100,12 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
return CpuGpuBlockAllocator(
cpu_block_allocator=cpu_allocator,
gpu_block_allocator=gpu_allocator,
+ event_manager=event_manager,
)
def __init__(self, cpu_block_allocator: BlockAllocator,
- gpu_block_allocator: BlockAllocator):
+ gpu_block_allocator: BlockAllocator,
+ event_manager: Optional[KVCacheEventManager] = None,):
assert not (
cpu_block_allocator.all_block_ids
& gpu_block_allocator.all_block_ids
@@ -108,6 +115,7 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
Device.CPU: cpu_block_allocator,
Device.GPU: gpu_block_allocator,
}
+ self.event_manager = event_manager
self._swap_mapping: Dict[int, int] = {}
self._null_block: Optional[Block] = None
diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py
index 1ca9e49d..b1591c0c 100644
--- a/vllm/core/block/prefix_caching_block.py
+++ b/vllm/core/block/prefix_caching_block.py
@@ -4,7 +4,7 @@ import sys
from bisect import bisect_left
from os.path import commonprefix
from typing import (Callable, Dict, FrozenSet, Iterable, List, Optional, Set,
- Tuple)
+ Tuple, TYPE_CHECKING)
from vllm.core.block.common import (CacheMetricData, CopyOnWriteTracker,
get_all_blocks_recursively)
@@ -23,6 +23,9 @@ PrefixHash = int
# then we know this block hasn't been accessed yet.
_DEFAULT_LAST_ACCESSED_TIME = -1
+if TYPE_CHECKING:
+ from vllm.core.event_manager import KVCacheEventManager
+
logger = init_logger(__name__)
@@ -80,6 +83,7 @@ class PrefixCachingBlockAllocator(BlockAllocator):
block_size: int,
block_ids: Optional[Iterable[int]] = None,
eviction_policy: EvictionPolicy = EvictionPolicy.LRU,
+ event_manager: Optional["KVCacheEventManager"] = None,
):
if block_ids is None:
block_ids = range(num_blocks)
@@ -131,6 +135,9 @@ class PrefixCachingBlockAllocator(BlockAllocator):
self.metric_data = CacheMetricData()
+ self.event_manager = event_manager
+
+ # Implements Block.Factory.
def _create_block(
self,
prev_block: Optional[Block],
@@ -337,6 +344,9 @@ class PrefixCachingBlockAllocator(BlockAllocator):
assert self._refcounter.get(_block_id) == 0
assert _block_id == block_id
+ if self.event_manager:
+ self.event_manager.enqueue_removed_event(content_hash_to_evict)
+
self._cached_blocks.pop(content_hash_to_evict)
self._refcounter.incr(block_id)
@@ -513,6 +523,10 @@ class PrefixCachingBlockAllocator(BlockAllocator):
# Mark this block as touched so that it can be marked as
# computed after the entire batch of sequences are scheduled.
self._touched_blocks.add(block.block_id)
+
+ if self.event_manager:
+ self.event_manager.enqueue_stored_event(block.prev_block, block)
+
return block.block_id
# Reuse the cached content hash
diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py
index c5b3b04f..8a483aa2 100644
--- a/vllm/core/block_manager.py
+++ b/vllm/core/block_manager.py
@@ -9,10 +9,12 @@ from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator
from vllm.core.block.interfaces import Block
from vllm.core.block.prefix_caching_block import (ComputedBlocksTracker,
LastAccessBlocksTracker)
+from vllm.core.event_manager import KVCacheEventManager
from vllm.core.block.utils import check_no_caching_or_swa_for_blockmgr_encdec
from vllm.core.interfaces import AllocStatus, BlockSpaceManager
from vllm.sequence import Sequence, SequenceGroup, SequenceStatus
from vllm.utils import Device
+from vllm.envs import VLLM_WORKER_ID, VLLM_KV_CAPI_PATH
SeqId = int
EncoderSeqId = str
@@ -60,6 +62,7 @@ class SelfAttnBlockSpaceManager(BlockSpaceManager):
def __init__(
self,
+ model_name: str,
block_size: int,
num_gpu_blocks: int,
num_cpu_blocks: int,
@@ -91,11 +94,17 @@ class SelfAttnBlockSpaceManager(BlockSpaceManager):
self.watermark_blocks = int(watermark * num_gpu_blocks)
+ if VLLM_WORKER_ID is not None and VLLM_KV_CAPI_PATH is not None:
+ self.event_manager = KVCacheEventManager(model_name, worker_id=str(VLLM_WORKER_ID).encode(), lib_path=VLLM_KV_CAPI_PATH)
+ else:
+ self.event_manager = None
+
self.block_allocator = CpuGpuBlockAllocator.create(
allocator_type="prefix_caching" if enable_caching else "naive",
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=num_cpu_blocks,
block_size=block_size,
+ event_manager=self.event_manager,
)
self.block_tables: Dict[SeqId, BlockTable] = {}
diff --git a/vllm/core/event_manager.py b/vllm/core/event_manager.py
new file mode 100644
index 00000000..4aa90a4a
--- /dev/null
+++ b/vllm/core/event_manager.py
@@ -0,0 +1,89 @@
+from typing import Optional
+import logging
+from vllm.core.block.prefix_caching_block import PrefixCachingBlock, PrefixHash
+
+import ctypes
+from ctypes import c_char_p, c_uint32, c_void_p, c_size_t
+import uuid
+
+logger = logging.getLogger(__name__)
+
+class TritonResult:
+ OK = 0
+ ERR = 1
+
+class KVCacheEventManager:
+ def __init__(self, model_name: str, worker_id: bytes, lib_path: str):
+ self.lib = None
+
+ try:
+ self.lib = ctypes.CDLL(lib_path)
+ self.lib.triton_llm_init.argtypes = [c_char_p, c_char_p]
+ self.lib.triton_llm_init.restype = c_uint32
+
+ result = self.lib.triton_llm_init(model_name.encode(), worker_id)
+ if result == TritonResult.OK:
+ logger.info("KVCacheEventManager initialized successfully. Ready to publish KV Cache Events")
+ else:
+ logger.info("KVCacheEventManager initialization failed!")
+
+ except Exception as e:
+ print(f"Failed to load {lib_path}")
+ raise e
+
+ self.lib.triton_kv_event_publish_stored.argtypes = [
+ ctypes.c_uint64, # event_id
+ ctypes.POINTER(ctypes.c_uint32), # token_ids
+ ctypes.POINTER(ctypes.c_size_t), # num_block_tokens
+ ctypes.POINTER(ctypes.c_uint64), # block_ids
+ ctypes.c_size_t, # num_blocks
+ ctypes.POINTER(ctypes.c_uint64), # parent_hash
+ ctypes.c_uint64, # lora_id
+ ]
+ self.lib.triton_kv_event_publish_stored.restype = ctypes.c_uint32 # triton_llm_result_t
+
+ self.lib.triton_kv_event_publish_removed.argtypes = [
+ ctypes.c_uint64, # event_id
+ ctypes.POINTER(ctypes.c_uint64), # block_ids
+ ctypes.c_size_t, # num_blocks
+ ]
+ self.lib.triton_kv_event_publish_removed.restype = ctypes.c_uint32 # triton_llm_result_t
+
+ self.event_id_counter = 0
+
+ def enqueue_stored_event(self, parent: Optional[PrefixCachingBlock], block: PrefixCachingBlock):
+ token_ids_arr = (ctypes.c_uint32 * len(block.token_ids))(*block.token_ids)
+ num_block_tokens = (ctypes.c_size_t * 1)(len(block.token_ids))
+ block_hash = (ctypes.c_uint64 * 1)(block.content_hash)
+ parent_hash = ((ctypes.c_uint64 * 1)(parent.content_hash) if parent is not None else None)
+
+ # Publish the event
+ result = self.lib.triton_kv_event_publish_stored(
+ self.event_id_counter, # uint64_t event_id
+ token_ids_arr, # const uint32_t *token_ids
+ num_block_tokens, # const uintptr_t *num_block_tokens
+ block_hash, # const uint64_t *block_ids
+ 1, # uintptr_t num_blocks
+ parent_hash, # const uint64_t *parent_hash
+ 0, # uint64_t lora_id
+ )
+
+ if result == TritonResult.OK:
+ logger.debug(f"Store - Published KV Event: {block.content_hash}")
+ else:
+ logger.debug(f"Store - Failed to Publish KV Event: {block.content_hash}")
+
+ self.event_id_counter += 1
+
+ def enqueue_removed_event(self, block_hash: PrefixHash):
+ result = self.lib.triton_kv_event_publish_removed(
+ self.event_id_counter,
+ (ctypes.c_uint64 * 1)(block_hash),
+ 1,)
+
+ if result == TritonResult.OK:
+ logger.debug(f"Remove - Published KV Event: {block_hash}")
+ else:
+ logger.debug(f"Remove - Failed to Publish KV Event: {block_hash}")
+
+ self.event_id_counter += 1
\ No newline at end of file
diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index f507847a..6af77646 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -10,7 +10,7 @@ from typing import Callable, Deque, Dict, Iterable, List, Optional
from typing import Sequence as GenericSequence
from typing import Set, Tuple, Union
-from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
+from vllm.config import ModelConfig, CacheConfig, LoRAConfig, SchedulerConfig
from vllm.core.interfaces import AllocStatus, BlockSpaceManager
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
@@ -325,12 +325,14 @@ class Scheduler:
def __init__(
self,
+ model_config: ModelConfig,
scheduler_config: SchedulerConfig,
cache_config: CacheConfig,
lora_config: Optional[LoRAConfig],
pipeline_parallel_size: int = 1,
output_proc_callback: Optional[Callable] = None,
) -> None:
+ self.model_config = model_config
self.scheduler_config = scheduler_config
self.cache_config = cache_config
# Note for LoRA scheduling: the current policy is extremely
@@ -356,6 +358,7 @@ class Scheduler:
# Create the block space manager.
self.block_manager = BlockSpaceManagerImpl(
+ model_name=self.model_config.served_model_name,
block_size=self.cache_config.block_size,
num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=num_cpu_blocks,
diff --git a/vllm/distributed/kv_transfer/kv_connector/factory.py b/vllm/distributed/kv_transfer/kv_connector/factory.py
index fe480533..b768e03c 100644
--- a/vllm/distributed/kv_transfer/kv_connector/factory.py
...
@@ -889,6 +1191,47 @@ index 321902d1..b8937ef8 100644
def ensure_model_parallel_initialized(
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index d82d9ad9..542ccfe8 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -348,7 +348,7 @@ class LLMEngine:
# GPU and CPU blocks, which are profiled in the distributed executor.
self.scheduler = [
Scheduler(
- self.scheduler_config, self.cache_config, self.lora_config,
+ self.model_config, self.scheduler_config, self.cache_config, self.lora_config,
self.parallel_config.pipeline_parallel_size,
self.async_callbacks[v_id]
if self.model_config.use_async_output_proc else None)
diff --git a/vllm/envs.py b/vllm/envs.py
index 745b068b..438142e3 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -87,6 +87,8 @@ if TYPE_CHECKING:
VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON: bool = False
VLLM_RAY_PER_WORKER_GPUS: float = 1.0
VLLM_RAY_BUNDLE_INDICES: str = ""
+ VLLM_KV_CAPI_PATH: Optional[str] = None
+ VLLM_WORKER_ID: Optional[str] = None
def get_default_cache_root():
@@ -572,6 +574,14 @@ environment_variables: Dict[str, Callable[[], Any]] = {
# models the alignment is already naturally aligned to 256 bytes.
"VLLM_CUDA_MEM_ALIGN_KV_CACHE":
lambda: bool(int(os.getenv("VLLM_CUDA_MEM_ALIGN_KV_CACHE", "1"))),
+
+ # Path to the C API Library
+ "VLLM_KV_CAPI_PATH":
+ lambda: os.environ.get("VLLM_KV_CAPI_PATH", None),
+
+ # Worker ID used for identifying workers in distributed settings
+ "VLLM_WORKER_ID":
+ lambda: os.getenv("VLLM_WORKER_ID", None),
}
# end-env-vars-definition
diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index 773f5abe..3eefd266 100644
--- a/vllm/model_executor/models/deepseek_v2.py
...
@@ -243,3 +243,45 @@ Zhang. Distserve: Disaggregating prefill and decoding for goodput-optimized larg
model serving. *arXiv:2401.09670v3 [cs.DC]*, 2024.

For more details on Triton Distributed, see the [Hello World example](../../hello_world/) and [Triton Inference Server documentation](https://github.com/triton-inference-server/server).
# KV Aware Routing with TensorRT-LLM
This example also showcases smart routing based on per-worker KV cache usage in the aggregated scenario.
To start a KV aware deployment with 2 decode workers:
```bash
export HOSTNAME=localhost
export MODEL_NAME="llama-3.1-8b-instruct"
python3 /workspace/examples/python/llm/tensorrtllm/deploy/launch_workers.py \
--generate-worker-count 2 \
--model ${MODEL_NAME} \
--initialize-request-plane \
--kv-aware-routing \
--request-plane-uri ${HOSTNAME}:4222 &
```
Next, start the API server:

```bash
python3 -m llm.api_server \
--tokenizer meta-llama/Llama-3.1-8B-Instruct \
--request-plane-uri ${HOSTNAME}:4222 \
--api-server-host ${HOSTNAME} \
--model-name ${MODEL_NAME} &
```
Finally, send a test request:

```bash
curl ${HOSTNAME}:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "llama-3.1-8b-instruct",
"messages": [
{"role": "user", "content": "Why is Roger Federer the greatest tennis player of all time? Roger Federer is widely regarded as one of the greatest tennis players of all time, and many consider him the greatest."}
],
"temperature": 0,
"top_p": 0.95,
"max_tokens": 25,
"stream": true,
"n": 1,
"frequency_penalty": 0.0,
"stop": []
}'
```
@@ -19,6 +19,7 @@ import time
from pathlib import Path

from llm.tensorrtllm.operators.disaggregated_serving import DisaggregatedServingOperator
+from llm.tensorrtllm.operators.kv_aware_routing import KvAwareRoutingOperator
from llm.tensorrtllm.scripts.gpu_info import get_gpu_product_name

from triton_distributed.runtime import (
@@ -62,6 +63,18 @@ def _create_disaggregated_serving_op(name, args, max_inflight_requests):
    )
def _create_kv_aware_routing_op(name, args, max_inflight_requests):
model_repository = str(
Path(args.operator_repository) / "triton_core_models"
) # stores our simple pre/post processing
return OperatorConfig(
name=name,
implementation=KvAwareRoutingOperator,
max_inflight_requests=int(max_inflight_requests),
repository=model_repository,
)
def _create_triton_core_op(
    name,
    max_inflight_requests,
@@ -86,6 +99,7 @@ def _create_triton_core_op(
            "parameters": {
                "participant_ids": {"string_value": f"{args.gpu_device_id}"},
                "gpu_device_ids": {"string_value": f"{args.gpu_device_id}"},
+                "event_buffer_max_size": {"string_value": "1024"},
            }
        },
    },
@@ -159,6 +173,22 @@ def main(args):
            request_plane_args=([], {"request_plane_uri": args.request_plane_uri}),
        )
        worker_configs.append(prefill_decode)
elif args.worker_type == "kv-aware-routing":
print("Creating KvAwareRouting Operator")
router_op = _create_kv_aware_routing_op(
name=args.model,
max_inflight_requests=1000,
args=args,
)
router = WorkerConfig(
operators=[router_op],
name=args.worker_name,
log_level=args.log_level,
metrics_port=args.metrics_port,
request_plane_args=([], {"request_plane_uri": args.request_plane_uri}),
)
worker_configs.append(router)
print("Starting Worker") print("Starting Worker")
for worker_config in worker_configs: for worker_config in worker_configs:
......
@@ -43,50 +43,51 @@ for sig in signals:
def _launch_mpi_workers(args):
-    if (
-        args.context_worker_count == 1
-        or args.generate_worker_count == 1
-        or args.aggregate_worker_count == 1
-    ):
-        command = [
-            "mpiexec",
-            "--allow-run-as-root",
-            "--oversubscribe",
-            "--display-map",
-            "--verbose",
-        ]
-
-        if args.log_dir:
-            WORKER_LOG_DIR = str(Path(args.log_dir) / "workers")
-            command += ["--output-filename", WORKER_LOG_DIR]
-
-        aggregate_gpus = args.context_worker_count + args.generate_worker_count
-        for index in range(args.context_worker_count):
-            starting_gpu = index * aggregate_gpus
-            command.extend(_context_cmd(args, starting_gpu))
-            command.append(":")
-
-        for index in range(args.generate_worker_count):
-            starting_gpu = index * aggregate_gpus + args.context_worker_count
-            command.extend(_generate_cmd(args, starting_gpu))
-            command.append(":")
-
-        for index in range(args.aggregate_worker_count):
-            starting_gpu = index * aggregate_gpus + args.context_worker_count
-            command.extend(_aggregate_cmd(args, starting_gpu))
-            command.append(":")
-
-        command = command[0:-1]
-        print(" ".join(command))
-        if args.dry_run:
-            return
-        env = os.environ.copy()
-        return subprocess.Popen(command, env=env, stdin=subprocess.DEVNULL)
-    else:
-        raise ValueError("Only supporting 1 worker each for now")
+    command = [
+        "mpiexec",
+        "--allow-run-as-root",
+        "--oversubscribe",
+        "--display-map",
+        "--verbose",
+    ]
+
+    if args.log_dir:
+        WORKER_LOG_DIR = str(Path(args.log_dir) / "workers")
+        command += ["--output-filename", WORKER_LOG_DIR]
+
+    aggregate_gpus = 0
+
+    # [TODO] below placements assume model to be TP/PP 1
+    gpu_count_per_context_worker = 1
+    gpu_count_per_generate_worker = 1
+    gpu_count_per_aggreate_worker = 1
+
+    for index in range(args.context_worker_count):
+        starting_gpu = aggregate_gpus
+        command.extend(_context_cmd(args, index, starting_gpu))
+        command.append(":")
+        aggregate_gpus += gpu_count_per_context_worker
+
+    for index in range(args.generate_worker_count):
+        starting_gpu = aggregate_gpus
+        command.extend(_generate_cmd(args, index, starting_gpu))
+        command.append(":")
+        aggregate_gpus += gpu_count_per_generate_worker
+
+    for index in range(args.aggregate_worker_count):
+        starting_gpu = aggregate_gpus
+        command.extend(_aggregate_cmd(args, index, starting_gpu))
+        command.append(":")
+        aggregate_gpus += gpu_count_per_aggreate_worker
+
+    command = command[0:-1]
+    print(" ".join(command))
+    if args.dry_run:
+        return
+
+    env = os.environ.copy()
+    return subprocess.Popen(command, env=env, stdin=subprocess.DEVNULL)
def _launch_disagg_model(args):
@@ -104,21 +105,43 @@ def _launch_disagg_model(args):
    return subprocess.Popen(command, env=env, stdin=subprocess.DEVNULL)
def _launch_kv_aware_model(args):
if not args.kv_aware_routing:
return
starting_gpu = 0
env = os.environ.copy()
command = _kv_aware_routing_cmd(args, starting_gpu)
print(" ".join(command))
if args.dry_run:
return
return subprocess.Popen(command, env=env, stdin=subprocess.DEVNULL)
def _launch_workers(args):
    # Launch nats-server if requested by user for convenience, otherwise
    # it can be started separately beforehand.
    if args.initialize_request_plane:
        _launch_nats_server(args)

+    # [FIXME] not really related to request plane
+    _launch_etcd(args)
+
    # Launch TRT-LLM models via mpiexec in the same MPI WORLD
    _launch_mpi_workers(args)

+    # [FIXME] below should be "one of" or merged together
    # Launch disaggregated serving "workflow" model to interface
    # client-facing requests with Triton Distributed deployment.
    _launch_disagg_model(args)
# Launch KV aware routing "workflow" model to interface
# client-facing requests with Triton Distributed deployment.
_launch_kv_aware_model(args)
-def _context_cmd(args, starting_gpu):
+def _context_cmd(args, index, starting_gpu):
    # Hard-coded worker name for internal communication,
    # see tensorrtllm.deploy script
    worker_name = "context"
@@ -141,7 +164,7 @@ def _context_cmd(args, starting_gpu):
        "--gpu-device-id",
        f"{starting_gpu}",
        "--metrics-port",
-        "50000",
+        str(50100 + index),
        "--initialize-request-plane",
        "--request-plane-uri",
        f"{os.getenv('HOSTNAME')}:{args.nats_port}",
@@ -150,7 +173,7 @@
    return command

-def _generate_cmd(args, starting_gpu):
+def _generate_cmd(args, index, starting_gpu):
    # Hard-coded worker name for internal communication
    # see tensorrtllm.deploy script
    worker_name = "generate"
@@ -173,7 +196,7 @@
        "--gpu-device-id",
        f"{starting_gpu}",
        "--metrics-port",
-        "50001",
+        str(50200 + index),
        "--request-plane-uri",
        f"{os.getenv('HOSTNAME')}:{args.nats_port}",
    ]
@@ -181,7 +204,7 @@
    return command

-def _aggregate_cmd(args, starting_gpu):
+def _aggregate_cmd(args, index, starting_gpu):
    # Hard-coded worker name for internal communication
    # see tensorrtllm.deploy script
    worker_name = "aggregate"
@@ -204,7 +227,7 @@
        "--gpu-device-id",
        f"{starting_gpu}",
        "--metrics-port",
-        "50001",
+        str(50300 + index),
        "--request-plane-uri",
        f"{os.getenv('HOSTNAME')}:{args.nats_port}",
    ]
@@ -239,6 +262,33 @@ def _disaggregated_serving_cmd(args, starting_gpu):
    return command
def _kv_aware_routing_cmd(args, starting_gpu):
# NOTE: This worker gets the args --worker-name because it will
# receive the API-serving facing requests, and internally handle
# the disaggregation. So this worker name should match the one
# registered to the API Server.
command = [
# FIXME: Does this model need a GPU assigned to it?
# "-x",
# f"CUDA_VISIBLE_DEVICES={starting_gpu}",
"python3",
"-m",
"llm.tensorrtllm.deploy",
"--worker-type",
"kv-aware-routing",
"--metrics-port",
"50002",
"--model",
args.model,
"--worker-name",
args.worker_name,
"--request-plane-uri",
f"{os.getenv('HOSTNAME')}:{args.nats_port}",
]
return command
def _launch_nats_server(args, clear_store=True):
    # FIXME: Use NatsServer object defined in icp package
    store_dir = "/tmp/nats_store"
@@ -262,6 +312,19 @@ def _launch_nats_server(args, clear_store=True):
    return subprocess.Popen(command, env=env, stdin=subprocess.DEVNULL)
def _launch_etcd(args):
command = [
"/usr/local/bin/etcd",
]
print(" ".join(command))
if args.dry_run:
return
env = os.environ.copy()
return subprocess.Popen(command, env=env, stdin=subprocess.DEVNULL)
if __name__ == "__main__":
    args = parse_args()
    _launch_workers(args)
@@ -72,7 +72,13 @@ def parse_args():
        type=str,
        default="aggregate",
        help="Type of worker",
-        choices=["aggregate", "context", "generate", "disaggregated-serving"],
+        choices=[
+            "aggregate",
+            "context",
+            "generate",
+            "disaggregated-serving",
+            "kv-aware-routing",
+        ],
    )
    parser.add_argument("--gpu-device-id", type=int, default=0, help="gpu id")
@@ -155,4 +161,12 @@ def parse_args():
        help="Enable disaggregated serving",
    )
parser.add_argument(
"--kv-aware-routing",
action=argparse.BooleanOptionalAction,
required=False,
default=False,
help="Enable KV aware routing",
)
    return parser.parse_args()
@@ -14,5 +14,6 @@
# limitations under the License.

from llm.tensorrtllm.operators.disaggregated_serving import DisaggregatedServingOperator
+from llm.tensorrtllm.operators.kv_aware_routing import KvAwareRoutingOperator

-__all__ = ["DisaggregatedServingOperator"]
+__all__ = ["DisaggregatedServingOperator", "KvAwareRoutingOperator"]
@@ -30,12 +30,12 @@ class DisaggregatedServingOperator(TritonCoreOperator):
        self,
        name,
        version,
-        triton_core,
        request_plane,
        data_plane,
        parameters,
        repository,
        logger,
+        triton_core,
    ):
        self._prefill = RemoteOperator("context", request_plane, data_plane)
        self._decode = RemoteOperator("generate", request_plane, data_plane)
...
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import asyncio
import json
import numpy
from triton_distributed_rs import DistributedRuntime, KvRouter
from triton_distributed.runtime import (
RemoteInferenceRequest,
RemoteOperator,
TritonCoreOperator,
)
class KvAwareRoutingOperator(TritonCoreOperator):
def __init__(
self,
name,
version,
request_plane,
data_plane,
parameters,
repository,
logger,
triton_core,
):
loop = asyncio.get_running_loop()
self._runtime = DistributedRuntime(loop)
backend = self._runtime.namespace("router").component("generate")
self._router = KvRouter(self._runtime, backend)
self._generate = RemoteOperator("generate", request_plane, data_plane)
self._repository = repository
self._triton_core = triton_core
self._triton_core.register_model_repository(repository)
self._preprocess_model = self._triton_core.load("simple_preprocessing")
self._postprocess_model = self._triton_core.load("simple_postprocessing")
self._logger = logger
self._store_outputs_in_response = True
async def execute(self, requests: list[RemoteInferenceRequest]):
self._logger.debug("Executing KvAwareRouting Request")
background_tasks = []
for request in requests:
task = asyncio.create_task(self._execute_request(request))
background_tasks.append(task)
try:
results = await asyncio.gather(*background_tasks, return_exceptions=True)
for result in results:
if isinstance(result, Exception):
self._logger.exception(
f"Running request execution failed: {result}"
)
else:
self._logger.debug(
f"Request execution completed with result: {result}"
)
except Exception as e:
self._logger.exception(f"Error during request execution: {e}")
async def _execute_request(self, request: RemoteInferenceRequest):
background_tasks = []
sampling_params = {}
response_sender = request.response_sender()
"""Preprocessing"""
self._logger.debug(request)
if "text_input" in request.inputs:
query = request.inputs["text_input"].to_bytes_array()
elif "prompt" in request.inputs:
query = request.inputs["prompt"].to_bytes_array()
elif "prompt" in request.parameters:
query = request.parameters["prompt"]
else:
await response_sender.send(error=f"invalid request {request}", final=True)
return
if "sampling_params" in request.parameters:
sampling_params = json.loads(
request.parameters["sampling_params"].removeprefix("JSON:")
)
if "max_tokens" in request.inputs:
request_output_len = request.inputs["max_tokens"]
elif "max_tokens" in sampling_params:
request_output_len = numpy.array(
[[sampling_params["max_tokens"]]], dtype=numpy.int32
)
input_ids, input_lengths = await self._preprocess(query)
self._logger.debug(input_ids, input_lengths)
# [FIXME] not rate limiting due to metric polling is not supported
# KV aware routing
lora_id = 0
try:
self._generate.component_id = await self._router.schedule(
input_ids[0], lora_id
)
self._logger.debug(f"worker selected: {self._generate.component_id}")
except Exception as e:
if "No worker found" in str(e):
self._generate.component_id = None
self._logger.debug("no eligible worker")
else:
self._logger.exception(f"Error during selecting worker: {e}")
# [TODO] add disaggregated example
"""llm"""
llm_inputs = {}
llm_inputs["input_ids"] = input_ids
llm_inputs["input_lengths"] = input_lengths
llm_inputs["request_output_len"] = request_output_len
async for llm_response in await self._generate.async_infer(
inputs=llm_inputs,
):
self._logger.debug(f"llm response completed: {llm_response}")
background_tasks.append(
asyncio.create_task(
self._send_llm_response(
llm_response,
response_sender,
final=llm_response.final,
)
)
)
try:
results = await asyncio.gather(*background_tasks, return_exceptions=True)
for result in results:
if isinstance(result, Exception):
self._logger.exception(
f"Sending response failed with exception: {result}"
)
else:
self._logger.debug(f"Response sent successfully: {result}")
except Exception as e:
self._logger.exception(f"Error during response sending: {e}")
for output in llm_response.outputs:
del output
async def _preprocess(self, query):
start_ids = None
start_lengths = None
if isinstance(query, str):
query = [[query]]
async for preprocess_response in self._preprocess_model.async_infer(
inputs={"query": query}
):
self._logger.debug(f"Preprocess response completed: {preprocess_response}")
start_ids = numpy.from_dlpack(preprocess_response.outputs["start_ids"])
start_lengths = numpy.from_dlpack(
preprocess_response.outputs["start_lengths"]
)
return start_ids, start_lengths
async def _postprocessing(self, tokens_batch, sequence_lengths):
outputs = []
async for postprocess_response in self._postprocess_model.async_infer(
inputs={"tokens_batch": tokens_batch, "sequence_lengths": sequence_lengths}
):
self._logger.debug(f"Received postprocess response: {postprocess_response}")
output = postprocess_response.outputs["output"].to_string_array()
outputs.append(output)
return outputs
async def _send_llm_response(self, llm_response, response_sender, final):
tokens_batch = numpy.from_dlpack(llm_response.outputs["output_ids"])
self._logger.debug(f"Output ids length: {tokens_batch}")
sequence_length = numpy.from_dlpack(llm_response.outputs["sequence_length"])
output = await self._postprocessing(tokens_batch, sequence_length)
store_outputs_in_response = set()
if self._store_outputs_in_response:
store_outputs_in_response.add("text_output")
await response_sender.send(
outputs={"text_output": output[0]},
final=final,
store_outputs_in_response=store_outputs_in_response,
)
@@ -22,7 +22,7 @@ dynamic_batching {}
parameters {
  key: "tokenizer_dir"
  value: {
-    string_value: "/workspace/examples/llm/tensorrtllm/operators/hf_downloads/llama-3.1-8b-instruct"
+    string_value: "/workspace/examples/python/llm/tensorrtllm/operators/hf_downloads/llama-3.1-8b-instruct"
  }
}
...
@@ -20,7 +20,7 @@ max_batch_size: 1
parameters {
  key: "tokenizer_dir"
  value: {
-    string_value: "/workspace/examples/llm/tensorrtllm/operators/hf_downloads/llama-3.1-8b-instruct"
+    string_value: "/workspace/examples/python/llm/tensorrtllm/operators/hf_downloads/llama-3.1-8b-instruct"
  }
}
...
@@ -25,15 +25,15 @@ KNOWN_MODELS = {
"postprocessing",
"ensemble",
(
-"/workspace/examples/llm/tensorrtllm/operators/triton_core_models/mock",
+"/workspace/examples/python/llm/tensorrtllm/operators/triton_core_models/mock",
"context",
),
(
-"/workspace/examples/llm/tensorrtllm/operators/triton_core_models/mock",
+"/workspace/examples/python/llm/tensorrtllm/operators/triton_core_models/mock",
"generate",
),
(
-"/workspace/examples/llm/tensorrtllm/operators/triton_core_models/mock",
+"/workspace/examples/python/llm/tensorrtllm/operators/triton_core_models/mock",
"tensorrt_llm",
),
],
@@ -94,7 +94,7 @@ KNOWN_MODELS = {
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
-"enable_kv_cache_reuse": "False",
+"enable_kv_cache_reuse": "True",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
@@ -118,6 +118,8 @@ KNOWN_MODELS = {
"float16",
"--paged_kv_cache",
"enable",
+"--use_paged_context_fmha",
+"enable",
],
"max_num_tokens": 16384,
"max_batch_size": 64,
@@ -138,7 +140,7 @@ KNOWN_MODELS = {
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
-"enable_kv_cache_reuse": "False",
+"enable_kv_cache_reuse": "True",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
@@ -174,6 +176,8 @@ KNOWN_MODELS = {
"enable",
"--multiple_profiles",
"enable",
+"--use_paged_context_fmha",
+"enable",
],
"max_num_tokens": 256,
"templates": [
@@ -189,7 +193,7 @@ KNOWN_MODELS = {
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
-"enable_kv_cache_reuse": "False",
+"enable_kv_cache_reuse": "True",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
@@ -225,6 +229,8 @@ KNOWN_MODELS = {
"enable",
"--multiple_profiles",
"enable",
+"--use_paged_context_fmha",
+"enable",
],
"max_num_tokens": 8192,
"templates": [
@@ -240,7 +246,7 @@ KNOWN_MODELS = {
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
-"enable_kv_cache_reuse": "False",
+"enable_kv_cache_reuse": "True",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
@@ -275,6 +281,8 @@ KNOWN_MODELS = {
"enable",
"--reduce_fusion",
"{args.reduce_fusion}",
+"--use_paged_context_fmha",
+"enable",
],
"max_num_tokens": 16384,
"max_batch_size": 512,
@@ -295,7 +303,7 @@ KNOWN_MODELS = {
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
-"enable_kv_cache_reuse": "False",
+"enable_kv_cache_reuse": "True",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
@@ -319,6 +327,8 @@ KNOWN_MODELS = {
"float16",
"--paged_kv_cache",
"enable",
+"--use_paged_context_fmha",
+"enable",
],
"max_batch_size": 64,
"templates": [
@@ -338,7 +348,7 @@ KNOWN_MODELS = {
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
-"enable_kv_cache_reuse": "False",
+"enable_kv_cache_reuse": "True",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
@@ -376,6 +386,8 @@ KNOWN_MODELS = {
"{args.reduce_fusion}",
"--multiple_profiles",
"enable",
+"--use_paged_context_fmha",
+"enable",
],
"max_num_tokens": 8192,
"templates": [
@@ -391,7 +403,7 @@ KNOWN_MODELS = {
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
-"enable_kv_cache_reuse": "False",
+"enable_kv_cache_reuse": "True",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
@@ -429,6 +441,8 @@ KNOWN_MODELS = {
"{args.reduce_fusion}",
"--multiple_profiles",
"enable",
+"--use_paged_context_fmha",
+"enable",
],
"max_num_tokens": 128,
"templates": [
@@ -444,7 +458,7 @@ KNOWN_MODELS = {
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
-"enable_kv_cache_reuse": "False",
+"enable_kv_cache_reuse": "True",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
@@ -480,6 +494,8 @@ KNOWN_MODELS = {
"{args.reduce_fusion}",
"--multiple_profiles",
"enable",
+"--use_paged_context_fmha",
+"enable",
],
"max_num_tokens": 16384,
"templates": [
@@ -497,7 +513,7 @@ KNOWN_MODELS = {
"max_beam_width": "1",
"engine_dir": "{args.tensorrtllm_engine}",
"exclude_input_in_output": "True",
-"enable_kv_cache_reuse": "False",
+"enable_kv_cache_reuse": "True",
"batching_strategy": "inflight_fused_batching",
"max_queue_delay_microseconds": "0",
"max_queue_size": "0",
...
@@ -23,7 +23,7 @@ from gpu_info import get_gpu_product_name
from huggingface_hub import snapshot_download
from known_models import KNOWN_MODELS

-TARGET_DIR = "/workspace/examples/llm/tensorrtllm/operators"
+TARGET_DIR = "/workspace/examples/python/llm/tensorrtllm/operators"
TENSORRTLLM_EXAMPLE_DIR = "/tensorrtllm_backend/tensorrt_llm/examples"
...
@@ -157,7 +157,87 @@ For disaggregated deployment, you will also need to pass the `kv_ip` and `kv_por
'{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":<rank>,"kv_parallel_size":2,"kv_ip":<master_node_ip>,"kv_port":<kv_port>}'
```

-### 4. Known Issues and Limitations
+### 4. KV Router Deployment
The KV Router is a component that aggregates KV Events from all the workers and maintains a prefix tree of the cached tokens. It makes decisions on which worker to route requests to based on the length of the prefix match and the load on the workers.
You can run the router and workers in separate terminal sessions or use the `kv-router-run.sh` script to launch them all at once in their own tmux sessions.
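
Conceptually, the `prefix` strategy walks a prefix tree keyed by KV block hashes and scores each worker by matched prefix length against its current load. The sketch below is only a mental model of that decision, not the actual Rust `KvRouter` implementation; `ToyKvRouter`, `on_stored`, and `best_worker` are hypothetical names:

```python
# Illustrative sketch of prefix-match routing; NOT the real KvRouter.
from collections import defaultdict


class RadixNode:
    def __init__(self):
        self.children: dict[int, "RadixNode"] = {}
        self.workers: set[str] = set()  # workers that hold this cached block


class ToyKvRouter:
    def __init__(self, workers: list[str]):
        self.root = RadixNode()
        self.load = {w: 0 for w in workers}  # e.g. in-flight requests per worker

    def on_stored(self, worker: str, block_hashes: list[int]) -> None:
        """Apply a 'stored' KV event: extend the tree along the block chain."""
        node = self.root
        for h in block_hashes:
            node = node.children.setdefault(h, RadixNode())
            node.workers.add(worker)

    def best_worker(self, block_hashes: list[int]) -> str:
        """Score each worker = matched prefix length minus current load."""
        matched: dict[str, int] = defaultdict(int)
        node = self.root
        for depth, h in enumerate(block_hashes, start=1):
            if h not in node.children:
                break
            node = node.children[h]
            for w in node.workers:
                matched[w] = depth
        return max(self.load, key=lambda w: matched[w] - self.load[w])


router = ToyKvRouter(["worker-0", "worker-1"])
router.on_stored("worker-0", [101, 202, 303])
print(router.best_worker([101, 202, 999]))  # -> worker-0 (2 matched blocks)
```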
#### Deploying using tmux
The helper script `kv-router-run.sh` will launch the router and workers in their own tmux sessions.
`kv-router-run.sh <number_of_workers> <routing_strategy> [model_name]`
Example:
```bash
# Launch 8 workers with prefix routing strategy and use deepseek-ai/DeepSeek-R1-Distill-Llama-8B as the model
/workspace/examples/python_rs/llm/vllm/kv-router-run.sh 8 prefix deepseek-ai/DeepSeek-R1-Distill-Llama-8B
# List tmux sessions
tmux ls
# Attach to the tmux sessions
tmux a -t v-1 # worker 1 - use ctrl + b, then d to detach
tmux a -t v-router # kv router - use ctrl + b, then d to detach
# Close the tmux sessions
tmux ls | grep 'v-' | cut -d: -f1 | xargs -I{} tmux kill-session -t {}
```
#### Deploying using separate terminals
**Terminal 1 - Router:**
```bash
# Activate virtual environment
source /opt/triton/venv/bin/activate
# Launch the KV router
cd /workspace/examples/python_rs/llm/vllm
RUST_LOG=info python3 -m kv_router.router \
--routing-strategy prefix
```
You can choose between different routing strategies (a small dispatch sketch follows the list):
- `prefix`: Route requests to the worker that has the longest prefix match.
- `round_robin`: Route requests to the worker in a round-robin manner.
- `random`: Route requests to a random worker.
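
For intuition, the strategies could be dispatched roughly as below; this is a hedged sketch, and `pick_worker`/`prefix_scores` are hypothetical names, not the actual `kv_router.router` API:

```python
import itertools
import random

_rr = itertools.count()  # shared round-robin counter


def pick_worker(strategy: str, workers: list[str], prefix_scores: dict[str, int]) -> str:
    """prefix_scores: matched-prefix length per worker, maintained from KV events."""
    if strategy == "prefix":
        # longest prefix match wins (ties broken arbitrarily)
        return max(workers, key=lambda w: prefix_scores.get(w, 0))
    if strategy == "round_robin":
        return workers[next(_rr) % len(workers)]
    if strategy == "random":
        return random.choice(workers)
    raise ValueError(f"unknown routing strategy: {strategy}")
```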
**Terminal 2 and 3 - Workers:**
```bash
# Activate virtual environment
source /opt/triton/venv/bin/activate
# Launch Worker 1 and Worker 2 with the same command
cd /workspace/examples/python_rs/llm/vllm
RUST_LOG=info python3 -m kv_router.worker \
--model deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--tokenizer deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--enable-prefix-caching \
--block-size 64 \
--max-model-len 16384
```
Note: Prefix caching must be enabled for the KV Router to work.

Note: `--block-size` must be 64, otherwise the router won't work (it currently only accepts 64-token blocks).
**Terminal 4 - Client:**
```bash
# Activate virtual environment
source /opt/triton/venv/bin/activate
# Run client
# We use a long prompt to populate a few KV Blocks (64 tokens each)
# Try running it a few times to see where the router is sending the request
cd /workspace/examples/python_rs/llm/vllm
python3 -m common.client \
--prompt "In the heart of Eldoria, an ancient land of boundless magic and mysterious creatures, lies the long-forgotten city of Aeloria. Once a beacon of knowledge and power, Aeloria was buried beneath the shifting sands of time, lost to the world for centuries. You are an intrepid explorer, known for your unparalleled curiosity and courage, who has stumbled upon an ancient map hinting at ests that Aeloria holds a secret so profound that it has the potential to reshape the very fabric of reality. Your journey will take you through treacherous deserts, enchanted forests, and across perilous mountain ranges. Your Task: Character Background: Develop a detailed background for your character. Describe their motivations for seeking out Aeloria, their skills and weaknesses, and any personal connections to the ancient city or its legends. Are they driven by a quest for knowledge, a search for lost familt clue is hidden." \
--component preprocess \
--max-tokens 10 \
--temperature 0.5
```
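
To see why this prompt exercises the router, you can estimate how many full 64-token KV blocks it occupies; this is a back-of-the-envelope sketch, and using the model's Hugging Face tokenizer here is an assumption:

```python
# Rough estimate of the KV blocks a prompt occupies; illustrative only.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Llama-8B")
prompt = "In the heart of Eldoria, an ancient land of boundless magic..."  # the long prompt above
n_tokens = len(tok.encode(prompt))
block_size = 64  # must match --block-size passed to the workers
print(f"{n_tokens} tokens -> {n_tokens // block_size} full KV blocks")
```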
### 5. Known Issues and Limitations
- vLLM does not work well with the `fork` multiprocessing method when TP > 1. This is a known issue; the workaround is to use the `spawn` method instead. See [vLLM issue](https://github.com/vllm-project/vllm/issues/6152).
- The `kv_rank` of a `kv_producer` must be smaller than that of a `kv_consumer`.
...
@@ -25,20 +25,23 @@ from .protocol import Request
@triton_worker()
async def worker(
-    runtime: DistributedRuntime, prompt: str, max_tokens: int, temperature: float
+    runtime: DistributedRuntime,
+    component: str,
+    prompt: str,
+    max_tokens: int,
+    temperature: float,
):
    """
    Instantiate a `backend` client and call the `generate` endpoint
    """
    # get endpoint
-    endpoint = runtime.namespace("triton-init").component("vllm").endpoint("generate")
+    endpoint = (
+        runtime.namespace("triton-init").component(component).endpoint("generate")
+    )

    # create client
    client = await endpoint.client()

+    # list the endpoints
+    print(client.endpoint_ids())
+
    # issue request
    tasks = []
    for _ in range(1):
@@ -66,9 +69,10 @@ if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--prompt", type=str, default="what is the capital of france?")
+    parser.add_argument("--component", type=str, default="vllm")
    parser.add_argument("--max-tokens", type=int, default=10)
    parser.add_argument("--temperature", type=float, default=0.5)

    args = parser.parse_args()
-    asyncio.run(worker(args.prompt, args.max_tokens, args.temperature))
+    asyncio.run(worker(args.component, args.prompt, args.max_tokens, args.temperature))
@@ -22,6 +22,14 @@ class Request(BaseModel):
    sampling_params: dict
class Tokens(BaseModel):
tokens: list[int]
class TokenizedRequest(Request, Tokens):
pass
class PrefillRequest(Request):
    request_id: str
...
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#!/bin/bash
if [ $# -lt 2 ]; then
echo "Usage: $0 <number_of_workers> <routing_strategy> [model_name]"
echo "Error: Must specify at least number of workers and routing strategy"
exit 1
fi
NUM_WORKERS=$1
ROUTING_STRATEGY=$2
MODEL_NAME=${3:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"}
VALID_STRATEGIES=("prefix" "round_robin" "random")
if [[ ! " ${VALID_STRATEGIES[@]} " =~ " ${ROUTING_STRATEGY} " ]]; then
echo "Error: Invalid routing strategy. Must be one of: ${VALID_STRATEGIES[*]}"
exit 1
fi
SESSION_NAME="v"
WORKDIR="/workspace/examples/python_rs/llm/vllm"
INIT_CMD="source /opt/triton/venv/bin/activate && cd $WORKDIR"
ROUTER_CMD="RUST_LOG=info python3 -m kv_router.router \
--routing-strategy $ROUTING_STRATEGY \
--min-workers $NUM_WORKERS "
tmux new-session -d -s "$SESSION_NAME-router"
tmux send-keys -t "$SESSION_NAME-router" "$INIT_CMD && $ROUTER_CMD" C-m
WORKER_CMD="RUST_LOG=info python3 -m kv_router.worker \
--model $MODEL_NAME \
--tokenizer $MODEL_NAME \
--enable-prefix-caching \
--block-size 64 \
--max-model-len 16384 "
for i in $(seq 1 $NUM_WORKERS); do
tmux new-session -d -s "$SESSION_NAME-$i"
done
for i in $(seq 1 $NUM_WORKERS); do
tmux send-keys -t "$SESSION_NAME-$i" "$INIT_CMD && CUDA_VISIBLE_DEVICES=$((i-1)) $WORKER_CMD" C-m
done