Unverified Commit 86ce03f4 authored by Richard Huo's avatar Richard Huo Committed by GitHub
Browse files

fix: Update the KVBM <> TRT-LLM integration interface to match the latest...


fix: Update the KVBM <> TRT-LLM integration interface to match the latest TRT-LLM connector API (#2979)
Signed-off-by: default avatarrichardhuo-nv <rihuo@nvidia.com>
parent cd814377
...@@ -303,3 +303,9 @@ sampling_params.logits_processor = create_trtllm_adapters(processors) ...@@ -303,3 +303,9 @@ sampling_params.logits_processor = create_trtllm_adapters(processors)
## Performance Sweep ## Performance Sweep
For detailed instructions on running comprehensive performance sweeps across both aggregated and disaggregated serving configurations, see the [TensorRT-LLM Benchmark Scripts for DeepSeek R1 model](./performance_sweeps/README.md). This guide covers recommended benchmarking setups, usage of provided scripts, and best practices for evaluating system performance. For detailed instructions on running comprehensive performance sweeps across both aggregated and disaggregated serving configurations, see the [TensorRT-LLM Benchmark Scripts for DeepSeek R1 model](./performance_sweeps/README.md). This guide covers recommended benchmarking setups, usage of provided scripts, and best practices for evaluating system performance.
## Dynamo KV Block Manager Integration
Dynamo with TensorRT-LLM currently supports integration with the Dynamo KV Block Manager. This integration can significantly reduce time-to-first-token (TTFT) latency, particularly in usage patterns such as multi-turn conversations and repeated long-context requests.
Here is the instruction: [Running KVBM in TensorRT-LLM](./../../../docs/guides/run_kvbm_in_trtllm.md) .
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
ARG BASE_IMAGE="nvcr.io/nvidia/pytorch" ARG BASE_IMAGE="nvcr.io/nvidia/pytorch"
ARG BASE_IMAGE_TAG="25.06-py3" ARG BASE_IMAGE_TAG="25.06-py3"
ARG RELEASE_BUILD ARG RELEASE_BUILD
ARG ENABLE_KVBM=false
ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda" ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda"
ARG RUNTIME_IMAGE_TAG="12.9.1-runtime-ubuntu24.04" ARG RUNTIME_IMAGE_TAG="12.9.1-runtime-ubuntu24.04"
...@@ -234,6 +235,7 @@ ARG ARCH_ALT ...@@ -234,6 +235,7 @@ ARG ARCH_ALT
FROM quay.io/pypa/manylinux_2_28_${ARCH_ALT} AS wheel_builder FROM quay.io/pypa/manylinux_2_28_${ARCH_ALT} AS wheel_builder
ARG RELEASE_BUILD ARG RELEASE_BUILD
ARG CARGO_BUILD_JOBS ARG CARGO_BUILD_JOBS
ARG ENABLE_KVBM
# Set CARGO_BUILD_JOBS to 16 if not provided # Set CARGO_BUILD_JOBS to 16 if not provided
# This is to prevent cargo from building $(nproc) jobs in parallel, # This is to prevent cargo from building $(nproc) jobs in parallel,
# which might exceed the number of opened files limit. # which might exceed the number of opened files limit.
...@@ -279,16 +281,21 @@ COPY launch /workspace/launch ...@@ -279,16 +281,21 @@ COPY launch /workspace/launch
RUN cargo build \ RUN cargo build \
--release \ --release \
--locked \ --locked \
--features dynamo-llm/block-manager \ --features block-manager \
--workspace --workspace
# Build dynamo wheels # Build dynamo wheels
RUN uv build --wheel --out-dir /workspace/dist && \ RUN uv build --wheel --out-dir /workspace/dist && \
cd /workspace/lib/bindings/python && \ cd /workspace/lib/bindings/python && \
uv build --wheel --out-dir /workspace/dist --python 3.12 && \ uv pip install maturin[patchelf] && \
if [ "$ENABLE_KVBM" = "true" ]; then \
maturin build --release --features block-manager --out /workspace/dist; \
else \
maturin build --release --out /workspace/dist; \
fi && \
if [ "$RELEASE_BUILD" = "true" ]; then \ if [ "$RELEASE_BUILD" = "true" ]; then \
uv build --wheel --out-dir /workspace/dist --python 3.11 && \ uv run --python 3.11 maturin build --release --out /workspace/dist && \
uv build --wheel --out-dir /workspace/dist --python 3.10; \ uv run --python 3.10 maturin build --release --out /workspace/dist; \
fi fi
######################################## ########################################
......
...@@ -38,8 +38,12 @@ To use KVBM in TensorRT-LLM, you can follow the steps below: ...@@ -38,8 +38,12 @@ To use KVBM in TensorRT-LLM, you can follow the steps below:
# start up etcd for KVBM leader/worker registration and discovery # start up etcd for KVBM leader/worker registration and discovery
docker compose -f deploy/docker-compose.yml up -d docker compose -f deploy/docker-compose.yml up -d
# Build a container that includes TensorRT-LLM and KVBM. Note: KVBM integration is only available in TensorRT-LLM commit ce580ce4f52af3ad0043a800b3f9469e1f1109f6 or newer. # Build a container that includes TensorRT-LLM and KVBM. Note: KVBM integration is only available in TensorRT-LLM commit dcd110cfac07e577ce01343c455917832b0f3d5e or newer.
./container/build.sh --framework trtllm --tensorrtllm-commit ce580ce4f52af3ad0043a800b3f9469e1f1109f6 --enable-kvbm # When building with the --tensorrtllm-commit option, you may notice that https://github.com keeps prompting for a username and password.
# This happens because cloning TensorRT-LLM can hit GitHub’s rate limit.
# To work around this, you can keep pressing "Enter" or "Return.".
# Setting "export GIT_LFS_SKIP_SMUDGE=1" may also reduce the number of prompts.
./container/build.sh --framework trtllm --tensorrtllm-commit dcd110cfac07e577ce01343c455917832b0f3d5e --enable-kvbm
# launch the container # launch the container
./container/run.sh --framework trtllm -it --mount-workspace --use-nixl-gds ./container/run.sh --framework trtllm -it --mount-workspace --use-nixl-gds
......
...@@ -8,8 +8,8 @@ from tensorrt_llm._torch.pyexecutor.kv_cache_connector import ( ...@@ -8,8 +8,8 @@ from tensorrt_llm._torch.pyexecutor.kv_cache_connector import (
KvCacheConnectorScheduler, KvCacheConnectorScheduler,
SchedulerOutput, SchedulerOutput,
) )
from tensorrt_llm.bindings.executor import ExecutorConfig
from tensorrt_llm.bindings.internal.batch_manager import LlmRequest from tensorrt_llm.bindings.internal.batch_manager import LlmRequest
from tensorrt_llm.llmapi.llm_args import TorchLlmArgs
from dynamo.llm import KvbmLeader from dynamo.llm import KvbmLeader
from dynamo.llm.trtllm_integration.rust import KvbmRequest from dynamo.llm.trtllm_integration.rust import KvbmRequest
...@@ -21,21 +21,21 @@ from dynamo.runtime import DistributedRuntime ...@@ -21,21 +21,21 @@ from dynamo.runtime import DistributedRuntime
class DynamoKVBMConnectorLeader(KvCacheConnectorScheduler): class DynamoKVBMConnectorLeader(KvCacheConnectorScheduler):
def __init__(self, executor_config: ExecutorConfig): def __init__(self, llm_args: TorchLlmArgs):
super().__init__(executor_config) super().__init__(llm_args)
self.drt = DistributedRuntime.detached() self.drt = DistributedRuntime.detached()
world_size = self._config.mapping.world_size mappings = self._llm_args.parallel_config.to_mapping()
self.block_size = self._config.tokens_per_block
world_size = mappings.world_size
self.block_size = self._llm_args.kv_cache_config.tokens_per_block
# Set bytes_per_block to 0, because we will retrieve the actual value from the worker side. # Set bytes_per_block to 0, because we will retrieve the actual value from the worker side.
leader = KvbmLeader(world_size, drt=self.drt) leader = KvbmLeader(world_size, drt=self.drt)
print( print(f"KvConnectorLeader initialized with rank: {mappings.rank}")
f"KvConnectorLeader initialized with rank: {executor_config.mapping.rank}"
)
self._connector = RustKvConnectorLeader( self._connector = RustKvConnectorLeader(
executor_config.mapping.rank, self.drt, self.block_size, leader mappings.rank, self.drt, self.block_size, leader
) )
def build_connector_meta(self, scheduler_output: SchedulerOutput) -> bytes: def build_connector_meta(self, scheduler_output: SchedulerOutput) -> bytes:
......
...@@ -4,7 +4,7 @@ ...@@ -4,7 +4,7 @@
import torch import torch
from tensorrt_llm import logger from tensorrt_llm import logger
from tensorrt_llm._torch.pyexecutor.kv_cache_connector import KvCacheConnectorWorker from tensorrt_llm._torch.pyexecutor.kv_cache_connector import KvCacheConnectorWorker
from tensorrt_llm.bindings.executor import ExecutorConfig from tensorrt_llm.llmapi.llm_args import TorchLlmArgs
from dynamo.llm.trtllm_integration.rust import ( from dynamo.llm.trtllm_integration.rust import (
KvConnectorWorker as RustKvConnectorWorker, KvConnectorWorker as RustKvConnectorWorker,
...@@ -13,16 +13,15 @@ from dynamo.runtime import DistributedRuntime ...@@ -13,16 +13,15 @@ from dynamo.runtime import DistributedRuntime
class DynamoKVBMConnectorWorker(KvCacheConnectorWorker): class DynamoKVBMConnectorWorker(KvCacheConnectorWorker):
def __init__(self, executor_config: ExecutorConfig): def __init__(self, llm_args: TorchLlmArgs):
super().__init__(executor_config) super().__init__(llm_args)
self.drt = DistributedRuntime.detached() self.drt = DistributedRuntime.detached()
self.rank = executor_config.mapping.rank mappings = self._llm_args.parallel_config.to_mapping()
self.rank = mappings.rank
self._connector = RustKvConnectorWorker( self._connector = RustKvConnectorWorker(self.drt, str(self.rank))
self.drt, str(executor_config.mapping.rank)
)
def register_kv_caches(self, kv_cache_tensor: torch.Tensor): def register_kv_caches(self, kv_cache_tensor: torch.Tensor):
""" """
...@@ -33,11 +32,11 @@ class DynamoKVBMConnectorWorker(KvCacheConnectorWorker): ...@@ -33,11 +32,11 @@ class DynamoKVBMConnectorWorker(KvCacheConnectorWorker):
""" """
print(f"Register KV Caches on rank {self.rank}") print(f"Register KV Caches on rank {self.rank}")
logger.info( logger.info(
f"KvConnectorWorker started registering the kv caches on rank {self._config.mapping.rank}" f"KvConnectorWorker started registering the kv caches on rank {self.rank}"
) )
num_device_blocks = kv_cache_tensor.shape[0] num_device_blocks = kv_cache_tensor.shape[0]
page_size = self._config.tokens_per_block page_size = self._llm_args.kv_cache_config.tokens_per_block
device_id = kv_cache_tensor.device.index device_id = kv_cache_tensor.device.index
kv_cache_dtype = kv_cache_tensor.dtype kv_cache_dtype = kv_cache_tensor.dtype
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment