fix: Update TRTLLM version and fix disagg workflow (#804)

197105eb · Tanmay Verma · GitHub · d757604c · 197105eb · 197105eb
Unverified Commit 197105eb authored Apr 23, 2025 by Tanmay Verma Committed by GitHub Apr 23, 2025
6 changed files
--- a/container/Dockerfile.tensorrt_llm
+++ b/container/Dockerfile.tensorrt_llm
@@ -14,7 +14,7 @@
 # limitations under the License.

 ARG BASE_IMAGE="tensorrt_llm/release"
-ARG BASE_IMAGE_TAG="latest"
+ARG BASE_IMAGE_TAG="latest_squashed"
 ARG MANYLINUX_IMAGE="quay.io/pypa/manylinux_2_28_x86_64"
 ARG RELEASE_BUILD

@@ -214,6 +214,8 @@ RUN pip install dist/ai_dynamo_runtime*cp312*.whl  && \
 ENV DYNAMO_KV_CAPI_PATH="/opt/dynamo/bindings/lib/libdynamo_llm_capi.so"
 ENV DYNAMO_HOME=/workspace

+# Use UCX for TRTLLM KV Cache Transfer
+ENV TRTLLM_USE_UCX_KVCACHE=1
 # Needed to use NVLink for TRTLLM KV Cache Transfer
 # https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/advanced/disaggregated-service.md
 ENV UCX_CUDA_COPY_ASYNC_MEM_TYPE=cuda

--- a/container/build.sh
+++ b/container/build.sh
@@ -53,7 +53,7 @@ BUILD_CONTEXT=$(dirname "$(readlink -f "$SOURCE_DIR")")

 # Base Images
 TENSORRTLLM_BASE_IMAGE=tensorrt_llm/release
-TENSORRTLLM_BASE_IMAGE_TAG=latest
+TENSORRTLLM_BASE_IMAGE_TAG=latest_squashed
 TENSORRTLLM_PIP_WHEEL_PATH=""

 VLLM_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"

--- a/container/build_trtllm_base_image.sh
+++ b/container/build_trtllm_base_image.sh
@@ -17,7 +17,7 @@
 # Build the TRT-LLM base image.

 # This script builds the TRT-LLM base image for Dynamo with TensorRT-LLM.
-TRTLLM_COMMIT=0d4d50a745
+TRTLLM_COMMIT=dfbcb543

 while getopts "c:" opt; do
  case ${opt} in
@@ -26,6 +26,11 @@ while getopts "c:" opt; do
  esac
 done

+python3 -m venv /tmp/squash-env
+
+source /tmp/squash-env/bin/activate
+pip3 install docker-squash
+
 (cd /tmp && \
 # Clone the TensorRT-LLM repository.
 if [ ! -d "TensorRT-LLM" ]; then
@@ -46,3 +51,8 @@ git lfs pull

 # Build the TRT-LLM base image.
 make -C docker release_build)
+
+pip3 install docker-squash
+docker-squash -t tensorrt_llm/release:latest_squashed tensorrt_llm/release:latest
+
+deactivate
\ No newline at end of file
--- a/examples/tensorrt_llm/README.md
+++ b/examples/tensorrt_llm/README.md
@@ -52,7 +52,10 @@ Use the helper script to build a TensorRT-LLM container base image. The script u
 ```bash
 # TensorRT-LLM uses git-lfs, which needs to be installed in advance.
 apt-get update && apt-get -y install git git-lfs
-git lfs install
+
+# The script uses python packages like docker-squash to squash image
+# layers within trtllm base image
+DEBIAN_FRONTEND=noninteractive TZ=America/Los_Angeles apt-get -y install python3 python3-pip python3-venv

 ./container/build_trtllm_base_image.sh
 ```
@@ -68,6 +71,7 @@ If you already have a TensorRT-LLM container image, you can skip this step.

 This build script internally points to the base container image built with step 1. If you skipped previous step because you already have the container image available, you can run the build script with that image as a base.

+
 ```bash
 # Build dynamo image with other TRTLLM base image.
 ./container/build.sh --framework TENSORRTLLM --base-image <trtllm-base-image> --base-image-tag <trtllm-base-image-tag>
@@ -120,7 +124,7 @@ dynamo serve graphs.agg_router:Frontend -f ./configs/agg_router.yaml
 #### Disaggregated serving
 ```bash
 cd /workspace/examples/tensorrt_llm
-TRTLLM_USE_UCX_KVCACHE=1 dynamo serve graphs.disagg:Frontend -f ./configs/disagg.yaml
+dynamo serve graphs.disagg:Frontend -f ./configs/disagg.yaml
 ```

 We are defining TRTLLM_USE_UCX_KVCACHE so that TRTLLM uses UCX for transferring the KV
@@ -129,7 +133,7 @@ cache between the context and generation workers.
 #### Disaggregated serving with KV Routing
 ```bash
 cd /workspace/examples/tensorrt_llm
-TRTLLM_USE_UCX_KVCACHE=1 dynamo serve graphs.disagg_router:Frontend -f ./configs/disagg_router.yaml
+dynamo serve graphs.disagg_router:Frontend -f ./configs/disagg_router.yaml
 ```

 We are defining TRTLLM_USE_UCX_KVCACHE so that TRTLLM uses UCX for transferring the KV

--- a/examples/tensorrt_llm/common/base_engine.py
+++ b/examples/tensorrt_llm/common/base_engine.py
@@ -102,7 +102,7 @@ class BaseTensorrtLLMEngine:
        self._error_queue: Queue = Queue()
        self._kv_metrics_publisher = None

-        if self._remote_prefill:
+        if self._remote_prefill or self._server_type == ServerType.CTX:
            self._min_workers = min_workers
            if disagg_config_file is None or not os.path.exists(disagg_config_file):
                raise ValueError(

--- a/examples/tensorrt_llm/common/chat_processor.py
+++ b/examples/tensorrt_llm/common/chat_processor.py
@@ -189,11 +189,10 @@ class ChatProcessor(BaseChatProcessor):
                finish_reason=None,
            )
            if response.outputs[0].disaggregated_params is not None:
-                choice.disaggregated_params = (
-                    DisaggregatedTypeConverter.to_oai_disaggregated_params(
-                        response.outputs[0].disaggregated_params
-                    )
-                )
+                # Do not include the disaggregated params in response
+                # from Processor.
+                pass
+
            chunk = DynamoTRTLLMChatCompletionStreamResponse(
                id=request_id,
                choices=[choice],
@@ -271,11 +270,9 @@ class ChatProcessor(BaseChatProcessor):
                choice.stop_reason = output.stop_reason
                finish_reason_sent[i] = True
            if output.disaggregated_params is not None:
-                choice.disaggregated_params = (
-                    DisaggregatedTypeConverter.to_oai_disaggregated_params(
-                        output.disaggregated_params
-                    )
-                )
+                # Block the disaggregated params at processor level
+                pass
+
            chunk = DynamoTRTLLMChatCompletionStreamResponse(
                id=request_id,
                choices=[choice],