chore: Update TRTLLM version. Fix router. (#527)

Signed-off-by: Tanmay Verma <tanmay2592@gmail.com>
Co-authored-by: Dmitry Tokarev <dtokarev@nvidia.com>

chore: Update TRTLLM version. Fix router. (#527)
Signed-off-by: Tanmay Verma <tanmay2592@gmail.com>
Co-authored-by: Dmitry Tokarev <dtokarev@nvidia.com>
7dca64df · Tanmay Verma · GitHub · 1dd20902 · 7dca64df · 7dca64df
Commit 7dca64df authored Apr 07, 2025 by Tanmay Verma Committed by GitHub Apr 08, 2025
7 changed files
--- a/container/Dockerfile.tensorrt_llm
+++ b/container/Dockerfile.tensorrt_llm
@@ -13,8 +13,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-ARG BASE_IMAGE="gitlab-master.nvidia.com:5005/dl/dgx/tritonserver/tensorrt-llm/amd64"
+ARG BASE_IMAGE="tensorrt_llm/release"
-ARG BASE_IMAGE_TAG="krish-fix-trtllm-build.23766174"
+ARG BASE_IMAGE_TAG="latest"
 FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS dev
@@ -23,7 +23,8 @@ USER root
 # Install utilities
 RUN apt update -y && apt install -y git wget curl nvtop tmux vim
 # nats
-RUN wget https://github.com/nats-io/nats-server/releases/download/v2.10.24/nats-server-v2.10.24-amd64.deb && dpkg -i nats-server-v2.10.24-amd64.deb && rm nats-server-v2.10.24-amd64.deb
+RUN wget --tries=3 --waitretry=5 https://github.com/nats-io/nats-server/releases/download/v2.10.24/nats-server-v2.10.24-amd64.deb && \
+    dpkg -i nats-server-v2.10.24-amd64.deb && rm nats-server-v2.10.24-amd64.deb
 # etcd
 ENV ETCD_VERSION="v3.5.18"
 RUN wget https://github.com/etcd-io/etcd/releases/download/$ETCD_VERSION/etcd-$ETCD_VERSION-linux-amd64.tar.gz -O /tmp/etcd.tar.gz && \

--- a/container/build_trtllm_base_image.sh
+++ b/container/build_trtllm_base_image.sh
@@ -17,7 +17,7 @@
 # Build the TRT-LLM base image.
 # This script builds the TRT-LLM base image for Dynamo with TensorRT-LLM.
-TRTLLM_COMMIT=9b931c0f6
+TRTLLM_COMMIT=0d4d50a745
 while getopts "c:" opt; do
  case ${opt} in
@@ -29,12 +29,15 @@ done
 (cd /tmp && \
 # Clone the TensorRT-LLM repository.
 if [ ! -d "TensorRT-LLM" ]; then
-  git clone https://github.com/NVIDIA/TensorRT-LLM.git
+  git clone --single-branch --branch main https://github.com/NVIDIA/TensorRT-LLM.git
 fi
 cd TensorRT-LLM
 # Checkout the specified commit.
+# Switch to the main branch to pull the latest changes.
+git checkout main
+git pull
 git checkout $TRTLLM_COMMIT
 # Update the submodules.

--- a/examples/tensorrt_llm/common/protocol.py
+++ b/examples/tensorrt_llm/common/protocol.py
@@ -44,6 +44,7 @@ class DynamoTRTLLMCompletionRequest(CompletionRequest):
 class DynamoTRTLLMChatCompletionRequest(ChatCompletionRequest):
    id: str = Field(default_factory=lambda: f"chatcmpl-{str(uuid.uuid4().hex)}")
    max_completion_tokens: Optional[int] = None
+    max_tokens: Optional[int] = None
    disaggregated_params: Optional[DisaggregatedParams] = Field(default=None)

--- a/examples/tensorrt_llm/components/agg_worker.py
+++ b/examples/tensorrt_llm/components/agg_worker.py
@@ -12,6 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import asyncio
 import signal
 from dataclasses import asdict
@@ -74,11 +75,13 @@ class TensorRTLLMWorker(BaseTensorrtLLMEngine):
    @async_on_start
    async def async_init(self):
        super().__init__(self.trtllm_engine_args)
+        task = asyncio.create_task(self.create_metrics_publisher_endpoint())
+        task.add_done_callback(lambda _: print("metrics publisher endpoint created"))
        print("TensorRT-LLM Worker initialized")
    async def create_metrics_publisher_endpoint(self):
        component = dynamo_context["component"]
-        await self.metrics_publisher.create_endpoint(component)
+        await self.trtllm_engine_args.kv_metrics_publisher.create_endpoint(component)
    @dynamo_endpoint()
    async def generate(self, request: TRTLLMWorkerRequest):

--- a/examples/tensorrt_llm/components/processor.py
+++ b/examples/tensorrt_llm/components/processor.py
@@ -129,6 +129,16 @@ class Processor(ChatProcessorMixin):
    @dynamo_endpoint(name="chat/completions")
    async def generate_chat(self, raw_request: DynamoTRTLLMChatCompletionRequest):
+        # max_tokens is deprecated; however, if max_tokens is provided instead
+        # of max_completion_tokens, its value is used as max_completion_tokens.
+        if raw_request.max_tokens is not None:
+            if raw_request.max_completion_tokens is None:
+                raw_request.max_completion_tokens = raw_request.max_tokens
+            else:
+                if raw_request.max_tokens != raw_request.max_completion_tokens:
+                    raise ValueError(
+                        "max_tokens and max_completion_tokens must be the same"
+                    )
        async for response in self._generate(raw_request, RequestType.CHAT):
            yield response

--- a/examples/tensorrt_llm/configs/agg_router.yaml
+++ b/examples/tensorrt_llm/configs/agg_router.yaml
@@ -23,6 +23,10 @@ Processor:
  block-size: 64
  router: kv
+Router:
+  model-name: deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+  min-workers: 1
 TensorRTLLMWorker:
  engine_args: "configs/llm_api_config.yaml"
  router: kv

--- a/examples/tensorrt_llm/engines/agg_engine.py
+++ b/examples/tensorrt_llm/engines/agg_engine.py
@@ -96,8 +96,6 @@ def init_global_engine(args, engine_config):
    DynamoTRTLLMChatCompletionRequest, DynamoTRTLLMChatCompletionStreamResponse
 )
 async def generate(request):
-    if request.max_completion_tokens is not None:
-        request.max_tokens = request.max_completion_tokens
    async for response in chat_generator(engine, request):
        yield response