feat: Trtllm canary health check (#3082)

Signed-off-by: tzulingk@nvidia.com <tzulingk@nvidia.com>
Signed-off-by: Tzu-Ling Kan <tzulingk@nvidia.com>

feat: Trtllm canary health check (#3082)
Signed-off-by: tzulingk@nvidia.com <tzulingk@nvidia.com>
Signed-off-by: Tzu-Ling Kan <tzulingk@nvidia.com>
67ff181d · Tzu-Ling Kan · GitHub · f88d7dc7 · 67ff181d · 67ff181d
Unverified commit 67ff181d, authored Sep 17, 2025 by Tzu-Ling Kan; committed by GitHub on Sep 17, 2025.
2 changed files
--- a/components/backends/trtllm/src/dynamo/trtllm/health_check.py
+++ b/components/backends/trtllm/src/dynamo/trtllm/health_check.py
@@ -21,11 +21,27 @@ class TrtllmHealthCheckPayload(HealthCheckPayload):
        """
        Initialize TRT-LLM health check payload with TRT-LLM-specific defaults.
        """
-        # Set TRT-LLM default payload - minimal request that completes quickly
+        # Set TensorRT-LLM default payload - minimal request that completes quickly
+        # The handler expects token_ids, stop_conditions, and sampling_options
        self.default_payload = {
-            "messages": [{"role": "user", "content": "1"}],
+            "token_ids": [1],  # Single token for minimal processing
-            "max_tokens": 1,
+            "stop_conditions": {
-            "temperature": 0.0,
+                "max_tokens": 1,  # Generate only 1 token
-            "stream": False,
+                "stop": None,
+                "stop_token_ids": None,
+                "include_stop_str_in_output": False,
+                "ignore_eos": False,
+                "min_tokens": 0,
+            },
+            "sampling_options": {
+                "temperature": 0.0,
+                "top_p": 1.0,
+                "top_k": 1,
+                "beam_width": 1,
+                "repetition_penalty": 1.0,
+                "presence_penalty": 0.0,
+                "frequency_penalty": 0.0,
+                "seed": None,
+            },
        }
        super().__init__()
--- a/components/backends/trtllm/src/dynamo/trtllm/main.py
+++ b/components/backends/trtllm/src/dynamo/trtllm/main.py
@@ -27,6 +27,7 @@ from dynamo.llm import ModelInput, ModelRuntimeConfig, ModelType, register_llm
 from dynamo.runtime import DistributedRuntime, dynamo_worker
 from dynamo.runtime.logging import configure_dynamo_logging
 from dynamo.trtllm.engine import TensorRTLLMEngine, get_llm_engine
+from dynamo.trtllm.health_check import TrtllmHealthCheckPayload
 from dynamo.trtllm.multimodal_processor import MultimodalRequestProcessor
 from dynamo.trtllm.publisher import get_publisher
 from dynamo.trtllm.request_handlers.handlers import (
@@ -316,6 +317,9 @@ async def init(runtime: DistributedRuntime, config: Config):
                runtime_config=runtime_config,
            )
+        # Get health check payload (checks env var and falls back to TensorRT-LLM default)
+        health_check_payload = TrtllmHealthCheckPayload().to_dict()
        if config.publish_events_and_metrics and is_first_worker(config):
            # Initialize and pass in the publisher to the request handler to
            # publish events and metrics.
@@ -334,11 +338,15 @@ async def init(runtime: DistributedRuntime, config: Config):
                handler_config.publisher = publisher
                handler = RequestHandlerFactory().get_request_handler(handler_config)
                await endpoint.serve_endpoint(
-                    handler.generate, metrics_labels=metrics_labels
+                    handler.generate,
+                    metrics_labels=metrics_labels,
+                    health_check_payload=health_check_payload,
                )
        else:
            handler = RequestHandlerFactory().get_request_handler(handler_config)
-            await endpoint.serve_endpoint(handler.generate)
+            await endpoint.serve_endpoint(
+                handler.generate, health_check_payload=health_check_payload
+            )
 def main():