Unverified Commit 67ff181d authored by Tzu-Ling Kan, committed by GitHub
Browse files

feat: Trtllm canary health check (#3082)


Signed-off-by: tzulingk@nvidia.com <tzulingk@nvidia.com>
Signed-off-by: Tzu-Ling Kan <tzulingk@nvidia.com>
parent f88d7dc7
...@@ -21,11 +21,27 @@ class TrtllmHealthCheckPayload(HealthCheckPayload): ...@@ -21,11 +21,27 @@ class TrtllmHealthCheckPayload(HealthCheckPayload):
""" """
Initialize TRT-LLM health check payload with TRT-LLM-specific defaults. Initialize TRT-LLM health check payload with TRT-LLM-specific defaults.
""" """
# Set TRT-LLM default payload - minimal request that completes quickly # Set TensorRT-LLM default payload - minimal request that completes quickly
# The handler expects token_ids, stop_conditions, and sampling_options
self.default_payload = { self.default_payload = {
"messages": [{"role": "user", "content": "1"}], "token_ids": [1], # Single token for minimal processing
"max_tokens": 1, "stop_conditions": {
"temperature": 0.0, "max_tokens": 1, # Generate only 1 token
"stream": False, "stop": None,
"stop_token_ids": None,
"include_stop_str_in_output": False,
"ignore_eos": False,
"min_tokens": 0,
},
"sampling_options": {
"temperature": 0.0,
"top_p": 1.0,
"top_k": 1,
"beam_width": 1,
"repetition_penalty": 1.0,
"presence_penalty": 0.0,
"frequency_penalty": 0.0,
"seed": None,
},
} }
super().__init__() super().__init__()
...@@ -27,6 +27,7 @@ from dynamo.llm import ModelInput, ModelRuntimeConfig, ModelType, register_llm ...@@ -27,6 +27,7 @@ from dynamo.llm import ModelInput, ModelRuntimeConfig, ModelType, register_llm
from dynamo.runtime import DistributedRuntime, dynamo_worker from dynamo.runtime import DistributedRuntime, dynamo_worker
from dynamo.runtime.logging import configure_dynamo_logging from dynamo.runtime.logging import configure_dynamo_logging
from dynamo.trtllm.engine import TensorRTLLMEngine, get_llm_engine from dynamo.trtllm.engine import TensorRTLLMEngine, get_llm_engine
from dynamo.trtllm.health_check import TrtllmHealthCheckPayload
from dynamo.trtllm.multimodal_processor import MultimodalRequestProcessor from dynamo.trtllm.multimodal_processor import MultimodalRequestProcessor
from dynamo.trtllm.publisher import get_publisher from dynamo.trtllm.publisher import get_publisher
from dynamo.trtllm.request_handlers.handlers import ( from dynamo.trtllm.request_handlers.handlers import (
...@@ -316,6 +317,9 @@ async def init(runtime: DistributedRuntime, config: Config): ...@@ -316,6 +317,9 @@ async def init(runtime: DistributedRuntime, config: Config):
runtime_config=runtime_config, runtime_config=runtime_config,
) )
# Get health check payload (checks env var and falls back to TensorRT-LLM default)
health_check_payload = TrtllmHealthCheckPayload().to_dict()
if config.publish_events_and_metrics and is_first_worker(config): if config.publish_events_and_metrics and is_first_worker(config):
# Initialize and pass in the publisher to the request handler to # Initialize and pass in the publisher to the request handler to
# publish events and metrics. # publish events and metrics.
...@@ -334,11 +338,15 @@ async def init(runtime: DistributedRuntime, config: Config): ...@@ -334,11 +338,15 @@ async def init(runtime: DistributedRuntime, config: Config):
handler_config.publisher = publisher handler_config.publisher = publisher
handler = RequestHandlerFactory().get_request_handler(handler_config) handler = RequestHandlerFactory().get_request_handler(handler_config)
await endpoint.serve_endpoint( await endpoint.serve_endpoint(
handler.generate, metrics_labels=metrics_labels handler.generate,
metrics_labels=metrics_labels,
health_check_payload=health_check_payload,
) )
else: else:
handler = RequestHandlerFactory().get_request_handler(handler_config) handler = RequestHandlerFactory().get_request_handler(handler_config)
await endpoint.serve_endpoint(handler.generate) await endpoint.serve_endpoint(
handler.generate, health_check_payload=health_check_payload
)
def main(): def main():
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment