Unverified Commit 67ff181d authored by Tzu-Ling Kan's avatar Tzu-Ling Kan Committed by GitHub
Browse files

feat: Trtllm canary health check (#3082)


Signed-off-by: default avatartzulingk@nvidia.com <tzulingk@nvidia.com>
Signed-off-by: default avatarTzu-Ling Kan <tzulingk@nvidia.com>
parent f88d7dc7
......@@ -21,11 +21,27 @@ class TrtllmHealthCheckPayload(HealthCheckPayload):
"""
Initialize TRT-LLM health check payload with TRT-LLM-specific defaults.
"""
# Set TRT-LLM default payload - minimal request that completes quickly
# Set TensorRT-LLM default payload - minimal request that completes quickly
# The handler expects token_ids, stop_conditions, and sampling_options
self.default_payload = {
"messages": [{"role": "user", "content": "1"}],
"max_tokens": 1,
"token_ids": [1], # Single token for minimal processing
"stop_conditions": {
"max_tokens": 1, # Generate only 1 token
"stop": None,
"stop_token_ids": None,
"include_stop_str_in_output": False,
"ignore_eos": False,
"min_tokens": 0,
},
"sampling_options": {
"temperature": 0.0,
"stream": False,
"top_p": 1.0,
"top_k": 1,
"beam_width": 1,
"repetition_penalty": 1.0,
"presence_penalty": 0.0,
"frequency_penalty": 0.0,
"seed": None,
},
}
super().__init__()
......@@ -27,6 +27,7 @@ from dynamo.llm import ModelInput, ModelRuntimeConfig, ModelType, register_llm
from dynamo.runtime import DistributedRuntime, dynamo_worker
from dynamo.runtime.logging import configure_dynamo_logging
from dynamo.trtllm.engine import TensorRTLLMEngine, get_llm_engine
from dynamo.trtllm.health_check import TrtllmHealthCheckPayload
from dynamo.trtllm.multimodal_processor import MultimodalRequestProcessor
from dynamo.trtllm.publisher import get_publisher
from dynamo.trtllm.request_handlers.handlers import (
......@@ -316,6 +317,9 @@ async def init(runtime: DistributedRuntime, config: Config):
runtime_config=runtime_config,
)
# Get health check payload (checks env var and falls back to TensorRT-LLM default)
health_check_payload = TrtllmHealthCheckPayload().to_dict()
if config.publish_events_and_metrics and is_first_worker(config):
# Initialize and pass in the publisher to the request handler to
# publish events and metrics.
......@@ -334,11 +338,15 @@ async def init(runtime: DistributedRuntime, config: Config):
handler_config.publisher = publisher
handler = RequestHandlerFactory().get_request_handler(handler_config)
await endpoint.serve_endpoint(
handler.generate, metrics_labels=metrics_labels
handler.generate,
metrics_labels=metrics_labels,
health_check_payload=health_check_payload,
)
else:
handler = RequestHandlerFactory().get_request_handler(handler_config)
await endpoint.serve_endpoint(handler.generate)
await endpoint.serve_endpoint(
handler.generate, health_check_payload=health_check_payload
)
def main():
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment