]# Frontend ports: use PORTS[0] for single router, PORTS for multi-router
# Request count used by the tests (presumably requests sent per test run;
# confirm against the call sites).
NUM_REQUESTS: int = 10
# Request payload shared by all tests: one user message, streamed response,
# generation capped at 10 tokens.
TEST_PAYLOAD: Dict[str, Any] = {
    "model": MODEL_NAME,
    "messages": [
        {
            "role": "user",
            # Adjacent literals are joined at compile time, so this is still
            # one single-line prompt string; it is only split for readability.
            "content": (
                "In a quiet meadow tucked between rolling hills, "
                "a plump gray rabbit nibbled on clover beneath the shade of "
                "a gnarled oak tree. "
                "Its ears twitched at the faint rustle of leaves, "
                "but it remained calm, confident in the safety of its burrow "
                "just a few hops away. "
                "The late afternoon sun warmed its fur, "
                "and tiny dust motes danced in the golden light "
                "as bees hummed lazily nearby. "
                "Though the rabbit lived a simple life, "
                "every day was an adventure of scents, shadows, and snacks"
                "—an endless search for the tastiest patch of greens "
                "and the softest spot to nap."
            ),
        },
    ],
    "stream": True,
    "max_tokens": 10,
}
# TRT-LLM configuration shared by all tests.
TRTLLM_ARGS: Dict[str, Any] = {
    "kv_block_size": TRTLLM_BLOCK_SIZE,
    "model": MODEL_NAME,
    # Limits the actual VRAM allocation per worker; required when multiple
    # workers run on the same GPU.
    "free_gpu_memory_fraction": 0.4,
    # Limits the context length to reduce the KV cache size.
    "max_seq_len": 1024,
}
classTRTLLMProcess:
"""Manages TRT-LLM workers using dynamo.trtllm (HTTP API + KV events).
This is a drop-in replacement for MockerProcess that uses real TRT-LLM workers.
The key difference: dynamo.trtllm automatically handles:
- HTTP API serving
- KV cache event publishing
- Integration with dynamo.frontend router
"""
def__init__(
self,
request,
trtllm_args:Optional[Dict[str,Any]]=None,
num_workers:int=2,
single_gpu:bool=False,
):
"""Initialize TRT-LLM workers with dynamo integration.