fix: avoid example TRT-LLM worker OOM by tuning params (#5250)

1efc41e0 · Qi Wang · GitHub · 9dd76ba2 · 1efc41e0 · 1efc41e0
Unverified Commit 1efc41e0 authored Jan 07, 2026 by Qi Wang Committed by GitHub Jan 07, 2026
2 changed files
--- a/examples/deployments/router_standalone_trtllm/README.md
+++ b/examples/deployments/router_standalone_trtllm/README.md
@@ -176,6 +176,7 @@ python test_router.py -v
 - `DYNAMO_DEBUG=1`: Enable debug file dumps to `/tmp/debug_*.txt`
 - `LOGLEVEL=DEBUG`: Set logging level (DEBUG, INFO, WARNING, ERROR)
 - `TRANSFORMERS_ATTN_IMPLEMENTATION=eager`: Disable FlashAttention (set automatically)
+- `TRTLLM_MAX_NUM_TOKENS`: Value used for the worker's `max_num_tokens` setting (default: `8192`)
 ### Port Assignment

--- a/examples/deployments/router_standalone_trtllm/worker.py
+++ b/examples/deployments/router_standalone_trtllm/worker.py
@@ -20,9 +20,15 @@ logger = logging.getLogger(__name__)
 DEFAULT_KV_EVENT_BUFFER_MAX_SIZE = 1024
+llm_max_num_tokens = int(os.getenv("TRTLLM_MAX_NUM_TOKENS", "8192"))
 # Debug flag: set DYNAMO_DEBUG=1 to enable debug file dumps
 DEBUG_ENABLED = os.environ.get("DYNAMO_DEBUG", "0") == "1"
 DEBUG_WORKER_KV_FILE = "/tmp/debug_worker_kv.txt"
+# api.py spins up 2 workers by default, so both workers share a single GPU's
+# memory. Each worker's KV cache free-GPU-memory fraction defaults to 0.4.
+# TODO: accept memory settings as arguments so that the caller can decide the
+# best way to allocate memory.
+kv_cache_free_gpu_memory_fraction = float(os.getenv("TRTLLM_FREE_GPU_FRAC", "0.4"))
 # Qwen2-VL specific token ID for image placeholders
 IMAGE_TOKEN_ID = 151937
@@ -347,7 +353,9 @@ class TrtllmWorker:
            kv_cache_config=KvCacheConfig(
                enable_block_reuse=True,
                event_buffer_max_size=DEFAULT_KV_EVENT_BUFFER_MAX_SIZE,
+                free_gpu_memory_fraction=kv_cache_free_gpu_memory_fraction,
            ),
+            max_num_tokens=llm_max_num_tokens,
        )
        self.metrics_publisher = MetricsPublisher(metrics_port)