fix: Migrating trtllm examples from `1.0.0rc0` to `1.0.0rc4` (#2217)

f10aab3b · KrishnanPrash · GitHub · 97390ac0 · f10aab3b · f10aab3b
Unverified Commit f10aab3b authored Jul 31, 2025 by KrishnanPrash Committed by GitHub Jul 31, 2025
18 changed files
--- a/components/backends/trtllm/engine_configs/agg.yaml
+++ b/components/backends/trtllm/engine_configs/agg.yaml
@@ -28,4 +28,7 @@ kv_cache_config:
 # NOTE: overlap_scheduler enabled by default since this commit and changed
 # config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
 # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
-use_cuda_graph: true
+cuda_graph_config:
+  max_batch_size: 16
\ No newline at end of file
--- a/components/backends/trtllm/engine_configs/decode.yaml
+++ b/components/backends/trtllm/engine_configs/decode.yaml
@@ -16,11 +16,16 @@ tensor_parallel_size: 1
 moe_expert_parallel_size: 1
 enable_attention_dp: false
 max_num_tokens: 8192
-max_batch_size: 16
 trust_remote_code: true
 backend: pytorch
 enable_chunked_prefill: true
 disable_overlap_scheduler: false
-use_cuda_graph: true
+cuda_graph_config:
+  max_batch_size: 16
 kv_cache_config:
  free_gpu_memory_fraction: 0.95
+cache_transceiver_config:
+  backend: default
\ No newline at end of file
--- a/components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_agg.yaml
+++ b/components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_agg.yaml
@@ -28,23 +28,24 @@ max_num_tokens: 8448
 max_seq_len: 8448
 kv_cache_config:
  free_gpu_memory_fraction: 0.30
+  dtype: fp8
 # Enable the MTP(Multi-Token Prediction) in the model engine
 speculative_config:
  decoding_type: MTP
  num_nextn_predict_layers: 1
-use_cuda_graph: true
+cuda_graph_config:
-cuda_graph_padding_enabled: true
+  enable_padding: true
-cuda_graph_batch_sizes:
+  batch_sizes:
- 1
+  - 1
- 2
+  - 2
- 4
+  - 4
- 8
+  - 8
- 16
+  - 16
- 32
+  - 32
- 64
+  - 64
- 128
+  - 128
- 256
+  - 256
 print_iter_log: true
-kv_cache_dtype: fp8
--- a/components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_decode.yaml
+++ b/components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_decode.yaml
@@ -31,23 +31,24 @@ max_num_tokens: 512
 max_seq_len: 8704
 kv_cache_config:
  free_gpu_memory_fraction: 0.85
+  dtype: fp8
 # Enable the MTP(Multi-Token Prediction) in decode model engine
 speculative_config:
  decoding_type: MTP
  num_nextn_predict_layers: 1
-use_cuda_graph: true
+cuda_graph_config:
-cuda_graph_padding_enabled: true
+  enable_padding: true
-cuda_graph_batch_sizes:
+  batch_sizes:
- 1
+  - 1
- 2
+  - 2
- 4
+  - 4
- 8
+  - 8
- 16
+  - 16
- 32
+  - 32
- 64
+  - 64
- 128
+  - 128
- 256
+  - 256
 print_iter_log: true
\ No newline at end of file
-kv_cache_dtype: fp8
--- a/components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_prefill.yaml
+++ b/components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_prefill.yaml
@@ -27,8 +27,9 @@ max_num_tokens: 8192
 max_seq_len: 8192
 kv_cache_config:
  free_gpu_memory_fraction: 0.75
+  dtype: fp8
 print_iter_log: true
-kv_cache_dtype: fp8
 disable_overlap_scheduler: true
 # Enable the MTP(Multi-Token Prediction) in the prefill model engine

--- a/components/backends/trtllm/engine_configs/deepseek_r1/simple/agg.yaml
+++ b/components/backends/trtllm/engine_configs/deepseek_r1/simple/agg.yaml
@@ -31,24 +31,26 @@ kv_cache_config:
  # With dp attention enabled: large ISL at high concurrency may need
  # free_gpu_memory_fraction low to have enough available memory.
  # free_gpu_memory_fraction: 0.30
+  dtype: fp8
 # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
 # NOTE: overlap_scheduler enabled by default since this commit and changed
 # config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
 # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
-use_cuda_graph: true
+cuda_graph_config:
-cuda_graph_padding_enabled: true
+  enable_padding: true
 # NOTE: For larger max batch size, you may want to add larger cuda graph
 # batch sizes below to match.
-cuda_graph_batch_sizes:
+  batch_sizes:
- 1
+  - 1
- 2
+  - 2
- 4
+  - 4
- 8
+  - 8
- 16
+  - 16
- 32
+  - 32
- 64
+  - 64
- 128
+  - 128
- 256
+  - 256
 print_iter_log: true
-kv_cache_dtype: fp8
--- a/components/backends/trtllm/engine_configs/deepseek_r1/simple/decode.yaml
+++ b/components/backends/trtllm/engine_configs/deepseek_r1/simple/decode.yaml
@@ -31,25 +31,27 @@ kv_cache_config:
  # With dp attention enabled: large ISL at high concurrency may need
  # free_gpu_memory_fraction low to have enough available memory.
  # free_gpu_memory_fraction: 0.30
+  dtype: fp8
 # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
 # NOTE: overlap_scheduler enabled by default since this commit and changed
 # config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
 # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
 disable_overlap_scheduler: false
-use_cuda_graph: true
-cuda_graph_padding_enabled: true
+cuda_graph_config:
-# NOTE: For larger max batch size, you may want to add larger cuda graph
+  enable_padding: true
-# batch sizes below to match.
+  # NOTE: For larger max batch size, you may want to
-cuda_graph_batch_sizes:
+  # add larger cuda graph batch sizes below to match.
- 1
+  batch_sizes:
- 2
+  - 1
- 4
+  - 2
- 8
+  - 4
- 16
+  - 8
- 32
+  - 16
- 64
+  - 32
- 128
+  - 64
- 256
+  - 128
+  - 256
 print_iter_log: true
-kv_cache_dtype: fp8
--- a/components/backends/trtllm/engine_configs/deepseek_r1/simple/prefill.yaml
+++ b/components/backends/trtllm/engine_configs/deepseek_r1/simple/prefill.yaml
@@ -26,6 +26,7 @@ max_seq_len: 8192
 kv_cache_config:
  free_gpu_memory_fraction: 0.75
+  dtype: fp8 # NOTE: This dtype must match in both prefill/decode configs
 # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
 # NOTE: overlap_scheduler enabled by default since this commit and changed
@@ -33,5 +34,3 @@ kv_cache_config:
 # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
 disable_overlap_scheduler: true
 print_iter_log: true
\ No newline at end of file
-# NOTE: This dtype must match in both prefill/decode configs
-kv_cache_dtype: fp8
--- a/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/dep16_agg.yaml
+++ b/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/dep16_agg.yaml
@@ -10,18 +10,20 @@ enable_attention_dp: true
 max_batch_size: 256
 max_num_tokens: 256
 max_seq_len: 8448
 kv_cache_config:
  free_gpu_memory_fraction: 0.7
-use_cuda_graph: true
+  dtype: fp8
-cuda_graph_padding_enabled: true
-cuda_graph_batch_sizes:
+cuda_graph_config:
- 1
+  enable_padding: true
- 2
+  batch_sizes:
- 4
+  - 1
- 8
+  - 2
- 16
+  - 4
- 32
+  - 8
- 64
+  - 16
- 128
+  - 32
- 256
+  - 64
-kv_cache_dtype: fp8
+  - 128
+  - 256
--- a/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_agg.yaml
+++ b/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_agg.yaml
@@ -3,14 +3,16 @@
 backend: pytorch
 # WideEP related settings
-moe_backend: WideEP
+moe_config:
-# moe_max_num_tokens will default to max_num_tokens if left unspecified.
+  backend: WIDEEP
-#
+  # moe_max_num_tokens will default to max_num_tokens if left unspecified.
-# If you want to set this value explicitly, one recommendation is below:
+  #
-#   moe_max_num_tokens = max_batch_size * moe_expert_parallel_size
+  # If you want to set this value explicitly, one recommendation is below:
-#   4096 = 256 * 16
+  #   moe_max_num_tokens = max_batch_size * moe_expert_parallel_size
-# moe_max_num_tokens: 4096
+  #   4096 = 256 * 16
-moe_load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml
+  # moe_max_num_tokens: 4096
+  load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml
 tensor_parallel_size: 16
 moe_expert_parallel_size: 16
@@ -18,18 +20,20 @@ enable_attention_dp: true
 max_batch_size: 256
 max_num_tokens: 256
 max_seq_len: 8448
 kv_cache_config:
-  free_gpu_memory_fraction: 0.7
+  free_gpu_memory_fraction: 0.3
-use_cuda_graph: true
+  dtype: fp8
-cuda_graph_padding_enabled: true
-cuda_graph_batch_sizes:
+cuda_graph_config:
- 1
+  enable_padding: true
- 2
+  batch_sizes:
- 4
+  - 1
- 8
+  - 2
- 16
+  - 4
- 32
+  - 8
- 64
+  - 16
- 128
+  - 32
- 256
+  - 64
-kv_cache_dtype: fp8
+  - 128
+  - 256
\ No newline at end of file
--- a/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_decode.yaml
+++ b/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_decode.yaml
@@ -15,8 +15,9 @@
 backend: pytorch
 # WideEP related settings
-moe_backend: WideEP
+moe_config:
-moe_load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml
+  backend: WIDEEP
+  load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml
 # TP/EP/PP/DP
 tensor_parallel_size: 16
@@ -35,25 +36,28 @@ kv_cache_config:
  # With dp attention enabled: large ISL at high concurrency may need
  # free_gpu_memory_fraction low to have enough available memory.
  free_gpu_memory_fraction: 0.30
+  dtype: fp8
 # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
 # NOTE: overlap_scheduler enabled by default since this commit and changed
 # config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
 # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
 disable_overlap_scheduler: false
-use_cuda_graph: true
+cuda_graph_config:
-cuda_graph_padding_enabled: true
+  enable_padding: true
-# NOTE: For larger max batch size, you may want to add larger cuda graph
+  # NOTE: For larger max batch size, you may want to
-# batch sizes below to match.
+  # add larger cuda graph batch sizes below to match.
-cuda_graph_batch_sizes:
+  batch_sizes:
- 1
+  - 1
- 2
+  - 2
- 4
+  - 4
- 8
+  - 8
- 16
+  - 16
- 32
+  - 32
- 64
+  - 64
- 128
+  - 128
- 256
+  - 256
 print_iter_log: true
-kv_cache_dtype: fp8
--- a/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_prefill.yaml
+++ b/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_prefill.yaml
@@ -15,8 +15,9 @@
 backend: pytorch
 # WideEP related settings
-moe_backend: WideEP
+moe_config:
-moe_load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml
+  backend: WIDEEP
+  load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml
 # TP/EP/PP/DP
 tensor_parallel_size: 16
@@ -29,7 +30,8 @@ max_num_tokens: 8192
 max_seq_len: 8192
 kv_cache_config:
-  free_gpu_memory_fraction: 0.75
+  free_gpu_memory_fraction: 0.3
+  dtype: fp8 # NOTE: This dtype must match in both prefill/decode configs
 # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
 # NOTE: overlap_scheduler enabled by default since this commit and changed
@@ -37,5 +39,3 @@ kv_cache_config:
 # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
 disable_overlap_scheduler: true
 print_iter_log: true
\ No newline at end of file
-# NOTE: This dtype must match in both prefill/decode configs
-kv_cache_dtype: fp8
--- a/components/backends/trtllm/engine_configs/llama4/eagle/eagle_agg.yaml
+++ b/components/backends/trtllm/engine_configs/llama4/eagle/eagle_agg.yaml
@@ -21,31 +21,21 @@ max_batch_size: 256
 # Will be investigated in the future with TRTLLM team.
 max_num_tokens: 1024
 max_seq_len: 8448
-autotuner_enabled: false
+enable_autotuner: false
 disable_overlap_scheduler: true
 # Enable Speculative Decoding in the model engine
 speculative_config:
  decoding_type: Eagle
  max_draft_len: 1
-  pytorch_weights_path: nvidia/Llama-4-Maverick-17B-128E-Eagle3
+  speculative_model_dir: nvidia/Llama-4-Maverick-17B-128E-Eagle3
-  eagle3_one_model: False
+  eagle3_one_model: false
 kv_cache_config:
  free_gpu_memory_fraction: 0.5
  enable_block_reuse: false
-use_cuda_graph: true
-cuda_graph_padding_enabled: true
+cuda_graph_config:
-cuda_graph_batch_sizes:
+  max_batch_size: 8
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
-print_iter_log: true
-kv_cache_dtype: fp8
--- a/components/backends/trtllm/engine_configs/llama4/eagle/eagle_decode.yaml
+++ b/components/backends/trtllm/engine_configs/llama4/eagle/eagle_decode.yaml
@@ -28,23 +28,24 @@ speculative_config:
  decoding_type: Eagle
  max_draft_len: 1
  pytorch_weights_path: nvidia/Llama-4-Maverick-17B-128E-Eagle3
-  eagle3_one_model: False
+  eagle3_one_model: false
 kv_cache_config:
  free_gpu_memory_fraction: 0.5
  enable_block_reuse: false
+  dtype: fp8
+cuda_graph_config:
+  enable_padding: true
+  batch_sizes:
+  - 1
+  - 2
+  - 4
+  - 8
+  - 16
+  - 32
+  - 64
+  - 128
+  - 256
-use_cuda_graph: true
-cuda_graph_padding_enabled: true
-cuda_graph_batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
 print_iter_log: true
-kv_cache_dtype: fp8
--- a/components/backends/trtllm/engine_configs/llama4/eagle/eagle_prefill.yaml
+++ b/components/backends/trtllm/engine_configs/llama4/eagle/eagle_prefill.yaml
@@ -29,7 +29,7 @@ speculative_config:
  decoding_type: Eagle
  max_draft_len: 1
  pytorch_weights_path: nvidia/Llama-4-Maverick-17B-128E-Eagle3
-  eagle3_one_model: False
+  eagle3_one_model: false
 kv_cache_config:
  free_gpu_memory_fraction: 0.5

--- a/components/backends/trtllm/engine_configs/prefill.yaml
+++ b/components/backends/trtllm/engine_configs/prefill.yaml
@@ -16,13 +16,15 @@ tensor_parallel_size: 1
 moe_expert_parallel_size: 1
 enable_attention_dp: false
 max_num_tokens: 8192
-max_batch_size: 16
 trust_remote_code: true
 backend: pytorch
 enable_chunked_prefill: true
 # Overlap scheduler not currently supported in prefill only workers.
 disable_overlap_scheduler: true
-use_cuda_graph: false
+cuda_graph_config:
+  max_batch_size: 16
 kv_cache_config:
  free_gpu_memory_fraction: 0.95
+cache_transceiver_config:
+  backend: default
\ No newline at end of file
--- a/components/backends/trtllm/src/dynamo/trtllm/main.py
+++ b/components/backends/trtllm/src/dynamo/trtllm/main.py
@@ -101,8 +101,10 @@ async def init(runtime: DistributedRuntime, config: Config):
            kv_cache_config["event_buffer_max_size"] = DEFAULT_KV_EVENT_BUFFER_MAX_SIZE
        else:
            kv_cache_config = arg_map["kv_cache_config"]
-            if not kv_cache_config.event_buffer_max_size:
+            if "event_buffer_max_size" not in kv_cache_config:
-                kv_cache_config.event_buffer_max_size = DEFAULT_KV_EVENT_BUFFER_MAX_SIZE
+                kv_cache_config[
+                    "event_buffer_max_size"
+                ] = DEFAULT_KV_EVENT_BUFFER_MAX_SIZE
        arg_map["kv_cache_config"] = kv_cache_config
        # Only pytorch backend is supported for now to publish events and metrics.

--- a/container/build.sh
+++ b/container/build.sh
@@ -96,7 +96,7 @@ TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL="0"
 TENSORRTLLM_INDEX_URL="https://pypi.python.org/simple"
 # TODO: Remove the version specification from here and use the ai-dynamo[trtllm] package.
 # Need to update the Dockerfile.tensorrt_llm to use the ai-dynamo[trtllm] package.
-DEFAULT_TENSORRTLLM_PIP_WHEEL="tensorrt-llm==1.0.0rc0"
+DEFAULT_TENSORRTLLM_PIP_WHEEL="tensorrt-llm==1.0.0rc4"
 TENSORRTLLM_PIP_WHEEL=""