fix: [TRTLLM+ LLAMA4 + Eagle 3] Remove the ‘two-models config’ and set the...

fix: [TRTLLM+ LLAMA4 + Eagle 3] Remove the ‘two-models config’ and set the ‘one-model’ solution as the default (#2661)

fix: [TRTLLM+ LLAMA4 + Eagle 3] Remove the ‘two-models config’ and set the...
fix: [TRTLLM+ LLAMA4 + Eagle 3] Remove the ‘two-models config’ and set the ‘one-model’ solution as the default (#2661)
0bd4995d · Richard Huo · GitHub · cbe854fc · cbe854fc · 0bd4995d
Unverified Commit 0bd4995d authored Aug 22, 2025 by Richard Huo Committed by GitHub Aug 22, 2025
7 changed files
--- a/components/backends/trtllm/engine_configs/llama4/eagle/eagle_agg.yaml
+++ b/components/backends/trtllm/engine_configs/llama4/eagle/eagle_agg.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-backend: pytorch
-tensor_parallel_size: 4
-moe_expert_parallel_size: 4
-max_batch_size: 256
-# When max_num_tokens set to higher values, can cause OOM issues.
-# Will be investigated in the future with TRTLLM team.
-max_num_tokens: 1024
-max_seq_len: 8448
-enable_autotuner: false
-disable_overlap_scheduler: true
-
-# Enable Speculative Decoding in the model engine
-speculative_config:
-  decoding_type: Eagle
-  max_draft_len: 1
-  speculative_model_dir: nvidia/Llama-4-Maverick-17B-128E-Eagle3
-  eagle3_one_model: false
-
-kv_cache_config:
-  free_gpu_memory_fraction: 0.5
-  enable_block_reuse: false
-
-
-cuda_graph_config:
-  max_batch_size: 8
-
--- a/components/backends/trtllm/engine_configs/llama4/eagle_one_model/eagle_agg.yml
+++ b/components/backends/trtllm/engine_configs/llama4/eagle_one_model/eagle_agg.yml
@@ -14,11 +14,11 @@
 # limitations under the License.

 backend: pytorch
-tensor_parallel_size: 8
-moe_expert_parallel_size: 8
-max_batch_size: 8
-max_num_tokens: 4096
-disable_overlap_scheduler: true # disable_overlap_scheduler is having acc issue on both aggregated and disaggregated serving
+tensor_parallel_size: 4
+moe_expert_parallel_size: 4
+max_batch_size: 192
+max_num_tokens: 3072
+disable_overlap_scheduler: false

 # Enable Speculative Decoding in the model engine
 speculative_config:
@@ -28,11 +28,12 @@ speculative_config:
  eagle3_one_model: true

 kv_cache_config:
-  free_gpu_memory_fraction: 0.5
-  enable_block_reuse: false # true when target and draft are same kv dtype
+  free_gpu_memory_fraction: 0.2
+  enable_block_reuse: false

 cuda_graph_config:
-  padding_enabled: true
-  max_batch_size: 8
+  enable_padding: true
+  batch_sizes: [1,2,3,4,5,6,7,8,16,32,48,64,128,190,191,192]

 print_iter_log: true
+
--- a/components/backends/trtllm/engine_configs/llama4/eagle/eagle_decode.yaml
+++ b/components/backends/trtllm/engine_configs/llama4/eagle/eagle_decode.yaml
@@ -17,23 +17,21 @@ backend: pytorch
 tensor_parallel_size: 4
 moe_expert_parallel_size: 4
 max_batch_size: 256
-max_num_tokens: 512
+max_num_tokens: 1024
 # 8704 = 8192 ISL + 512 OSL
 max_seq_len: 8704
 disable_overlap_scheduler: true
-enable_autotuner: false

 # Enable Speculative Decoding in the model engine
 speculative_config:
  decoding_type: Eagle
-  max_draft_len: 1
+  max_draft_len: 3
  speculative_model_dir: nvidia/Llama-4-Maverick-17B-128E-Eagle3
-  eagle3_one_model: false
+  eagle3_one_model: true

 kv_cache_config:
  free_gpu_memory_fraction: 0.5
  enable_block_reuse: false
-  dtype: fp8

 cuda_graph_config:
  enable_padding: true

--- a/components/backends/trtllm/engine_configs/llama4/eagle/eagle_prefill.yaml
+++ b/components/backends/trtllm/engine_configs/llama4/eagle/eagle_prefill.yaml
@@ -21,19 +21,17 @@ max_num_tokens: 8192
 max_seq_len: 8192
 print_iter_log: true
 disable_overlap_scheduler: true
-enable_autotuner: false

 # Enable Speculative Decoding in the model engine
 speculative_config:
  decoding_type: Eagle
-  max_draft_len: 1
+  max_draft_len: 3
  speculative_model_dir: nvidia/Llama-4-Maverick-17B-128E-Eagle3
-  eagle3_one_model: false
+  eagle3_one_model: true

 kv_cache_config:
  free_gpu_memory_fraction: 0.5
  enable_block_reuse: false
-  dtype: fp8

 cache_transceiver_config:
  backend: default
--- a/components/backends/trtllm/engine_configs/llama4/eagle_one_model/eagle_decode.yaml
+++ b/components/backends/trtllm/engine_configs/llama4/eagle_one_model/eagle_decode.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-backend: pytorch
-tensor_parallel_size: 8
-moe_expert_parallel_size: 8
-max_batch_size: 256
-max_num_tokens: 1024
-# 8704 = 8192 ISL + 512 OSL
-max_seq_len: 8704
-disable_overlap_scheduler: true
-
-# Enable Speculative Decoding in the model engine
-speculative_config:
-  decoding_type: Eagle
-  max_draft_len: 3
-  speculative_model_dir: nvidia/Llama-4-Maverick-17B-128E-Eagle3
-  eagle3_one_model: True
-
-kv_cache_config:
-  free_gpu_memory_fraction: 0.5
-  enable_block_reuse: false
-
-cuda_graph_config:
-  padding_enabled: true
-  max_batch_size: 256
-
-print_iter_log: true
-
-cache_transceiver_config:
-  backend: default
--- a/components/backends/trtllm/engine_configs/llama4/eagle_one_model/eagle_prefill.yaml
+++ b/components/backends/trtllm/engine_configs/llama4/eagle_one_model/eagle_prefill.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-backend: pytorch
-tensor_parallel_size: 8
-moe_expert_parallel_size: 8
-max_batch_size: 1
-max_num_tokens: 8192
-max_seq_len: 8192
-print_iter_log: true
-disable_overlap_scheduler: true
-
-# Enable Speculative Decoding in the model engine
-speculative_config:
-  decoding_type: Eagle
-  max_draft_len: 3
-  speculative_model_dir: nvidia/Llama-4-Maverick-17B-128E-Eagle3
-  eagle3_one_model: True
-
-kv_cache_config:
-  free_gpu_memory_fraction: 0.5
-  enable_block_reuse: false
-
-cache_transceiver_config:
-  backend: default
--- a/components/backends/trtllm/llama4_plus_eagle.md
+++ b/components/backends/trtllm/llama4_plus_eagle.md
@@ -30,16 +30,7 @@ This guide demonstrates how to deploy Llama 4 Maverick Instruct with Eagle Specu
 For advanced control over how requests are routed between prefill and decode workers in disaggregated mode, refer to the [Disaggregation Strategy](./README.md#disaggregation-strategy) section.

 ## Notes
-* To run Eagle Speculative Decoding with Llama 4, ensure the container meets the following criteria:
-  * Built with a version of TensorRT-LLM based on the 0.21 release [Link](https://github.com/NVIDIA/TensorRT-LLM/tree/release/0.21)
-* If you need to download model weights off huggingface, make sure you run the command `huggingface-cli login` and have access to the necessary gated models.
-
-## Eagle3-one-model
-* Eagle3-one-model (`eagle3_one_model=True`) config is added in `engine_configs/llama4/eagle_one_model`. Build dynamo with the latest commit `66f299a` in TRTLLM 1.0.0.rc2 [Link](https://github.com/NVIDIA/TensorRT-LLM/commits/v1.0.0rc2/).
-* The configs in `engine_configs/llama4/eagle_one_model` are tested with 8xH100 cluster. Be sure to change the `NUM_GPUS_PER_NODE` accordingly or change TP/EP size in config. 1 8xH100 node for aggregated .yml file, 2 8xH100 for prefill/decode .yml file.
-* The current `./multinode/start_frontend_services.sh` may got ran `NUM_GPUS_PER_NODE` times depending on how srun/mpi is launched, beware that the frontend service only needs to be ran once.
-* Eagle3-one-model appends the eagle3 layer at the end of the TRTLLM engine, instead of sending base/draft requests between 2 engines. Visit TRTLLM for more information.
-
+* Make sure `eagle3_one_model: true` is set in each LLM API config inside the `engine_configs/llama4/eagle` folder.

 ## Setup

@@ -66,7 +57,6 @@ export NUM_NODES=1
 export ENGINE_CONFIG="/mnt/engine_configs/llama4/eagle/eagle_agg.yaml"
 ./multinode/srun_aggregated.sh
 ```
-* Known Issue: In Aggregated Serving, setting `max_num_tokens` to higher values (e.g. `max_num_tokens: 8448`) can lead to Out of Memory (OOM) errors. This is being investigated by the TRTLLM team.

 ## Disaggregated Serving

@@ -77,8 +67,6 @@ export NUM_DECODE_NODES=1
 export DECODE_ENGINE_CONFIG="/mnt/engine_configs/llama4/eagle/eagle_decode.yaml"
 ./multinode/srun_disaggregated.sh
 ```
-* Known Issue: In Aggregated Serving, setting `max_num_tokens` to higher values (e.g. `max_num_tokens: 8448`) can lead to Out of Memory (OOM) errors. This is being investigated by the TRTLLM team.
-

 ## Example Request