Commit d537378a (unverified), authored by Ryan McCormick, committed by GitHub
Browse files

fix: Update disagg configs for trtllm 1.0.0rc4 changes (main) (#2278) (#2282)

parent 4b8a748f
...@@ -28,4 +28,4 @@ kv_cache_config: ...@@ -28,4 +28,4 @@ kv_cache_config:
free_gpu_memory_fraction: 0.95 free_gpu_memory_fraction: 0.95
cache_transceiver_config: cache_transceiver_config:
backend: default backend: default
\ No newline at end of file
...@@ -51,4 +51,7 @@ cuda_graph_config: ...@@ -51,4 +51,7 @@ cuda_graph_config:
- 128 - 128
- 256 - 256
print_iter_log: true print_iter_log: true
\ No newline at end of file
cache_transceiver_config:
backend: default
...@@ -36,3 +36,6 @@ disable_overlap_scheduler: true ...@@ -36,3 +36,6 @@ disable_overlap_scheduler: true
speculative_config: speculative_config:
decoding_type: MTP decoding_type: MTP
num_nextn_predict_layers: 1 num_nextn_predict_layers: 1
cache_transceiver_config:
backend: default
...@@ -55,3 +55,6 @@ cuda_graph_config: ...@@ -55,3 +55,6 @@ cuda_graph_config:
- 256 - 256
print_iter_log: true print_iter_log: true
cache_transceiver_config:
backend: default
...@@ -33,4 +33,7 @@ kv_cache_config: ...@@ -33,4 +33,7 @@ kv_cache_config:
# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler': # config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428 # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
disable_overlap_scheduler: true disable_overlap_scheduler: true
print_iter_log: true print_iter_log: true
\ No newline at end of file
cache_transceiver_config:
backend: default
...@@ -61,3 +61,6 @@ cuda_graph_config: ...@@ -61,3 +61,6 @@ cuda_graph_config:
print_iter_log: true print_iter_log: true
cache_transceiver_config:
backend: default
...@@ -38,4 +38,7 @@ kv_cache_config: ...@@ -38,4 +38,7 @@ kv_cache_config:
# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler': # config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428 # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
disable_overlap_scheduler: true disable_overlap_scheduler: true
print_iter_log: true print_iter_log: true
\ No newline at end of file
cache_transceiver_config:
backend: default
...@@ -21,13 +21,13 @@ max_num_tokens: 512 ...@@ -21,13 +21,13 @@ max_num_tokens: 512
# 8704 = 8192 ISL + 512 OSL # 8704 = 8192 ISL + 512 OSL
max_seq_len: 8704 max_seq_len: 8704
disable_overlap_scheduler: true disable_overlap_scheduler: true
autotuner_enabled: false enable_autotuner: false
# Enable Speculative Decoding in the model engine # Enable Speculative Decoding in the model engine
speculative_config: speculative_config:
decoding_type: Eagle decoding_type: Eagle
max_draft_len: 1 max_draft_len: 1
pytorch_weights_path: nvidia/Llama-4-Maverick-17B-128E-Eagle3 speculative_model_dir: nvidia/Llama-4-Maverick-17B-128E-Eagle3
eagle3_one_model: false eagle3_one_model: false
kv_cache_config: kv_cache_config:
...@@ -49,3 +49,6 @@ cuda_graph_config: ...@@ -49,3 +49,6 @@ cuda_graph_config:
- 256 - 256
print_iter_log: true print_iter_log: true
cache_transceiver_config:
backend: default
...@@ -20,17 +20,20 @@ max_batch_size: 1 ...@@ -20,17 +20,20 @@ max_batch_size: 1
max_num_tokens: 8192 max_num_tokens: 8192
max_seq_len: 8192 max_seq_len: 8192
print_iter_log: true print_iter_log: true
kv_cache_dtype: fp8
disable_overlap_scheduler: true disable_overlap_scheduler: true
autotuner_enabled: false enable_autotuner: false
# Enable Speculative Decoding in the model engine # Enable Speculative Decoding in the model engine
speculative_config: speculative_config:
decoding_type: Eagle decoding_type: Eagle
max_draft_len: 1 max_draft_len: 1
pytorch_weights_path: nvidia/Llama-4-Maverick-17B-128E-Eagle3 speculative_model_dir: nvidia/Llama-4-Maverick-17B-128E-Eagle3
eagle3_one_model: false eagle3_one_model: false
kv_cache_config: kv_cache_config:
free_gpu_memory_fraction: 0.5 free_gpu_memory_fraction: 0.5
enable_block_reuse: false enable_block_reuse: false
dtype: fp8
cache_transceiver_config:
backend: default
...@@ -24,7 +24,7 @@ disable_overlap_scheduler: true # disable_overlap_scheduler is having acc issue ...@@ -24,7 +24,7 @@ disable_overlap_scheduler: true # disable_overlap_scheduler is having acc issue
speculative_config: speculative_config:
decoding_type: Eagle decoding_type: Eagle
max_draft_len: 3 max_draft_len: 3
pytorch_weights_path: nvidia/Llama-4-Maverick-17B-128E-Eagle3 speculative_model_dir: nvidia/Llama-4-Maverick-17B-128E-Eagle3
eagle3_one_model: true eagle3_one_model: true
kv_cache_config: kv_cache_config:
......
...@@ -26,7 +26,7 @@ disable_overlap_scheduler: true ...@@ -26,7 +26,7 @@ disable_overlap_scheduler: true
speculative_config: speculative_config:
decoding_type: Eagle decoding_type: Eagle
max_draft_len: 3 max_draft_len: 3
pytorch_weights_path: nvidia/Llama-4-Maverick-17B-128E-Eagle3 speculative_model_dir: nvidia/Llama-4-Maverick-17B-128E-Eagle3
eagle3_one_model: True eagle3_one_model: True
kv_cache_config: kv_cache_config:
...@@ -38,3 +38,6 @@ cuda_graph_config: ...@@ -38,3 +38,6 @@ cuda_graph_config:
max_batch_size: 256 max_batch_size: 256
print_iter_log: true print_iter_log: true
cache_transceiver_config:
backend: default
...@@ -26,9 +26,12 @@ disable_overlap_scheduler: true ...@@ -26,9 +26,12 @@ disable_overlap_scheduler: true
speculative_config: speculative_config:
decoding_type: Eagle decoding_type: Eagle
max_draft_len: 3 max_draft_len: 3
pytorch_weights_path: nvidia/Llama-4-Maverick-17B-128E-Eagle3 speculative_model_dir: nvidia/Llama-4-Maverick-17B-128E-Eagle3
eagle3_one_model: True eagle3_one_model: True
kv_cache_config: kv_cache_config:
free_gpu_memory_fraction: 0.5 free_gpu_memory_fraction: 0.5
enable_block_reuse: false enable_block_reuse: false
cache_transceiver_config:
backend: default
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment