Unverified commit d537378a, authored by Ryan McCormick and committed by GitHub
Browse files

fix: Update disagg configs for trtllm 1.0.0rc4 changes (main) (#2278) (#2282)

parent 4b8a748f
......@@ -28,4 +28,4 @@ kv_cache_config:
free_gpu_memory_fraction: 0.95
cache_transceiver_config:
backend: default
\ No newline at end of file
backend: default
......@@ -51,4 +51,7 @@ cuda_graph_config:
- 128
- 256
print_iter_log: true
\ No newline at end of file
print_iter_log: true
cache_transceiver_config:
backend: default
......@@ -36,3 +36,6 @@ disable_overlap_scheduler: true
speculative_config:
decoding_type: MTP
num_nextn_predict_layers: 1
cache_transceiver_config:
backend: default
......@@ -55,3 +55,6 @@ cuda_graph_config:
- 256
print_iter_log: true
cache_transceiver_config:
backend: default
......@@ -33,4 +33,7 @@ kv_cache_config:
# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
disable_overlap_scheduler: true
print_iter_log: true
\ No newline at end of file
print_iter_log: true
cache_transceiver_config:
backend: default
......@@ -61,3 +61,6 @@ cuda_graph_config:
print_iter_log: true
cache_transceiver_config:
backend: default
......@@ -38,4 +38,7 @@ kv_cache_config:
# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
disable_overlap_scheduler: true
print_iter_log: true
\ No newline at end of file
print_iter_log: true
cache_transceiver_config:
backend: default
......@@ -21,13 +21,13 @@ max_num_tokens: 512
# 8704 = 8192 ISL + 512 OSL
max_seq_len: 8704
disable_overlap_scheduler: true
autotuner_enabled: false
enable_autotuner: false
# Enable Speculative Decoding in the model engine
speculative_config:
decoding_type: Eagle
max_draft_len: 1
pytorch_weights_path: nvidia/Llama-4-Maverick-17B-128E-Eagle3
speculative_model_dir: nvidia/Llama-4-Maverick-17B-128E-Eagle3
eagle3_one_model: false
kv_cache_config:
......@@ -49,3 +49,6 @@ cuda_graph_config:
- 256
print_iter_log: true
cache_transceiver_config:
backend: default
......@@ -20,17 +20,20 @@ max_batch_size: 1
max_num_tokens: 8192
max_seq_len: 8192
print_iter_log: true
kv_cache_dtype: fp8
disable_overlap_scheduler: true
autotuner_enabled: false
enable_autotuner: false
# Enable Speculative Decoding in the model engine
speculative_config:
decoding_type: Eagle
max_draft_len: 1
pytorch_weights_path: nvidia/Llama-4-Maverick-17B-128E-Eagle3
speculative_model_dir: nvidia/Llama-4-Maverick-17B-128E-Eagle3
eagle3_one_model: false
kv_cache_config:
free_gpu_memory_fraction: 0.5
enable_block_reuse: false
dtype: fp8
cache_transceiver_config:
backend: default
......@@ -24,7 +24,7 @@ disable_overlap_scheduler: true # disable_overlap_scheduler is having acc issue
speculative_config:
decoding_type: Eagle
max_draft_len: 3
pytorch_weights_path: nvidia/Llama-4-Maverick-17B-128E-Eagle3
speculative_model_dir: nvidia/Llama-4-Maverick-17B-128E-Eagle3
eagle3_one_model: true
kv_cache_config:
......
......@@ -26,7 +26,7 @@ disable_overlap_scheduler: true
speculative_config:
decoding_type: Eagle
max_draft_len: 3
pytorch_weights_path: nvidia/Llama-4-Maverick-17B-128E-Eagle3
speculative_model_dir: nvidia/Llama-4-Maverick-17B-128E-Eagle3
eagle3_one_model: True
kv_cache_config:
......@@ -38,3 +38,6 @@ cuda_graph_config:
max_batch_size: 256
print_iter_log: true
cache_transceiver_config:
backend: default
......@@ -26,9 +26,12 @@ disable_overlap_scheduler: true
speculative_config:
decoding_type: Eagle
max_draft_len: 3
pytorch_weights_path: nvidia/Llama-4-Maverick-17B-128E-Eagle3
speculative_model_dir: nvidia/Llama-4-Maverick-17B-128E-Eagle3
eagle3_one_model: True
kv_cache_config:
free_gpu_memory_fraction: 0.5
enable_block_reuse: false
cache_transceiver_config:
backend: default
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment