Unverified Commit f10aab3b authored by KrishnanPrash's avatar KrishnanPrash Committed by GitHub
Browse files

fix: Migrating trtllm examples from `1.0.0rc0` to `1.0.4rc4` (#2217)

parent 97390ac0
......@@ -28,4 +28,7 @@ kv_cache_config:
# NOTE: overlap_scheduler enabled by default since this commit and changed
# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
use_cuda_graph: true
cuda_graph_config:
max_batch_size: 16
\ No newline at end of file
......@@ -16,11 +16,16 @@ tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: false
max_num_tokens: 8192
max_batch_size: 16
trust_remote_code: true
backend: pytorch
enable_chunked_prefill: true
disable_overlap_scheduler: false
use_cuda_graph: true
cuda_graph_config:
max_batch_size: 16
kv_cache_config:
free_gpu_memory_fraction: 0.95
cache_transceiver_config:
backend: default
\ No newline at end of file
......@@ -28,23 +28,24 @@ max_num_tokens: 8448
max_seq_len: 8448
kv_cache_config:
free_gpu_memory_fraction: 0.30
dtype: fp8
# Enable the MTP(Multi-Token Prediction) in the model engine
speculative_config:
decoding_type: MTP
num_nextn_predict_layers: 1
use_cuda_graph: true
cuda_graph_padding_enabled: true
cuda_graph_batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
cuda_graph_config:
enable_padding: true
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
print_iter_log: true
kv_cache_dtype: fp8
......@@ -31,23 +31,24 @@ max_num_tokens: 512
max_seq_len: 8704
kv_cache_config:
free_gpu_memory_fraction: 0.85
dtype: fp8
# Enable the MTP(Multi-Token Prediction) in decode model engine
speculative_config:
decoding_type: MTP
num_nextn_predict_layers: 1
use_cuda_graph: true
cuda_graph_padding_enabled: true
cuda_graph_batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
print_iter_log: true
kv_cache_dtype: fp8
cuda_graph_config:
enable_padding: true
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
print_iter_log: true
\ No newline at end of file
......@@ -27,8 +27,9 @@ max_num_tokens: 8192
max_seq_len: 8192
kv_cache_config:
free_gpu_memory_fraction: 0.75
dtype: fp8
print_iter_log: true
kv_cache_dtype: fp8
disable_overlap_scheduler: true
# Enable the MTP(Multi-Token Prediction) in the prefill model engine
......
......@@ -31,24 +31,26 @@ kv_cache_config:
# With dp attention enabled: large ISL at high concurrency may need
# free_gpu_memory_fraction low to have enough available memory.
# free_gpu_memory_fraction: 0.30
dtype: fp8
# NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
# NOTE: overlap_scheduler enabled by default since this commit and changed
# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
use_cuda_graph: true
cuda_graph_padding_enabled: true
cuda_graph_config:
enable_padding: true
# NOTE: For larger max batch size, you may want to add larger cuda graph
# batch sizes below to match.
cuda_graph_batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
print_iter_log: true
kv_cache_dtype: fp8
......@@ -31,25 +31,27 @@ kv_cache_config:
# With dp attention enabled: large ISL at high concurrency may need
# free_gpu_memory_fraction low to have enough available memory.
# free_gpu_memory_fraction: 0.30
dtype: fp8
# NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
# NOTE: overlap_scheduler enabled by default since this commit and changed
# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
disable_overlap_scheduler: false
use_cuda_graph: true
cuda_graph_padding_enabled: true
# NOTE: For larger max batch size, you may want to add larger cuda graph
# batch sizes below to match.
cuda_graph_batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
cuda_graph_config:
enable_padding: true
# NOTE: For larger max batch size, you may want to
# add larger cuda graph batch sizes below to match.
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
print_iter_log: true
kv_cache_dtype: fp8
......@@ -26,12 +26,11 @@ max_seq_len: 8192
kv_cache_config:
free_gpu_memory_fraction: 0.75
dtype: fp8 # NOTE: This dtype must match in both prefill/decode configs
# NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
# NOTE: overlap_scheduler enabled by default since this commit and changed
# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
disable_overlap_scheduler: true
print_iter_log: true
# NOTE: This dtype must match in both prefill/decode configs
kv_cache_dtype: fp8
print_iter_log: true
\ No newline at end of file
......@@ -10,18 +10,20 @@ enable_attention_dp: true
max_batch_size: 256
max_num_tokens: 256
max_seq_len: 8448
kv_cache_config:
free_gpu_memory_fraction: 0.7
use_cuda_graph: true
cuda_graph_padding_enabled: true
cuda_graph_batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
kv_cache_dtype: fp8
dtype: fp8
cuda_graph_config:
enable_padding: true
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
......@@ -3,14 +3,16 @@
backend: pytorch
# WideEP related settings
moe_backend: WideEP
# moe_max_num_tokens will default to max_num_tokens if left unspecified.
#
# If you want to set this value explicitly, one recommendation is below:
# moe_max_num_tokens = max_batch_size * moe_expert_parallel_size
# 4096 = 256 * 16
# moe_max_num_tokens: 4096
moe_load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml
moe_config:
backend: WIDEEP
# moe_max_num_tokens will default to max_num_tokens if left unspecified.
#
# If you want to set this value explicitly, one recommendation is below:
# moe_max_num_tokens = max_batch_size * moe_expert_parallel_size
# 4096 = 256 * 16
# moe_max_num_tokens: 4096
load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml
tensor_parallel_size: 16
moe_expert_parallel_size: 16
......@@ -18,18 +20,20 @@ enable_attention_dp: true
max_batch_size: 256
max_num_tokens: 256
max_seq_len: 8448
kv_cache_config:
free_gpu_memory_fraction: 0.7
use_cuda_graph: true
cuda_graph_padding_enabled: true
cuda_graph_batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
kv_cache_dtype: fp8
free_gpu_memory_fraction: 0.3
dtype: fp8
cuda_graph_config:
enable_padding: true
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
\ No newline at end of file
......@@ -15,8 +15,9 @@
backend: pytorch
# WideEP related settings
moe_backend: WideEP
moe_load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml
moe_config:
backend: WIDEEP
load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml
# TP/EP/PP/DP
tensor_parallel_size: 16
......@@ -35,25 +36,28 @@ kv_cache_config:
# With dp attention enabled: large ISL at high concurrency may need
# free_gpu_memory_fraction low to have enough available memory.
free_gpu_memory_fraction: 0.30
dtype: fp8
# NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
# NOTE: overlap_scheduler enabled by default since this commit and changed
# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
disable_overlap_scheduler: false
use_cuda_graph: true
cuda_graph_padding_enabled: true
# NOTE: For larger max batch size, you may want to add larger cuda graph
# batch sizes below to match.
cuda_graph_batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
cuda_graph_config:
enable_padding: true
# NOTE: For larger max batch size, you may want to
# add larger cuda graph batch sizes below to match.
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
print_iter_log: true
kv_cache_dtype: fp8
......@@ -15,8 +15,9 @@
backend: pytorch
# WideEP related settings
moe_backend: WideEP
moe_load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml
moe_config:
backend: WIDEEP
load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml
# TP/EP/PP/DP
tensor_parallel_size: 16
......@@ -29,13 +30,12 @@ max_num_tokens: 8192
max_seq_len: 8192
kv_cache_config:
free_gpu_memory_fraction: 0.75
free_gpu_memory_fraction: 0.3
dtype: fp8 # NOTE: This dtype must match in both prefill/decode configs
# NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
# NOTE: overlap_scheduler enabled by default since this commit and changed
# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
disable_overlap_scheduler: true
print_iter_log: true
# NOTE: This dtype must match in both prefill/decode configs
kv_cache_dtype: fp8
print_iter_log: true
\ No newline at end of file
......@@ -21,31 +21,21 @@ max_batch_size: 256
# Will be investigated in the future with TRTLLM team.
max_num_tokens: 1024
max_seq_len: 8448
autotuner_enabled: false
enable_autotuner: false
disable_overlap_scheduler: true
# Enable Speculative Decoding in the model engine
speculative_config:
decoding_type: Eagle
max_draft_len: 1
pytorch_weights_path: nvidia/Llama-4-Maverick-17B-128E-Eagle3
eagle3_one_model: False
speculative_model_dir: nvidia/Llama-4-Maverick-17B-128E-Eagle3
eagle3_one_model: false
kv_cache_config:
free_gpu_memory_fraction: 0.5
enable_block_reuse: false
use_cuda_graph: true
cuda_graph_padding_enabled: true
cuda_graph_batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
print_iter_log: true
kv_cache_dtype: fp8
cuda_graph_config:
max_batch_size: 8
......@@ -28,23 +28,24 @@ speculative_config:
decoding_type: Eagle
max_draft_len: 1
pytorch_weights_path: nvidia/Llama-4-Maverick-17B-128E-Eagle3
eagle3_one_model: False
eagle3_one_model: false
kv_cache_config:
free_gpu_memory_fraction: 0.5
enable_block_reuse: false
dtype: fp8
cuda_graph_config:
enable_padding: true
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
use_cuda_graph: true
cuda_graph_padding_enabled: true
cuda_graph_batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
print_iter_log: true
kv_cache_dtype: fp8
......@@ -29,7 +29,7 @@ speculative_config:
decoding_type: Eagle
max_draft_len: 1
pytorch_weights_path: nvidia/Llama-4-Maverick-17B-128E-Eagle3
eagle3_one_model: False
eagle3_one_model: false
kv_cache_config:
free_gpu_memory_fraction: 0.5
......
......@@ -16,13 +16,15 @@ tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: false
max_num_tokens: 8192
max_batch_size: 16
trust_remote_code: true
backend: pytorch
enable_chunked_prefill: true
# Overlap scheduler not currently supported in prefill only workers.
disable_overlap_scheduler: true
use_cuda_graph: false
cuda_graph_config:
max_batch_size: 16
kv_cache_config:
free_gpu_memory_fraction: 0.95
cache_transceiver_config:
backend: default
\ No newline at end of file
......@@ -101,8 +101,10 @@ async def init(runtime: DistributedRuntime, config: Config):
kv_cache_config["event_buffer_max_size"] = DEFAULT_KV_EVENT_BUFFER_MAX_SIZE
else:
kv_cache_config = arg_map["kv_cache_config"]
if not kv_cache_config.event_buffer_max_size:
kv_cache_config.event_buffer_max_size = DEFAULT_KV_EVENT_BUFFER_MAX_SIZE
if "event_buffer_max_size" not in kv_cache_config:
kv_cache_config[
"event_buffer_max_size"
] = DEFAULT_KV_EVENT_BUFFER_MAX_SIZE
arg_map["kv_cache_config"] = kv_cache_config
# Only pytorch backend is supported for now to publish events and metrics.
......
......@@ -96,7 +96,7 @@ TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL="0"
TENSORRTLLM_INDEX_URL="https://pypi.python.org/simple"
# TODO: Remove the version specification from here and use the ai-dynamo[trtllm] package.
# Need to update the Dockerfile.tensorrt_llm to use the ai-dynamo[trtllm] package.
DEFAULT_TENSORRTLLM_PIP_WHEEL="tensorrt-llm==1.0.0rc0"
DEFAULT_TENSORRTLLM_PIP_WHEEL="tensorrt-llm==1.0.0rc4"
TENSORRTLLM_PIP_WHEEL=""
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment