Unverified Commit f10aab3b authored by KrishnanPrash, committed by GitHub
Browse files

fix: Migrating trtllm examples from `1.0.0rc0` to `1.0.0rc4` (#2217)

parent 97390ac0
...@@ -28,4 +28,7 @@ kv_cache_config: ...@@ -28,4 +28,7 @@ kv_cache_config:
# NOTE: overlap_scheduler enabled by default since this commit and changed # NOTE: overlap_scheduler enabled by default since this commit and changed
# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler': # config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428 # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
use_cuda_graph: true
cuda_graph_config:
max_batch_size: 16
\ No newline at end of file
...@@ -16,11 +16,16 @@ tensor_parallel_size: 1 ...@@ -16,11 +16,16 @@ tensor_parallel_size: 1
moe_expert_parallel_size: 1 moe_expert_parallel_size: 1
enable_attention_dp: false enable_attention_dp: false
max_num_tokens: 8192 max_num_tokens: 8192
max_batch_size: 16
trust_remote_code: true trust_remote_code: true
backend: pytorch backend: pytorch
enable_chunked_prefill: true enable_chunked_prefill: true
disable_overlap_scheduler: false disable_overlap_scheduler: false
use_cuda_graph: true
cuda_graph_config:
max_batch_size: 16
kv_cache_config: kv_cache_config:
free_gpu_memory_fraction: 0.95 free_gpu_memory_fraction: 0.95
cache_transceiver_config:
backend: default
\ No newline at end of file
...@@ -28,23 +28,24 @@ max_num_tokens: 8448 ...@@ -28,23 +28,24 @@ max_num_tokens: 8448
max_seq_len: 8448 max_seq_len: 8448
kv_cache_config: kv_cache_config:
free_gpu_memory_fraction: 0.30 free_gpu_memory_fraction: 0.30
dtype: fp8
# Enable the MTP(Multi-Token Prediction) in the model engine # Enable the MTP(Multi-Token Prediction) in the model engine
speculative_config: speculative_config:
decoding_type: MTP decoding_type: MTP
num_nextn_predict_layers: 1 num_nextn_predict_layers: 1
use_cuda_graph: true cuda_graph_config:
cuda_graph_padding_enabled: true enable_padding: true
cuda_graph_batch_sizes: batch_sizes:
- 1 - 1
- 2 - 2
- 4 - 4
- 8 - 8
- 16 - 16
- 32 - 32
- 64 - 64
- 128 - 128
- 256 - 256
print_iter_log: true print_iter_log: true
kv_cache_dtype: fp8
...@@ -31,23 +31,24 @@ max_num_tokens: 512 ...@@ -31,23 +31,24 @@ max_num_tokens: 512
max_seq_len: 8704 max_seq_len: 8704
kv_cache_config: kv_cache_config:
free_gpu_memory_fraction: 0.85 free_gpu_memory_fraction: 0.85
dtype: fp8
# Enable the MTP(Multi-Token Prediction) in decode model engine # Enable the MTP(Multi-Token Prediction) in decode model engine
speculative_config: speculative_config:
decoding_type: MTP decoding_type: MTP
num_nextn_predict_layers: 1 num_nextn_predict_layers: 1
use_cuda_graph: true cuda_graph_config:
cuda_graph_padding_enabled: true enable_padding: true
cuda_graph_batch_sizes: batch_sizes:
- 1 - 1
- 2 - 2
- 4 - 4
- 8 - 8
- 16 - 16
- 32 - 32
- 64 - 64
- 128 - 128
- 256 - 256
print_iter_log: true
kv_cache_dtype: fp8 print_iter_log: true
\ No newline at end of file
...@@ -27,8 +27,9 @@ max_num_tokens: 8192 ...@@ -27,8 +27,9 @@ max_num_tokens: 8192
max_seq_len: 8192 max_seq_len: 8192
kv_cache_config: kv_cache_config:
free_gpu_memory_fraction: 0.75 free_gpu_memory_fraction: 0.75
dtype: fp8
print_iter_log: true print_iter_log: true
kv_cache_dtype: fp8
disable_overlap_scheduler: true disable_overlap_scheduler: true
# Enable the MTP(Multi-Token Prediction) in the prefill model engine # Enable the MTP(Multi-Token Prediction) in the prefill model engine
......
...@@ -31,24 +31,26 @@ kv_cache_config: ...@@ -31,24 +31,26 @@ kv_cache_config:
# With dp attention enabled: large ISL at high concurrency may need # With dp attention enabled: large ISL at high concurrency may need
# free_gpu_memory_fraction low to have enough available memory. # free_gpu_memory_fraction low to have enough available memory.
# free_gpu_memory_fraction: 0.30 # free_gpu_memory_fraction: 0.30
dtype: fp8
# NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603 # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
# NOTE: overlap_scheduler enabled by default since this commit and changed # NOTE: overlap_scheduler enabled by default since this commit and changed
# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler': # config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428 # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
use_cuda_graph: true cuda_graph_config:
cuda_graph_padding_enabled: true enable_padding: true
# NOTE: For larger max batch size, you may want to add larger cuda graph # NOTE: For larger max batch size, you may want to add larger cuda graph
# batch sizes below to match. # batch sizes below to match.
cuda_graph_batch_sizes: batch_sizes:
- 1 - 1
- 2 - 2
- 4 - 4
- 8 - 8
- 16 - 16
- 32 - 32
- 64 - 64
- 128 - 128
- 256 - 256
print_iter_log: true print_iter_log: true
kv_cache_dtype: fp8
...@@ -31,25 +31,27 @@ kv_cache_config: ...@@ -31,25 +31,27 @@ kv_cache_config:
# With dp attention enabled: large ISL at high concurrency may need # With dp attention enabled: large ISL at high concurrency may need
# free_gpu_memory_fraction low to have enough available memory. # free_gpu_memory_fraction low to have enough available memory.
# free_gpu_memory_fraction: 0.30 # free_gpu_memory_fraction: 0.30
dtype: fp8
# NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603 # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
# NOTE: overlap_scheduler enabled by default since this commit and changed # NOTE: overlap_scheduler enabled by default since this commit and changed
# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler': # config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428 # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
disable_overlap_scheduler: false disable_overlap_scheduler: false
use_cuda_graph: true
cuda_graph_padding_enabled: true cuda_graph_config:
# NOTE: For larger max batch size, you may want to add larger cuda graph enable_padding: true
# batch sizes below to match. # NOTE: For larger max batch size, you may want to
cuda_graph_batch_sizes: # add larger cuda graph batch sizes below to match.
- 1 batch_sizes:
- 2 - 1
- 4 - 2
- 8 - 4
- 16 - 8
- 32 - 16
- 64 - 32
- 128 - 64
- 256 - 128
- 256
print_iter_log: true print_iter_log: true
kv_cache_dtype: fp8
...@@ -26,12 +26,11 @@ max_seq_len: 8192 ...@@ -26,12 +26,11 @@ max_seq_len: 8192
kv_cache_config: kv_cache_config:
free_gpu_memory_fraction: 0.75 free_gpu_memory_fraction: 0.75
dtype: fp8 # NOTE: This dtype must match in both prefill/decode configs
# NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603 # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
# NOTE: overlap_scheduler enabled by default since this commit and changed # NOTE: overlap_scheduler enabled by default since this commit and changed
# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler': # config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428 # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
disable_overlap_scheduler: true disable_overlap_scheduler: true
print_iter_log: true print_iter_log: true
# NOTE: This dtype must match in both prefill/decode configs \ No newline at end of file
kv_cache_dtype: fp8
...@@ -10,18 +10,20 @@ enable_attention_dp: true ...@@ -10,18 +10,20 @@ enable_attention_dp: true
max_batch_size: 256 max_batch_size: 256
max_num_tokens: 256 max_num_tokens: 256
max_seq_len: 8448 max_seq_len: 8448
kv_cache_config: kv_cache_config:
free_gpu_memory_fraction: 0.7 free_gpu_memory_fraction: 0.7
use_cuda_graph: true dtype: fp8
cuda_graph_padding_enabled: true
cuda_graph_batch_sizes: cuda_graph_config:
- 1 enable_padding: true
- 2 batch_sizes:
- 4 - 1
- 8 - 2
- 16 - 4
- 32 - 8
- 64 - 16
- 128 - 32
- 256 - 64
kv_cache_dtype: fp8 - 128
- 256
...@@ -3,14 +3,16 @@ ...@@ -3,14 +3,16 @@
backend: pytorch backend: pytorch
# WideEP related settings # WideEP related settings
moe_backend: WideEP moe_config:
# moe_max_num_tokens will default to max_num_tokens if left unspecified. backend: WIDEEP
# # moe_max_num_tokens will default to max_num_tokens if left unspecified.
# If you want to set this value explicitly, one recommendation is below: #
# moe_max_num_tokens = max_batch_size * moe_expert_parallel_size # If you want to set this value explicitly, one recommendation is below:
# 4096 = 256 * 16 # moe_max_num_tokens = max_batch_size * moe_expert_parallel_size
# moe_max_num_tokens: 4096 # 4096 = 256 * 16
moe_load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml # moe_max_num_tokens: 4096
load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml
tensor_parallel_size: 16 tensor_parallel_size: 16
moe_expert_parallel_size: 16 moe_expert_parallel_size: 16
...@@ -18,18 +20,20 @@ enable_attention_dp: true ...@@ -18,18 +20,20 @@ enable_attention_dp: true
max_batch_size: 256 max_batch_size: 256
max_num_tokens: 256 max_num_tokens: 256
max_seq_len: 8448 max_seq_len: 8448
kv_cache_config: kv_cache_config:
free_gpu_memory_fraction: 0.7 free_gpu_memory_fraction: 0.3
use_cuda_graph: true dtype: fp8
cuda_graph_padding_enabled: true
cuda_graph_batch_sizes: cuda_graph_config:
- 1 enable_padding: true
- 2 batch_sizes:
- 4 - 1
- 8 - 2
- 16 - 4
- 32 - 8
- 64 - 16
- 128 - 32
- 256 - 64
kv_cache_dtype: fp8 - 128
- 256
\ No newline at end of file
...@@ -15,8 +15,9 @@ ...@@ -15,8 +15,9 @@
backend: pytorch backend: pytorch
# WideEP related settings # WideEP related settings
moe_backend: WideEP moe_config:
moe_load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml backend: WIDEEP
load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml
# TP/EP/PP/DP # TP/EP/PP/DP
tensor_parallel_size: 16 tensor_parallel_size: 16
...@@ -35,25 +36,28 @@ kv_cache_config: ...@@ -35,25 +36,28 @@ kv_cache_config:
# With dp attention enabled: large ISL at high concurrency may need # With dp attention enabled: large ISL at high concurrency may need
# free_gpu_memory_fraction low to have enough available memory. # free_gpu_memory_fraction low to have enough available memory.
free_gpu_memory_fraction: 0.30 free_gpu_memory_fraction: 0.30
dtype: fp8
# NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603 # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
# NOTE: overlap_scheduler enabled by default since this commit and changed # NOTE: overlap_scheduler enabled by default since this commit and changed
# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler': # config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428 # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
disable_overlap_scheduler: false disable_overlap_scheduler: false
use_cuda_graph: true cuda_graph_config:
cuda_graph_padding_enabled: true enable_padding: true
# NOTE: For larger max batch size, you may want to add larger cuda graph # NOTE: For larger max batch size, you may want to
# batch sizes below to match. # add larger cuda graph batch sizes below to match.
cuda_graph_batch_sizes: batch_sizes:
- 1 - 1
- 2 - 2
- 4 - 4
- 8 - 8
- 16 - 16
- 32 - 32
- 64 - 64
- 128 - 128
- 256 - 256
print_iter_log: true print_iter_log: true
kv_cache_dtype: fp8
...@@ -15,8 +15,9 @@ ...@@ -15,8 +15,9 @@
backend: pytorch backend: pytorch
# WideEP related settings # WideEP related settings
moe_backend: WideEP moe_config:
moe_load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml backend: WIDEEP
load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml
# TP/EP/PP/DP # TP/EP/PP/DP
tensor_parallel_size: 16 tensor_parallel_size: 16
...@@ -29,13 +30,12 @@ max_num_tokens: 8192 ...@@ -29,13 +30,12 @@ max_num_tokens: 8192
max_seq_len: 8192 max_seq_len: 8192
kv_cache_config: kv_cache_config:
free_gpu_memory_fraction: 0.75 free_gpu_memory_fraction: 0.3
dtype: fp8 # NOTE: This dtype must match in both prefill/decode configs
# NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603 # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
# NOTE: overlap_scheduler enabled by default since this commit and changed # NOTE: overlap_scheduler enabled by default since this commit and changed
# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler': # config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428 # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
disable_overlap_scheduler: true disable_overlap_scheduler: true
print_iter_log: true print_iter_log: true
# NOTE: This dtype must match in both prefill/decode configs \ No newline at end of file
kv_cache_dtype: fp8
...@@ -21,31 +21,21 @@ max_batch_size: 256 ...@@ -21,31 +21,21 @@ max_batch_size: 256
# Will be investigated in the future with TRTLLM team. # Will be investigated in the future with TRTLLM team.
max_num_tokens: 1024 max_num_tokens: 1024
max_seq_len: 8448 max_seq_len: 8448
autotuner_enabled: false enable_autotuner: false
disable_overlap_scheduler: true disable_overlap_scheduler: true
# Enable Speculative Decoding in the model engine # Enable Speculative Decoding in the model engine
speculative_config: speculative_config:
decoding_type: Eagle decoding_type: Eagle
max_draft_len: 1 max_draft_len: 1
pytorch_weights_path: nvidia/Llama-4-Maverick-17B-128E-Eagle3 speculative_model_dir: nvidia/Llama-4-Maverick-17B-128E-Eagle3
eagle3_one_model: False eagle3_one_model: false
kv_cache_config: kv_cache_config:
free_gpu_memory_fraction: 0.5 free_gpu_memory_fraction: 0.5
enable_block_reuse: false enable_block_reuse: false
use_cuda_graph: true
cuda_graph_padding_enabled: true cuda_graph_config:
cuda_graph_batch_sizes: max_batch_size: 8
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
print_iter_log: true
kv_cache_dtype: fp8
...@@ -28,23 +28,24 @@ speculative_config: ...@@ -28,23 +28,24 @@ speculative_config:
decoding_type: Eagle decoding_type: Eagle
max_draft_len: 1 max_draft_len: 1
pytorch_weights_path: nvidia/Llama-4-Maverick-17B-128E-Eagle3 pytorch_weights_path: nvidia/Llama-4-Maverick-17B-128E-Eagle3
eagle3_one_model: False eagle3_one_model: false
kv_cache_config: kv_cache_config:
free_gpu_memory_fraction: 0.5 free_gpu_memory_fraction: 0.5
enable_block_reuse: false enable_block_reuse: false
dtype: fp8
cuda_graph_config:
enable_padding: true
batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
use_cuda_graph: true
cuda_graph_padding_enabled: true
cuda_graph_batch_sizes:
- 1
- 2
- 4
- 8
- 16
- 32
- 64
- 128
- 256
print_iter_log: true print_iter_log: true
kv_cache_dtype: fp8
...@@ -29,7 +29,7 @@ speculative_config: ...@@ -29,7 +29,7 @@ speculative_config:
decoding_type: Eagle decoding_type: Eagle
max_draft_len: 1 max_draft_len: 1
pytorch_weights_path: nvidia/Llama-4-Maverick-17B-128E-Eagle3 pytorch_weights_path: nvidia/Llama-4-Maverick-17B-128E-Eagle3
eagle3_one_model: False eagle3_one_model: false
kv_cache_config: kv_cache_config:
free_gpu_memory_fraction: 0.5 free_gpu_memory_fraction: 0.5
......
...@@ -16,13 +16,15 @@ tensor_parallel_size: 1 ...@@ -16,13 +16,15 @@ tensor_parallel_size: 1
moe_expert_parallel_size: 1 moe_expert_parallel_size: 1
enable_attention_dp: false enable_attention_dp: false
max_num_tokens: 8192 max_num_tokens: 8192
max_batch_size: 16
trust_remote_code: true trust_remote_code: true
backend: pytorch backend: pytorch
enable_chunked_prefill: true enable_chunked_prefill: true
# Overlap scheduler not currently supported in prefill only workers. # Overlap scheduler not currently supported in prefill only workers.
disable_overlap_scheduler: true disable_overlap_scheduler: true
use_cuda_graph: false cuda_graph_config:
max_batch_size: 16
kv_cache_config: kv_cache_config:
free_gpu_memory_fraction: 0.95 free_gpu_memory_fraction: 0.95
cache_transceiver_config:
backend: default
\ No newline at end of file
...@@ -101,8 +101,10 @@ async def init(runtime: DistributedRuntime, config: Config): ...@@ -101,8 +101,10 @@ async def init(runtime: DistributedRuntime, config: Config):
kv_cache_config["event_buffer_max_size"] = DEFAULT_KV_EVENT_BUFFER_MAX_SIZE kv_cache_config["event_buffer_max_size"] = DEFAULT_KV_EVENT_BUFFER_MAX_SIZE
else: else:
kv_cache_config = arg_map["kv_cache_config"] kv_cache_config = arg_map["kv_cache_config"]
if not kv_cache_config.event_buffer_max_size: if "event_buffer_max_size" not in kv_cache_config:
kv_cache_config.event_buffer_max_size = DEFAULT_KV_EVENT_BUFFER_MAX_SIZE kv_cache_config[
"event_buffer_max_size"
] = DEFAULT_KV_EVENT_BUFFER_MAX_SIZE
arg_map["kv_cache_config"] = kv_cache_config arg_map["kv_cache_config"] = kv_cache_config
# Only pytorch backend is supported for now to publish events and metrics. # Only pytorch backend is supported for now to publish events and metrics.
......
...@@ -96,7 +96,7 @@ TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL="0" ...@@ -96,7 +96,7 @@ TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL="0"
TENSORRTLLM_INDEX_URL="https://pypi.python.org/simple" TENSORRTLLM_INDEX_URL="https://pypi.python.org/simple"
# TODO: Remove the version specification from here and use the ai-dynamo[trtllm] package. # TODO: Remove the version specification from here and use the ai-dynamo[trtllm] package.
# Need to update the Dockerfile.tensorrt_llm to use the ai-dynamo[trtllm] package. # Need to update the Dockerfile.tensorrt_llm to use the ai-dynamo[trtllm] package.
DEFAULT_TENSORRTLLM_PIP_WHEEL="tensorrt-llm==1.0.0rc0" DEFAULT_TENSORRTLLM_PIP_WHEEL="tensorrt-llm==1.0.0rc4"
TENSORRTLLM_PIP_WHEEL="" TENSORRTLLM_PIP_WHEEL=""
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment