Unverified commit d9f6d7a5, authored by Ryan McCormick and committed by GitHub
Browse files

fix: Flatten pytorch_backend_config section to address breaking change to trtllm config (#1326)

parent d3ca7661
...@@ -38,23 +38,23 @@ kv_cache_config: ...@@ -38,23 +38,23 @@ kv_cache_config:
# free_gpu_memory_fraction low to have enough available memory. # free_gpu_memory_fraction low to have enough available memory.
# free_gpu_memory_fraction: 0.30 # free_gpu_memory_fraction: 0.30
pytorch_backend_config: # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
# NOTE: overlap_scheduler enabled by default since this commit and changed # NOTE: overlap_scheduler enabled by default since this commit and changed
# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler': # config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428 # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
use_cuda_graph: true use_cuda_graph: true
cuda_graph_padding_enabled: true cuda_graph_padding_enabled: true
# NOTE: For larger max batch size, you may want to add larger cuda graph # NOTE: For larger max batch size, you may want to add larger cuda graph
# batch sizes below to match. # batch sizes below to match.
cuda_graph_batch_sizes: cuda_graph_batch_sizes:
- 1 - 1
- 2 - 2
- 4 - 4
- 8 - 8
- 16 - 16
- 32 - 32
- 64 - 64
- 128 - 128
- 256 - 256
print_iter_log: true print_iter_log: true
kv_cache_dtype: fp8 kv_cache_dtype: fp8
...@@ -37,13 +37,13 @@ context_servers: ...@@ -37,13 +37,13 @@ context_servers:
kv_cache_config: kv_cache_config:
free_gpu_memory_fraction: 0.75 free_gpu_memory_fraction: 0.75
pytorch_backend_config: # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
# NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions # NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
# Overlap scheduler not currently supported in context-only # Overlap scheduler not currently supported in context-only
disable_overlap_scheduler: true disable_overlap_scheduler: true
print_iter_log: true print_iter_log: true
# NOTE: This dtype must match in both context/generation configs # NOTE: This dtype must match in both context/generation configs
kv_cache_dtype: fp8 kv_cache_dtype: fp8
generation_servers: generation_servers:
# Generation/decode processes one token per request at a time, so a larger # Generation/decode processes one token per request at a time, so a larger
...@@ -66,23 +66,23 @@ generation_servers: ...@@ -66,23 +66,23 @@ generation_servers:
# free_gpu_memory_fraction low to have enough available memory. # free_gpu_memory_fraction low to have enough available memory.
# free_gpu_memory_fraction: 0.30 # free_gpu_memory_fraction: 0.30
pytorch_backend_config: # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
# NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions # NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
disable_overlap_scheduler: false disable_overlap_scheduler: false
use_cuda_graph: true use_cuda_graph: true
cuda_graph_padding_enabled: true cuda_graph_padding_enabled: true
# NOTE: For larger max batch size, you may want to add larger cuda graph # NOTE: For larger max batch size, you may want to add larger cuda graph
# batch sizes below to match. # batch sizes below to match.
cuda_graph_batch_sizes: cuda_graph_batch_sizes:
- 1 - 1
- 2 - 2
- 4 - 4
- 8 - 8
- 16 - 16
- 32 - 32
- 64 - 64
- 128 - 128
- 256 - 256
print_iter_log: true print_iter_log: true
# NOTE: This dtype must match in both context/generation configs # NOTE: This dtype must match in both context/generation configs
kv_cache_dtype: fp8 kv_cache_dtype: fp8
...@@ -33,8 +33,8 @@ enable_chunked_prefill: true ...@@ -33,8 +33,8 @@ enable_chunked_prefill: true
kv_cache_config: kv_cache_config:
free_gpu_memory_fraction: 0.95 free_gpu_memory_fraction: 0.95
pytorch_backend_config: # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
# NOTE: overlap_scheduler enabled by default since this commit and changed # NOTE: overlap_scheduler enabled by default since this commit and changed
# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler': # config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428 # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
use_cuda_graph: true use_cuda_graph: true
...@@ -35,9 +35,9 @@ kv_cache_config: ...@@ -35,9 +35,9 @@ kv_cache_config:
event_buffer_max_size: 1024 event_buffer_max_size: 1024
enable_block_reuse: true enable_block_reuse: true
pytorch_backend_config: # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
# NOTE: overlap_scheduler enabled by default since this commit and changed # NOTE: overlap_scheduler enabled by default since this commit and changed
# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler': # config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428 # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
use_cuda_graph: true use_cuda_graph: true
enable_iter_perf_stats: true enable_iter_perf_stats: true
...@@ -35,9 +35,9 @@ kv_cache_config: ...@@ -35,9 +35,9 @@ kv_cache_config:
event_buffer_max_size: 1024 event_buffer_max_size: 1024
enable_block_reuse: true enable_block_reuse: true
pytorch_backend_config: # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
# NOTE: overlap_scheduler enabled by default since this commit and changed # NOTE: overlap_scheduler enabled by default since this commit and changed
# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler': # config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428 # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
use_cuda_graph: true use_cuda_graph: true
enable_iter_perf_stats: true enable_iter_perf_stats: true
...@@ -33,11 +33,11 @@ context_servers: ...@@ -33,11 +33,11 @@ context_servers:
free_gpu_memory_fraction: 0.40 free_gpu_memory_fraction: 0.40
cache_transceiver_config: cache_transceiver_config:
max_num_tokens: 10240 max_num_tokens: 10240
pytorch_backend_config: # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
# NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions # NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
# Overlap scheduler not currently supported in context-only # Overlap scheduler not currently supported in context-only
disable_overlap_scheduler: true disable_overlap_scheduler: true
use_cuda_graph: false use_cuda_graph: false
urls: urls:
- "localhost:8001" - "localhost:8001"
...@@ -50,9 +50,9 @@ generation_servers: ...@@ -50,9 +50,9 @@ generation_servers:
free_gpu_memory_fraction: 0.40 free_gpu_memory_fraction: 0.40
cache_transceiver_config: cache_transceiver_config:
max_num_tokens: 256 max_num_tokens: 256
pytorch_backend_config: # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
# NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions # NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
disable_overlap_scheduler: false disable_overlap_scheduler: false
use_cuda_graph: false use_cuda_graph: false
urls: urls:
- "localhost:8002" - "localhost:8002"
...@@ -35,12 +35,12 @@ context_servers: ...@@ -35,12 +35,12 @@ context_servers:
enable_block_reuse: true enable_block_reuse: true
cache_transceiver_config: cache_transceiver_config:
max_num_tokens: 10240 max_num_tokens: 10240
pytorch_backend_config: # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
# NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions # NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
# Overlap scheduler not currently supported in context-only # Overlap scheduler not currently supported in context-only
disable_overlap_scheduler: true disable_overlap_scheduler: true
use_cuda_graph: false use_cuda_graph: false
enable_iter_perf_stats: true enable_iter_perf_stats: true
urls: urls:
- "localhost:8001" - "localhost:8001"
...@@ -55,10 +55,10 @@ generation_servers: ...@@ -55,10 +55,10 @@ generation_servers:
enable_block_reuse: true enable_block_reuse: true
cache_transceiver_config: cache_transceiver_config:
max_num_tokens: 256 max_num_tokens: 256
pytorch_backend_config: # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
# NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions # NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
disable_overlap_scheduler: false disable_overlap_scheduler: false
use_cuda_graph: false use_cuda_graph: false
enable_iter_perf_stats: true enable_iter_perf_stats: true
urls: urls:
- "localhost:8002" - "localhost:8002"
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment