Unverified commit 859944f4, authored by Ryan McCormick, committed by GitHub
Browse files

fix: Update breaking change to enable_overlap_scheduler field from TRTLLM commit b4e5df0e (#1310)

parent f7890bf0
...@@ -88,7 +88,7 @@ TENSORRTLLM_PIP_WHEEL_DIR="/tmp/trtllm_wheel/" ...@@ -88,7 +88,7 @@ TENSORRTLLM_PIP_WHEEL_DIR="/tmp/trtllm_wheel/"
# TensorRT-LLM commit to use for building the trtllm wheel if not provided. # TensorRT-LLM commit to use for building the trtllm wheel if not provided.
# Important Note: This commit is not used in our CI pipeline. See the CI # Important Note: This commit is not used in our CI pipeline. See the CI
# variables to learn how to run a pipeline with a specific commit. # variables to learn how to run a pipeline with a specific commit.
TRTLLM_COMMIT=290649b6aaed5f233b0a0adf50edc1347f8d2b14 TRTLLM_COMMIT="8cb6163a57226e69d8a85788eff542a440ed9c89"
# TensorRT-LLM PyPI index URL # TensorRT-LLM PyPI index URL
TENSORRTLLM_INDEX_URL="https://pypi.python.org/simple" TENSORRTLLM_INDEX_URL="https://pypi.python.org/simple"
......
...@@ -39,6 +39,9 @@ kv_cache_config: ...@@ -39,6 +39,9 @@ kv_cache_config:
# free_gpu_memory_fraction: 0.30 # free_gpu_memory_fraction: 0.30
pytorch_backend_config: pytorch_backend_config:
# NOTE: As of the TRTLLM commit linked below, the overlap scheduler is enabled
# by default, and the config field was renamed from 'enable_overlap_scheduler'
# to 'disable_overlap_scheduler':
# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
use_cuda_graph: true use_cuda_graph: true
cuda_graph_padding_enabled: true cuda_graph_padding_enabled: true
# NOTE: For larger max batch size, you may want to add larger cuda graph # NOTE: For larger max batch size, you may want to add larger cuda graph
...@@ -54,5 +57,4 @@ pytorch_backend_config: ...@@ -54,5 +57,4 @@ pytorch_backend_config:
- 128 - 128
- 256 - 256
print_iter_log: true print_iter_log: true
enable_overlap_scheduler: true
kv_cache_dtype: fp8 kv_cache_dtype: fp8
...@@ -34,8 +34,13 @@ context_servers: ...@@ -34,8 +34,13 @@ context_servers:
pipeline_parallel_size: 1 pipeline_parallel_size: 1
enable_attention_dp: true enable_attention_dp: true
kv_cache_config:
free_gpu_memory_fraction: 0.75 free_gpu_memory_fraction: 0.75
pytorch_backend_config: pytorch_backend_config:
# NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions.
# The overlap scheduler is not currently supported for context-only servers.
disable_overlap_scheduler: true
print_iter_log: true print_iter_log: true
# NOTE: This dtype must match in both context/generation configs # NOTE: This dtype must match in both context/generation configs
kv_cache_dtype: fp8 kv_cache_dtype: fp8
...@@ -54,6 +59,7 @@ generation_servers: ...@@ -54,6 +59,7 @@ generation_servers:
pipeline_parallel_size: 1 pipeline_parallel_size: 1
enable_attention_dp: false enable_attention_dp: false
kv_cache_config:
# With dp attention disabled: high free_gpu_memory_fraction is fine. # With dp attention disabled: high free_gpu_memory_fraction is fine.
free_gpu_memory_fraction: 0.85 free_gpu_memory_fraction: 0.85
# With dp attention enabled: large ISL at high concurrency may need # With dp attention enabled: large ISL at high concurrency may need
...@@ -61,6 +67,8 @@ generation_servers: ...@@ -61,6 +67,8 @@ generation_servers:
# free_gpu_memory_fraction: 0.30 # free_gpu_memory_fraction: 0.30
pytorch_backend_config: pytorch_backend_config:
# NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
disable_overlap_scheduler: false
use_cuda_graph: true use_cuda_graph: true
cuda_graph_padding_enabled: true cuda_graph_padding_enabled: true
# NOTE: For larger max batch size, you may want to add larger cuda graph # NOTE: For larger max batch size, you may want to add larger cuda graph
...@@ -76,6 +84,5 @@ generation_servers: ...@@ -76,6 +84,5 @@ generation_servers:
- 128 - 128
- 256 - 256
print_iter_log: true print_iter_log: true
enable_overlap_scheduler: true
# NOTE: This dtype must match in both context/generation configs # NOTE: This dtype must match in both context/generation configs
kv_cache_dtype: fp8 kv_cache_dtype: fp8
...@@ -34,5 +34,7 @@ kv_cache_config: ...@@ -34,5 +34,7 @@ kv_cache_config:
free_gpu_memory_fraction: 0.95 free_gpu_memory_fraction: 0.95
pytorch_backend_config: pytorch_backend_config:
enable_overlap_scheduler: true # NOTE: overlap_scheduler enabled by default since this commit and changed
# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
use_cuda_graph: true use_cuda_graph: true
...@@ -36,6 +36,8 @@ kv_cache_config: ...@@ -36,6 +36,8 @@ kv_cache_config:
enable_block_reuse: true enable_block_reuse: true
pytorch_backend_config: pytorch_backend_config:
enable_overlap_scheduler: false # NOTE: overlap_scheduler enabled by default since this commit and changed
use_cuda_graph: false # config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
use_cuda_graph: true
enable_iter_perf_stats: true enable_iter_perf_stats: true
...@@ -36,6 +36,8 @@ kv_cache_config: ...@@ -36,6 +36,8 @@ kv_cache_config:
enable_block_reuse: true enable_block_reuse: true
pytorch_backend_config: pytorch_backend_config:
enable_overlap_scheduler: true # NOTE: overlap_scheduler enabled by default since this commit and changed
# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
use_cuda_graph: true use_cuda_graph: true
enable_iter_perf_stats: true enable_iter_perf_stats: true
...@@ -34,7 +34,9 @@ context_servers: ...@@ -34,7 +34,9 @@ context_servers:
cache_transceiver_config: cache_transceiver_config:
max_num_tokens: 10240 max_num_tokens: 10240
pytorch_backend_config: pytorch_backend_config:
enable_overlap_scheduler: false # NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
# Overlap scheduler not currently supported in context-only
disable_overlap_scheduler: true
use_cuda_graph: false use_cuda_graph: false
urls: urls:
- "localhost:8001" - "localhost:8001"
...@@ -49,7 +51,8 @@ generation_servers: ...@@ -49,7 +51,8 @@ generation_servers:
cache_transceiver_config: cache_transceiver_config:
max_num_tokens: 256 max_num_tokens: 256
pytorch_backend_config: pytorch_backend_config:
enable_overlap_scheduler: true # NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
disable_overlap_scheduler: false
use_cuda_graph: false use_cuda_graph: false
urls: urls:
- "localhost:8002" - "localhost:8002"
...@@ -36,7 +36,9 @@ context_servers: ...@@ -36,7 +36,9 @@ context_servers:
cache_transceiver_config: cache_transceiver_config:
max_num_tokens: 10240 max_num_tokens: 10240
pytorch_backend_config: pytorch_backend_config:
enable_overlap_scheduler: false # NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
# Overlap scheduler not currently supported in context-only
disable_overlap_scheduler: true
use_cuda_graph: false use_cuda_graph: false
enable_iter_perf_stats: true enable_iter_perf_stats: true
urls: urls:
...@@ -54,7 +56,8 @@ generation_servers: ...@@ -54,7 +56,8 @@ generation_servers:
cache_transceiver_config: cache_transceiver_config:
max_num_tokens: 256 max_num_tokens: 256
pytorch_backend_config: pytorch_backend_config:
enable_overlap_scheduler: true # NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
disable_overlap_scheduler: false
use_cuda_graph: false use_cuda_graph: false
enable_iter_perf_stats: true enable_iter_perf_stats: true
urls: urls:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment