fix: Update breaking change to enable_overlap_scheduler field from TRTLLM commit b4e5df0e (#1310)

859944f4 · Ryan McCormick · GitHub · f7890bf0 · 859944f4 · 859944f4
Unverified Commit 859944f4 authored May 31, 2025 by Ryan McCormick Committed by GitHub May 30, 2025
8 changed files
--- a/container/build.sh
+++ b/container/build.sh
@@ -88,7 +88,7 @@ TENSORRTLLM_PIP_WHEEL_DIR="/tmp/trtllm_wheel/"
 # TensorRT-LLM commit to use for building the trtllm wheel if not provided.
 # Important Note: This commit is not used in our CI pipeline. See the CI
 # variables to learn how to run a pipeline with a specific commit.
-TRTLLM_COMMIT=290649b6aaed5f233b0a0adf50edc1347f8d2b14
+TRTLLM_COMMIT="8cb6163a57226e69d8a85788eff542a440ed9c89"

 # TensorRT-LLM PyPI index URL
 TENSORRTLLM_INDEX_URL="https://pypi.python.org/simple"

--- a/examples/tensorrt_llm/configs/deepseek_r1/agg_llm_api_config.yaml
+++ b/examples/tensorrt_llm/configs/deepseek_r1/agg_llm_api_config.yaml
@@ -39,6 +39,9 @@ kv_cache_config:
  # free_gpu_memory_fraction: 0.30

 pytorch_backend_config:
+  # NOTE: The overlap scheduler is enabled by default as of the TRTLLM commit
+  # below, which replaced 'enable_overlap_scheduler' with 'disable_overlap_scheduler':
+  # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
  use_cuda_graph: true
  cuda_graph_padding_enabled: true
  # NOTE: For larger max batch size, you may want to add larger cuda graph
@@ -54,5 +57,4 @@ pytorch_backend_config:
  - 128
  - 256
  print_iter_log: true
-  enable_overlap_scheduler: true
  kv_cache_dtype: fp8
--- a/examples/tensorrt_llm/configs/deepseek_r1/disagg_llm_api_config.yaml
+++ b/examples/tensorrt_llm/configs/deepseek_r1/disagg_llm_api_config.yaml
@@ -34,8 +34,13 @@ context_servers:
  pipeline_parallel_size: 1
  enable_attention_dp: true

+  kv_cache_config:
    free_gpu_memory_fraction: 0.75
+
  pytorch_backend_config:
+    # NOTE: Older TRTLLM versions call this field 'enable_overlap_scheduler' (inverted meaning).
+    # The overlap scheduler is not currently supported on context-only servers.
+    disable_overlap_scheduler: true
    print_iter_log: true
    # NOTE: This dtype must match in both context/generation configs
    kv_cache_dtype: fp8
@@ -54,6 +59,7 @@ generation_servers:
  pipeline_parallel_size: 1
  enable_attention_dp: false

+  kv_cache_config:
    # With dp attention disabled: high free_gpu_memory_fraction is fine.
    free_gpu_memory_fraction: 0.85
    # With dp attention enabled: large ISL at high concurrency may need
@@ -61,6 +67,8 @@ generation_servers:
    # free_gpu_memory_fraction: 0.30

  pytorch_backend_config:
+    # NOTE: Older TRTLLM versions call this field 'enable_overlap_scheduler' (inverted meaning).
+    disable_overlap_scheduler: false
    use_cuda_graph: true
    cuda_graph_padding_enabled: true
    # NOTE: For larger max batch size, you may want to add larger cuda graph
@@ -76,6 +84,5 @@ generation_servers:
    - 128
    - 256
    print_iter_log: true
-    enable_overlap_scheduler: true
    # NOTE: This dtype must match in both context/generation configs
    kv_cache_dtype: fp8
--- a/examples/tensorrt_llm/configs/llm_api_config.yaml
+++ b/examples/tensorrt_llm/configs/llm_api_config.yaml
@@ -34,5 +34,7 @@ kv_cache_config:
  free_gpu_memory_fraction: 0.95

 pytorch_backend_config:
-  enable_overlap_scheduler: true
+  # NOTE: The overlap scheduler is enabled by default as of the TRTLLM commit
+  # below, which replaced 'enable_overlap_scheduler' with 'disable_overlap_scheduler':
+  # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
  use_cuda_graph: true
--- a/examples/tensorrt_llm/configs/llm_api_config_disagg_router.yaml
+++ b/examples/tensorrt_llm/configs/llm_api_config_disagg_router.yaml
@@ -36,6 +36,8 @@ kv_cache_config:
  enable_block_reuse: true

 pytorch_backend_config:
-  enable_overlap_scheduler: false
-  use_cuda_graph: false
+  # NOTE: The overlap scheduler is enabled by default as of the TRTLLM commit
+  # below, which replaced 'enable_overlap_scheduler' with 'disable_overlap_scheduler':
+  # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
+  use_cuda_graph: true
  enable_iter_perf_stats: true
--- a/examples/tensorrt_llm/configs/llm_api_config_router.yaml
+++ b/examples/tensorrt_llm/configs/llm_api_config_router.yaml
@@ -36,6 +36,8 @@ kv_cache_config:
  enable_block_reuse: true

 pytorch_backend_config:
-  enable_overlap_scheduler: true
+  # NOTE: The overlap scheduler is enabled by default as of the TRTLLM commit
+  # below, which replaced 'enable_overlap_scheduler' with 'disable_overlap_scheduler':
+  # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
  use_cuda_graph: true
  enable_iter_perf_stats: true
--- a/examples/tensorrt_llm/configs/llmapi_disagg_configs/single_node_config.yaml
+++ b/examples/tensorrt_llm/configs/llmapi_disagg_configs/single_node_config.yaml
@@ -34,7 +34,9 @@ context_servers:
  cache_transceiver_config:
    max_num_tokens: 10240
  pytorch_backend_config:
-    enable_overlap_scheduler: false
+    # NOTE: Older TRTLLM versions call this field 'enable_overlap_scheduler' (inverted meaning).
+    # The overlap scheduler is not currently supported on context-only servers.
+    disable_overlap_scheduler: true
    use_cuda_graph: false
  urls:
      - "localhost:8001"
@@ -49,7 +51,8 @@ generation_servers:
  cache_transceiver_config:
    max_num_tokens: 256
  pytorch_backend_config:
-    enable_overlap_scheduler: true
+    # NOTE: Older TRTLLM versions call this field 'enable_overlap_scheduler' (inverted meaning).
+    disable_overlap_scheduler: false
    use_cuda_graph: false
  urls:
      - "localhost:8002"
--- a/examples/tensorrt_llm/configs/llmapi_disagg_router_configs/single_node_config.yaml
+++ b/examples/tensorrt_llm/configs/llmapi_disagg_router_configs/single_node_config.yaml
@@ -36,7 +36,9 @@ context_servers:
  cache_transceiver_config:
    max_num_tokens: 10240
  pytorch_backend_config:
-    enable_overlap_scheduler: false
+    # NOTE: Older TRTLLM versions call this field 'enable_overlap_scheduler' (inverted meaning).
+    # The overlap scheduler is not currently supported on context-only servers.
+    disable_overlap_scheduler: true
    use_cuda_graph: false
    enable_iter_perf_stats: true
  urls:
@@ -54,7 +56,8 @@ generation_servers:
  cache_transceiver_config:
    max_num_tokens: 256
  pytorch_backend_config:
-    enable_overlap_scheduler: true
+    # NOTE: Older TRTLLM versions call this field 'enable_overlap_scheduler' (inverted meaning).
+    disable_overlap_scheduler: false
    use_cuda_graph: false
    enable_iter_perf_stats: true
  urls: