fix: Flatten pytorch_backend_config section to address breaking change to trtllm config (#1326)

d9f6d7a5 · Ryan McCormick · GitHub · d3ca7661 · d9f6d7a5 · d9f6d7a5
Unverified Commit d9f6d7a5 authored Jun 03, 2025 by Ryan McCormick Committed by GitHub Jun 02, 2025
7 changed files
--- a/examples/tensorrt_llm/configs/deepseek_r1/agg_llm_api_config.yaml
+++ b/examples/tensorrt_llm/configs/deepseek_r1/agg_llm_api_config.yaml
@@ -38,23 +38,23 @@ kv_cache_config:
  # free_gpu_memory_fraction low to have enough available memory.
  # free_gpu_memory_fraction: 0.30
-pytorch_backend_config:
+# NOTE: The pytorch_backend_config section was flattened (its fields moved up one level) in: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
-  # NOTE: overlap_scheduler enabled by default since this commit and changed
+# NOTE: overlap_scheduler is enabled by default since this commit, which also changed the
-  # config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
+# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
-  # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
+# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
-  use_cuda_graph: true
+use_cuda_graph: true
-  cuda_graph_padding_enabled: true
+cuda_graph_padding_enabled: true
-  # NOTE: For larger max batch size, you may want to add larger cuda graph
+# NOTE: For larger max batch size, you may want to add larger cuda graph
-  # batch sizes below to match.
+# batch sizes below to match.
-  cuda_graph_batch_sizes:
+cuda_graph_batch_sizes:
-  - 1
+- 1
-  - 2
+- 2
-  - 4
+- 4
-  - 8
+- 8
-  - 16
+- 16
-  - 32
+- 32
-  - 64
+- 64
-  - 128
+- 128
-  - 256
+- 256
-  print_iter_log: true
+print_iter_log: true
-  kv_cache_dtype: fp8
+kv_cache_dtype: fp8
--- a/examples/tensorrt_llm/configs/deepseek_r1/disagg_llm_api_config.yaml
+++ b/examples/tensorrt_llm/configs/deepseek_r1/disagg_llm_api_config.yaml
@@ -37,13 +37,13 @@ context_servers:
  kv_cache_config:
    free_gpu_memory_fraction: 0.75
-  pytorch_backend_config:
+  # NOTE: The pytorch_backend_config section was flattened (its fields moved up one level) in: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
-    # NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
+  # NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
-    # Overlap scheduler not currently supported in context-only
+  # The overlap scheduler is not currently supported in context-only servers, so it is disabled here.
-    disable_overlap_scheduler: true
+  disable_overlap_scheduler: true
-    print_iter_log: true
+  print_iter_log: true
-    # NOTE: This dtype must match in both context/generation configs
+  # NOTE: This dtype must match in both context/generation configs
-    kv_cache_dtype: fp8
+  kv_cache_dtype: fp8
 generation_servers:
  # Generation/decode processes one token per request at a time, so a larger
@@ -66,23 +66,23 @@ generation_servers:
    # free_gpu_memory_fraction low to have enough available memory.
    # free_gpu_memory_fraction: 0.30
-  pytorch_backend_config:
+  # NOTE: The pytorch_backend_config section was flattened (its fields moved up one level) in: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
-    # NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
+  # NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
-    disable_overlap_scheduler: false
+  disable_overlap_scheduler: false
-    use_cuda_graph: true
+  use_cuda_graph: true
-    cuda_graph_padding_enabled: true
+  cuda_graph_padding_enabled: true
-    # NOTE: For larger max batch size, you may want to add larger cuda graph
+  # NOTE: For larger max batch size, you may want to add larger cuda graph
-    # batch sizes below to match.
+  # batch sizes below to match.
-    cuda_graph_batch_sizes:
+  cuda_graph_batch_sizes:
-    - 1
+  - 1
-    - 2
+  - 2
-    - 4
+  - 4
-    - 8
+  - 8
-    - 16
+  - 16
-    - 32
+  - 32
-    - 64
+  - 64
-    - 128
+  - 128
-    - 256
+  - 256
-    print_iter_log: true
+  print_iter_log: true
-    # NOTE: This dtype must match in both context/generation configs
+  # NOTE: This dtype must match in both context/generation configs
-    kv_cache_dtype: fp8
+  kv_cache_dtype: fp8
--- a/examples/tensorrt_llm/configs/llm_api_config.yaml
+++ b/examples/tensorrt_llm/configs/llm_api_config.yaml
@@ -33,8 +33,8 @@ enable_chunked_prefill: true
 kv_cache_config:
  free_gpu_memory_fraction: 0.95
-pytorch_backend_config:
+# NOTE: The pytorch_backend_config section was flattened (its fields moved up one level) in: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
-  # NOTE: overlap_scheduler enabled by default since this commit and changed
+# NOTE: overlap_scheduler is enabled by default since this commit, which also changed the
-  # config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
+# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
-  # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
+# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
-  use_cuda_graph: true
+use_cuda_graph: true
--- a/examples/tensorrt_llm/configs/llm_api_config_disagg_router.yaml
+++ b/examples/tensorrt_llm/configs/llm_api_config_disagg_router.yaml
@@ -35,9 +35,9 @@ kv_cache_config:
  event_buffer_max_size: 1024
  enable_block_reuse: true
-pytorch_backend_config:
+# NOTE: The pytorch_backend_config section was flattened (its fields moved up one level) in: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
-  # NOTE: overlap_scheduler enabled by default since this commit and changed
+# NOTE: overlap_scheduler is enabled by default since this commit, which also changed the
-  # config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
+# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
-  # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
+# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
-  use_cuda_graph: true
+use_cuda_graph: true
-  enable_iter_perf_stats: true
+enable_iter_perf_stats: true
--- a/examples/tensorrt_llm/configs/llm_api_config_router.yaml
+++ b/examples/tensorrt_llm/configs/llm_api_config_router.yaml
@@ -35,9 +35,9 @@ kv_cache_config:
  event_buffer_max_size: 1024
  enable_block_reuse: true
-pytorch_backend_config:
+# NOTE: The pytorch_backend_config section was flattened (its fields moved up one level) in: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
-  # NOTE: overlap_scheduler enabled by default since this commit and changed
+# NOTE: overlap_scheduler is enabled by default since this commit, which also changed the
-  # config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
+# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
-  # https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
+# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
-  use_cuda_graph: true
+use_cuda_graph: true
-  enable_iter_perf_stats: true
+enable_iter_perf_stats: true
--- a/examples/tensorrt_llm/configs/llmapi_disagg_configs/single_node_config.yaml
+++ b/examples/tensorrt_llm/configs/llmapi_disagg_configs/single_node_config.yaml
@@ -33,11 +33,11 @@ context_servers:
    free_gpu_memory_fraction: 0.40
  cache_transceiver_config:
    max_num_tokens: 10240
-  pytorch_backend_config:
+  # NOTE: The pytorch_backend_config section was flattened (its fields moved up one level) in: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
-    # NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
+  # NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
-    # Overlap scheduler not currently supported in context-only
+  # The overlap scheduler is not currently supported in context-only servers, so it is disabled here.
-    disable_overlap_scheduler: true
+  disable_overlap_scheduler: true
-    use_cuda_graph: false
+  use_cuda_graph: false
  urls:
      - "localhost:8001"
@@ -50,9 +50,9 @@ generation_servers:
    free_gpu_memory_fraction: 0.40
  cache_transceiver_config:
    max_num_tokens: 256
-  pytorch_backend_config:
+  # NOTE: The pytorch_backend_config section was flattened (its fields moved up one level) in: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
-    # NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
+  # NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
-    disable_overlap_scheduler: false
+  disable_overlap_scheduler: false
-    use_cuda_graph: false
+  use_cuda_graph: false
  urls:
      - "localhost:8002"
--- a/examples/tensorrt_llm/configs/llmapi_disagg_router_configs/single_node_config.yaml
+++ b/examples/tensorrt_llm/configs/llmapi_disagg_router_configs/single_node_config.yaml
@@ -35,12 +35,12 @@ context_servers:
    enable_block_reuse: true
  cache_transceiver_config:
    max_num_tokens: 10240
-  pytorch_backend_config:
+  # NOTE: The pytorch_backend_config section was flattened (its fields moved up one level) in: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
-    # NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
+  # NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
-    # Overlap scheduler not currently supported in context-only
+  # The overlap scheduler is not currently supported in context-only servers, so it is disabled here.
-    disable_overlap_scheduler: true
+  disable_overlap_scheduler: true
-    use_cuda_graph: false
+  use_cuda_graph: false
-    enable_iter_perf_stats: true
+  enable_iter_perf_stats: true
  urls:
      - "localhost:8001"
@@ -55,10 +55,10 @@ generation_servers:
    enable_block_reuse: true
  cache_transceiver_config:
    max_num_tokens: 256
-  pytorch_backend_config:
+  # NOTE: The pytorch_backend_config section was flattened (its fields moved up one level) in: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
-    # NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
+  # NOTE: This field is called 'enable_overlap_scheduler' in older TRTLLM versions
-    disable_overlap_scheduler: false
+  disable_overlap_scheduler: false
-    use_cuda_graph: false
+  use_cuda_graph: false
-    enable_iter_perf_stats: true
+  enable_iter_perf_stats: true
  urls:
      - "localhost:8002"