chore: Update trtllm version to 1.1.0rc3 (#2930)

Signed-off-by: Indrajit Bhosale <iamindrajitb@gmail.com>

chore: Update trtllm version to 1.1.0rc3 (#2930)
Signed-off-by: Indrajit Bhosale <iamindrajitb@gmail.com>
e8cb972e · Indrajit Bhosale · GitHub · 241bd014 · e8cb972e · e8cb972e
Unverified commit e8cb972e, authored Sep 09, 2025 by Indrajit Bhosale; committed by GitHub on Sep 09, 2025.
20 changed files
--- a/README.md
+++ b/README.md
@@ -199,7 +199,7 @@ It is recommended to use [NGC PyTorch Container](https://catalog.ngc.nvidia.com/

 > [!Note]
 > Ensure that you select a PyTorch container image version that matches the version of TensorRT-LLM you are using.
-> For example, if you are using `tensorrt-llm==1.0.0rc6`, use the PyTorch container image version `25.06`.
+> For example, if you are using `tensorrt-llm==1.1.0rc3`, use the PyTorch container image version `25.06`.
 > To find the correct PyTorch container version for your desired `tensorrt-llm` release, visit the [TensorRT-LLM Dockerfile.multi](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docker/Dockerfile.multi) on GitHub. Switch to the branch that matches your `tensorrt-llm` version, and look for the `BASE_TAG` line to identify the recommended PyTorch container tag.

 > [!Important]

--- a/components/backends/trtllm/engine_configs/decode.yaml
+++ b/components/backends/trtllm/engine_configs/decode.yaml
@@ -28,4 +28,4 @@ kv_cache_config:
  free_gpu_memory_fraction: 0.85

 cache_transceiver_config:
-  backend: default
+  backend: DEFAULT
--- a/components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_decode.yaml
+++ b/components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_decode.yaml
@@ -54,4 +54,4 @@ cuda_graph_config:
 print_iter_log: true

 cache_transceiver_config:
-  backend: default
+  backend: DEFAULT
--- a/components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_prefill.yaml
+++ b/components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_prefill.yaml
@@ -38,4 +38,4 @@ speculative_config:
  num_nextn_predict_layers: 1

 cache_transceiver_config:
-  backend: default
+  backend: DEFAULT
--- a/components/backends/trtllm/engine_configs/deepseek_r1/simple/decode.yaml
+++ b/components/backends/trtllm/engine_configs/deepseek_r1/simple/decode.yaml
@@ -57,4 +57,4 @@ cuda_graph_config:
 print_iter_log: true

 cache_transceiver_config:
-  backend: default
+  backend: DEFAULT
--- a/components/backends/trtllm/engine_configs/deepseek_r1/simple/prefill.yaml
+++ b/components/backends/trtllm/engine_configs/deepseek_r1/simple/prefill.yaml
@@ -36,4 +36,4 @@ disable_overlap_scheduler: true
 print_iter_log: true

 cache_transceiver_config:
-  backend: default
+  backend: DEFAULT
--- a/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_decode.yaml
+++ b/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_decode.yaml
@@ -63,4 +63,4 @@ cuda_graph_config:
 print_iter_log: true

 cache_transceiver_config:
-  backend: default
+  backend: DEFAULT
--- a/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_prefill.yaml
+++ b/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_prefill.yaml
@@ -41,4 +41,4 @@ disable_overlap_scheduler: true
 print_iter_log: true

 cache_transceiver_config:
-  backend: default
+  backend: DEFAULT
--- a/components/backends/trtllm/engine_configs/encode.yaml
+++ b/components/backends/trtllm/engine_configs/encode.yaml
@@ -27,4 +27,4 @@ kv_cache_config:
  free_gpu_memory_fraction: 0.85

 cache_transceiver_config:
-  backend: default
+  backend: DEFAULT
--- a/components/backends/trtllm/engine_configs/gemma3/vswa_decode.yaml
+++ b/components/backends/trtllm/engine_configs/gemma3/vswa_decode.yaml
@@ -26,4 +26,4 @@ kv_cache_config:
    - 32768

 cache_transceiver_config:
-  backend: default
+  backend: DEFAULT
--- a/components/backends/trtllm/engine_configs/gemma3/vswa_prefill.yaml
+++ b/components/backends/trtllm/engine_configs/gemma3/vswa_prefill.yaml
@@ -27,4 +27,4 @@ kv_cache_config:
    - 32768

 cache_transceiver_config:
-  backend: default
+  backend: DEFAULT
--- a/components/backends/trtllm/engine_configs/gpt_oss/decode.yaml
+++ b/components/backends/trtllm/engine_configs/gpt_oss/decode.yaml
@@ -19,7 +19,7 @@ moe_config:
 cuda_graph_config:
    enable_padding: true
 cache_transceiver_config:
-  backend: ucx
+  backend: UCX
  max_tokens_in_buffer: 65536
 print_iter_log: false
 stream_interval: 10
--- a/components/backends/trtllm/engine_configs/gpt_oss/prefill.yaml
+++ b/components/backends/trtllm/engine_configs/gpt_oss/prefill.yaml
@@ -21,7 +21,7 @@ cuda_graph_config:
    max_batch_size: 32
    enable_padding: true
 cache_transceiver_config:
-  backend: ucx
+  backend: UCX
  max_tokens_in_buffer: 65536
 print_iter_log: false
 stream_interval: 10
--- a/components/backends/trtllm/engine_configs/llama4/eagle/eagle_decode.yaml
+++ b/components/backends/trtllm/engine_configs/llama4/eagle/eagle_decode.yaml
@@ -49,4 +49,4 @@ cuda_graph_config:
 print_iter_log: true

 cache_transceiver_config:
-  backend: default
+  backend: DEFAULT
--- a/components/backends/trtllm/engine_configs/llama4/eagle/eagle_prefill.yaml
+++ b/components/backends/trtllm/engine_configs/llama4/eagle/eagle_prefill.yaml
@@ -34,4 +34,4 @@ kv_cache_config:
  enable_block_reuse: false

 cache_transceiver_config:
-  backend: default
+  backend: DEFAULT
--- a/components/backends/trtllm/engine_configs/multimodal/agg.yaml
+++ b/components/backends/trtllm/engine_configs/multimodal/agg.yaml
@@ -26,7 +26,7 @@ kv_cache_config:
  enable_block_reuse: false

 cache_transceiver_config:
-  backend: default
+  backend: DEFAULT
 # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
 # NOTE: overlap_scheduler enabled by default since this commit and changed
 # config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':

--- a/components/backends/trtllm/engine_configs/multimodal/decode.yaml
+++ b/components/backends/trtllm/engine_configs/multimodal/decode.yaml
@@ -26,4 +26,4 @@ kv_cache_config:
  enable_block_reuse: false

 cache_transceiver_config:
-  backend: default
\ No newline at end of file
+  backend: DEFAULT
\ No newline at end of file
--- a/components/backends/trtllm/engine_configs/multimodal/llama4/decode.yaml
+++ b/components/backends/trtllm/engine_configs/multimodal/llama4/decode.yaml
@@ -26,4 +26,4 @@ kv_cache_config:
  enable_block_reuse: false

 cache_transceiver_config:
-  backend: default
\ No newline at end of file
+  backend: DEFAULT
\ No newline at end of file
--- a/components/backends/trtllm/engine_configs/multimodal/llama4/prefill.yaml
+++ b/components/backends/trtllm/engine_configs/multimodal/llama4/prefill.yaml
@@ -28,4 +28,4 @@ kv_cache_config:
  enable_block_reuse: false

 cache_transceiver_config:
-  backend: default
\ No newline at end of file
+  backend: DEFAULT
\ No newline at end of file
--- a/components/backends/trtllm/engine_configs/multimodal/prefill.yaml
+++ b/components/backends/trtllm/engine_configs/multimodal/prefill.yaml
@@ -28,4 +28,4 @@ kv_cache_config:
  enable_block_reuse: false

 cache_transceiver_config:
-  backend: default
\ No newline at end of file
+  backend: DEFAULT
\ No newline at end of file