fix(ci): Reduce the free gpu memory fraction (#2433)

f6fef485 · Tanmay Verma · GitHub · cebe9219 · f6fef485 · f6fef485
Unverified commit f6fef485, authored Aug 13, 2025 by Tanmay Verma; committed by GitHub on Aug 13, 2025.
3 changed files
--- a/components/backends/trtllm/engine_configs/agg.yaml
+++ b/components/backends/trtllm/engine_configs/agg.yaml
@@ -22,7 +22,7 @@ backend: pytorch
 enable_chunked_prefill: true

 kv_cache_config:
-  free_gpu_memory_fraction: 0.95
+  free_gpu_memory_fraction: 0.85

 # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
 # NOTE: overlap_scheduler enabled by default since this commit and changed

--- a/components/backends/trtllm/engine_configs/decode.yaml
+++ b/components/backends/trtllm/engine_configs/decode.yaml
@@ -25,7 +25,7 @@ cuda_graph_config:
  max_batch_size: 16

 kv_cache_config:
-  free_gpu_memory_fraction: 0.95
+  free_gpu_memory_fraction: 0.85

 cache_transceiver_config:
  backend: default
--- a/components/backends/trtllm/engine_configs/prefill.yaml
+++ b/components/backends/trtllm/engine_configs/prefill.yaml
@@ -24,7 +24,7 @@ disable_overlap_scheduler: true
 cuda_graph_config:
  max_batch_size: 16
 kv_cache_config:
-  free_gpu_memory_fraction: 0.95
+  free_gpu_memory_fraction: 0.85

 cache_transceiver_config:
  backend: default
\ No newline at end of file