Unverified commit f6fef485, authored by Tanmay Verma and committed by GitHub
Browse files

fix(ci): Reduce the free gpu memory fraction (#2433)

parent cebe9219
......@@ -22,7 +22,7 @@ backend: pytorch
enable_chunked_prefill: true
kv_cache_config:
-free_gpu_memory_fraction: 0.95
+free_gpu_memory_fraction: 0.85
# NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
# NOTE: overlap_scheduler enabled by default since this commit and changed
......
......@@ -25,7 +25,7 @@ cuda_graph_config:
max_batch_size: 16
kv_cache_config:
-free_gpu_memory_fraction: 0.95
+free_gpu_memory_fraction: 0.85
cache_transceiver_config:
backend: default
......@@ -24,7 +24,7 @@ disable_overlap_scheduler: true
cuda_graph_config:
max_batch_size: 16
kv_cache_config:
-free_gpu_memory_fraction: 0.95
+free_gpu_memory_fraction: 0.85
cache_transceiver_config:
backend: default
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment