Unverified commit f4e20810, authored by Indrajit Bhosale, committed by GitHub
Browse files

fix: Update free_gpu_memory_fraction for llama4 Maverick to avoid OOM in multinode setup (#7090)

parent 48eb52e7
@@ -15,14 +15,14 @@
tensor_parallel_size: 8
moe_expert_parallel_size: 1
enable_attention_dp: false
-max_num_tokens: 8192
-max_batch_size: 16
+max_num_tokens: 4096
+max_batch_size: 8
trust_remote_code: true
backend: pytorch
enable_chunked_prefill: true
disable_overlap_scheduler: false
kv_cache_config:
-free_gpu_memory_fraction: 0.30
+free_gpu_memory_fraction: 0.20
enable_block_reuse: false
cache_transceiver_config:
......
@@ -15,8 +15,8 @@
tensor_parallel_size: 8
moe_expert_parallel_size: 1
enable_attention_dp: false
-max_num_tokens: 8192
-max_batch_size: 16
+max_num_tokens: 4096
+max_batch_size: 8
trust_remote_code: true
backend: pytorch
enable_chunked_prefill: true
@@ -24,7 +24,7 @@ enable_chunked_prefill: true
disable_overlap_scheduler: true
kv_cache_config:
-free_gpu_memory_fraction: 0.30
+free_gpu_memory_fraction: 0.20
enable_block_reuse: false
cache_transceiver_config:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment