Unverified commit 8ab3d742, authored by Matthew Bonanni, committed by GitHub
Browse files

[Bugfix] Fix DeepSeek V3.2 OOM during CG memory profiling (#36691)


Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
parent 84e436ed
...@@ -5550,16 +5550,14 @@ class GPUModelRunner( ...@@ -5550,16 +5550,14 @@ class GPUModelRunner(
kv_cache_spec = self.get_kv_cache_spec() kv_cache_spec = self.get_kv_cache_spec()
kv_cache_groups = get_kv_cache_groups(self.vllm_config, kv_cache_spec) kv_cache_groups = get_kv_cache_groups(self.vllm_config, kv_cache_spec)
min_blocks = self.compilation_config.max_cudagraph_capture_size or 1 min_blocks = self.compilation_config.max_cudagraph_capture_size or 1
if kv_cache_groups:
page_size = kv_cache_groups[0].kv_cache_spec.page_size_bytes
group_size = max(len(g.layer_names) for g in kv_cache_groups)
available_memory = min_blocks * page_size * group_size
else:
available_memory = 1 # Attention-free model
# Temporarily change num_gpu_blocks_override to allocate a minimal KV cache
saved_override = self.cache_config.num_gpu_blocks_override
self.cache_config.num_gpu_blocks_override = min_blocks
minimal_config = get_kv_cache_config_from_groups( minimal_config = get_kv_cache_config_from_groups(
self.vllm_config, kv_cache_groups, available_memory=available_memory self.vllm_config, kv_cache_groups, available_memory=0
) )
self.cache_config.num_gpu_blocks_override = saved_override
self.initialize_kv_cache(minimal_config) self.initialize_kv_cache(minimal_config)
self.cache_config.num_gpu_blocks = minimal_config.num_blocks self.cache_config.num_gpu_blocks = minimal_config.num_blocks
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment