Unverified commit f2bd3515, authored by Baizhou Zhang and committed by GitHub
Browse files

Tune memory arguments on B200 (#6718)

parent c459536b
...@@ -149,6 +149,8 @@ def get_batch_sizes_to_capture(model_runner: ModelRunner): ...@@ -149,6 +149,8 @@ def get_batch_sizes_to_capture(model_runner: ModelRunner):
gpu_mem = get_device_memory_capacity() gpu_mem = get_device_memory_capacity()
if gpu_mem is not None and gpu_mem > 96 * 1024: if gpu_mem is not None and gpu_mem > 96 * 1024:
capture_bs += list(range(160, 257, 8)) capture_bs += list(range(160, 257, 8))
if gpu_mem is not None and gpu_mem > 180 * 1000:
capture_bs += list(range(256, 528, 16))
if max(capture_bs) > model_runner.req_to_token_pool.size: if max(capture_bs) > model_runner.req_to_token_pool.size:
# In some case (e.g., with a small GPU or --max-running-requests), the #max-running-requests # In some case (e.g., with a small GPU or --max-running-requests), the #max-running-requests
......
...@@ -260,7 +260,9 @@ class ServerArgs: ...@@ -260,7 +260,9 @@ class ServerArgs:
self.mem_fraction_static = 0.88 self.mem_fraction_static = 0.88
else: else:
self.mem_fraction_static = 0.88 self.mem_fraction_static = 0.88
if gpu_mem is not None and gpu_mem > 96 * 1024: if gpu_mem is not None and gpu_mem > 180 * 1000:
self.mem_fraction_static = 0.79
elif gpu_mem is not None and gpu_mem > 96 * 1024:
mem_fraction = self.mem_fraction_static mem_fraction = self.mem_fraction_static
# 15 GB + additional 3GB for cuda graph # 15 GB + additional 3GB for cuda graph
reserve_mem = 1024 * 18 reserve_mem = 1024 * 18
...@@ -277,7 +279,9 @@ class ServerArgs: ...@@ -277,7 +279,9 @@ class ServerArgs:
# Set chunked prefill size, which depends on the gpu memory capacity # Set chunked prefill size, which depends on the gpu memory capacity
if self.chunked_prefill_size is None: if self.chunked_prefill_size is None:
if gpu_mem is not None and gpu_mem < 25_000: if gpu_mem is not None and gpu_mem > 180_000:
self.chunked_prefill_size = 16384
elif gpu_mem is not None and gpu_mem < 25_000:
self.chunked_prefill_size = 2048 self.chunked_prefill_size = 2048
elif self.disaggregation_mode != "null": elif self.disaggregation_mode != "null":
self.chunked_prefill_size = 16384 self.chunked_prefill_size = 16384
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment