Unverified commit e4b6133b, authored by JieXin Liang, committed by GitHub

[fix] relax mem_fraction_static for h200 (#5893)


Co-authored-by: alcanerian <alcanerian@gmail.com>
parent dd408ee4
```diff
@@ -135,7 +135,7 @@ def get_batch_sizes_to_capture(model_runner: ModelRunner):
     gpu_mem = get_device_memory_capacity()
     # Batch size of each rank will not become so large when DP is on
-    if gpu_mem is not None and gpu_mem > 81920 and server_args.dp_size == 1:
+    if gpu_mem is not None and gpu_mem > 96 * 1024:
         capture_bs += list(range(160, 257, 8))
     if max(capture_bs) > model_runner.req_to_token_pool.size:
```
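For context, a minimal sketch of what this hunk does (not the library's full code; the base `capture_bs` list below is hypothetical). `gpu_mem` is reported in MB, so `96 * 1024` is a 96 GB threshold that excludes 80 GB A100/H100 parts while catching the 141 GB H200; above it, larger batch sizes are added to the CUDA graph capture list.

```python
# Sketch of the capture-list extension above; the base list is assumed.
capture_bs = [1, 2, 4, 8, 16, 32, 64, 96, 128, 160]  # hypothetical defaults
gpu_mem = 141 * 1024  # an H200-class GPU, memory reported in MB

# Mirrors the changed condition: only raw memory capacity is checked now.
if gpu_mem is not None and gpu_mem > 96 * 1024:
    capture_bs += list(range(160, 257, 8))  # adds 160, 168, ..., 256

print(max(capture_bs))  # 256
```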
```diff
@@ -222,7 +222,6 @@ class ServerArgs:
         # Set mem fraction static, which depends on the tensor parallelism size
         if self.mem_fraction_static is None:
-            if gpu_mem <= 81920:
             if self.tp_size >= 16:
                 self.mem_fraction_static = 0.79
             elif self.tp_size >= 8:
@@ -233,9 +232,13 @@
                 self.mem_fraction_static = 0.87
             else:
                 self.mem_fraction_static = 0.88
-            else:
-                # FIXME: more fine grained auto-selection polices
-                self.mem_fraction_static = (gpu_mem - 1024 * 13) / gpu_mem
+            if gpu_mem > 96 * 1024:
+                mem_fraction = self.mem_fraction_static
+                self.mem_fraction_static = min(
+                    mem_fraction + 48 * 1024 * (1 - mem_fraction) / gpu_mem,
+                    (gpu_mem - 1024 * 18)
+                    / gpu_mem,  # 15 GB + additional 3GB for cuda graph
+                )

         # Set chunked prefill size, which depends on the gpu memory capacity
         if self.chunked_prefill_size is None:
```
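To make the new auto-selection concrete, here is a worked example (a sketch, not sglang itself; the 141 GB H200 figure and MB units for `gpu_mem` are assumptions). The first argument to `min()` preserves the absolute headroom that the base fraction would leave on a GPU with 48 GB less memory; the second caps usage so at least 18 GB (15 GB plus 3 GB for CUDA graphs) stays free.

```python
# Worked example of the relaxed mem_fraction_static rule (sketch only).
gpu_mem = 141 * 1024   # assumed H200 capacity in MB
mem_fraction = 0.88    # base auto-selected fraction for tp_size == 1

# Same absolute headroom as a GPU with 48 GB less memory:
# free = (gpu_mem - 48 GB) * (1 - mem_fraction)
relaxed = mem_fraction + 48 * 1024 * (1 - mem_fraction) / gpu_mem  # ~0.921

# Hard cap: always leave 18 GB (15 GB + 3 GB for CUDA graphs) free.
cap = (gpu_mem - 1024 * 18) / gpu_mem  # ~0.872

mem_fraction_static = min(relaxed, cap)
print(round(mem_fraction_static, 3))  # 0.872 -> the 18 GB cap binds here
```

On an 80 GB card neither branch fires, so the tp-size defaults above still apply unchanged.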