Unverified commit e4b6133b, authored by JieXin Liang, committed by GitHub

[fix] relax mem_fraction_static for h200 (#5893)


Co-authored-by: alcanerian <alcanerian@gmail.com>
parent dd408ee4
```diff
@@ -135,7 +135,7 @@ def get_batch_sizes_to_capture(model_runner: ModelRunner):
     gpu_mem = get_device_memory_capacity()
     # Batch size of each rank will not become so large when DP is on
-    if gpu_mem is not None and gpu_mem > 81920 and server_args.dp_size == 1:
+    if gpu_mem is not None and gpu_mem > 96 * 1024:
         capture_bs += list(range(160, 257, 8))
     if max(capture_bs) > model_runner.req_to_token_pool.size:
```
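For context, a minimal sketch of what this hunk does (not the library's full code; the base `capture_bs` list below is hypothetical). `gpu_mem` is reported in MB, so `96 * 1024` is a 96 GB threshold that excludes 80 GB A100/H100 parts while catching the 141 GB H200; above it, larger batch sizes are added to the CUDA graph capture list.

```python
# Sketch of the capture-list extension above; the base list is assumed.
capture_bs = [1, 2, 4, 8, 16, 32, 64, 96, 128, 160]  # hypothetical defaults
gpu_mem = 141 * 1024  # an H200-class GPU, memory reported in MB

# Mirrors the changed condition: only raw memory capacity is checked now.
if gpu_mem is not None and gpu_mem > 96 * 1024:
    capture_bs += list(range(160, 257, 8))  # adds 160, 168, ..., 256

print(max(capture_bs))  # 256
```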
```diff
@@ -222,7 +222,6 @@ class ServerArgs:
         # Set mem fraction static, which depends on the tensor parallelism size
         if self.mem_fraction_static is None:
-            if gpu_mem <= 81920:
             if self.tp_size >= 16:
                 self.mem_fraction_static = 0.79
             elif self.tp_size >= 8:
@@ -233,9 +232,13 @@
                 self.mem_fraction_static = 0.87
             else:
                 self.mem_fraction_static = 0.88
-            else:
-                # FIXME: more fine grained auto-selection polices
-                self.mem_fraction_static = (gpu_mem - 1024 * 13) / gpu_mem
+            if gpu_mem > 96 * 1024:
+                mem_fraction = self.mem_fraction_static
+                self.mem_fraction_static = min(
+                    mem_fraction + 48 * 1024 * (1 - mem_fraction) / gpu_mem,
+                    (gpu_mem - 1024 * 18)
+                    / gpu_mem,  # 15 GB + additional 3GB for cuda graph
+                )

         # Set chunked prefill size, which depends on the gpu memory capacity
         if self.chunked_prefill_size is None:
```
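To make the new auto-selection concrete, here is a worked example (a sketch, not sglang itself; the 141 GB H200 figure and MB units for `gpu_mem` are assumptions). The first argument to `min()` preserves the absolute headroom that the base fraction would leave on a GPU with 48 GB less memory; the second caps usage so at least 18 GB (15 GB plus 3 GB for CUDA graphs) stays free.

```python
# Worked example of the relaxed mem_fraction_static rule (sketch only).
gpu_mem = 141 * 1024   # assumed H200 capacity in MB
mem_fraction = 0.88    # base auto-selected fraction for tp_size == 1

# Same absolute headroom as a GPU with 48 GB less memory:
# free = (gpu_mem - 48 GB) * (1 - mem_fraction)
relaxed = mem_fraction + 48 * 1024 * (1 - mem_fraction) / gpu_mem  # ~0.921

# Hard cap: always leave 18 GB (15 GB + 3 GB for CUDA graphs) free.
cap = (gpu_mem - 1024 * 18) / gpu_mem  # ~0.872

mem_fraction_static = min(relaxed, cap)
print(round(mem_fraction_static, 3))  # 0.872 -> the 18 GB cap binds here
```

On an 80 GB card neither branch fires, so the tp-size defaults above still apply unchanged.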