Tune memory arguments on B200 (#6718)

f2bd3515 · Baizhou Zhang · GitHub · c459536b · f2bd3515 · f2bd3515
Unverified Commit f2bd3515 authored May 29, 2025 by Baizhou Zhang Committed by GitHub May 29, 2025
Showing with 8 additions and 2 deletions

python/sglang/srt/model_executor/cuda_graph_runner.py python/sglang/srt/model_executor/cuda_graph_runner.py +2 -0

python/sglang/srt/server_args.py python/sglang/srt/server_args.py +6 -2

No files found.
--- a/python/sglang/srt/model_executor/cuda_graph_runner.py
+++ b/python/sglang/srt/model_executor/cuda_graph_runner.py
@@ -149,6 +149,8 @@ def get_batch_sizes_to_capture(model_runner: ModelRunner):
        gpu_mem = get_device_memory_capacity()
        if gpu_mem is not None and gpu_mem > 96 * 1024:
            capture_bs += list(range(160, 257, 8))
+        if gpu_mem is not None and gpu_mem > 180 * 1000:
+            capture_bs += list(range(256, 528, 16))
    if max(capture_bs) > model_runner.req_to_token_pool.size:
        # In some case (e.g., with a small GPU or --max-running-requests), the #max-running-requests

--- a/python/sglang/srt/server_args.py
+++ b/python/sglang/srt/server_args.py
@@ -260,7 +260,9 @@ class ServerArgs:
                    self.mem_fraction_static = 0.88
            else:
                self.mem_fraction_static = 0.88
-            if gpu_mem is not None and gpu_mem > 96 * 1024:
+            if gpu_mem is not None and gpu_mem > 180 * 1000:
+                self.mem_fraction_static = 0.79
+            elif gpu_mem is not None and gpu_mem > 96 * 1024:
                mem_fraction = self.mem_fraction_static
                # 15 GB + additional 3GB for cuda graph
                reserve_mem = 1024 * 18
@@ -277,7 +279,9 @@ class ServerArgs:
        # Set chunked prefill size, which depends on the gpu memory capacity
        if self.chunked_prefill_size is None:
-            if gpu_mem is not None and gpu_mem < 25_000:
+            if gpu_mem is not None and gpu_mem > 180_000:
+                self.chunked_prefill_size = 16384
+            elif gpu_mem is not None and gpu_mem < 25_000:
                self.chunked_prefill_size = 2048
            elif self.disaggregation_mode != "null":
                self.chunked_prefill_size = 16384