Bug: fix capture_bs (#3857)

4606e2a3 · who who who · GitHub · 127998cc · 4606e2a3
Unverified Commit 4606e2a3 authored Feb 26, 2025 by who who who Committed by GitHub Feb 25, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 4 additions and 2 deletions

python/sglang/srt/model_executor/cuda_graph_runner.py python/sglang/srt/model_executor/cuda_graph_runner.py +4 -2

No files found.
--- a/python/sglang/srt/model_executor/cuda_graph_runner.py
+++ b/python/sglang/srt/model_executor/cuda_graph_runner.py
@@ -114,6 +114,10 @@ def get_batch_sizes_to_capture(model_runner: ModelRunner):
            capture_bs = list(range(1, 33)) + [64, 128]
        else:
            capture_bs = [1, 2, 4] + [i * 8 for i in range(1, 21)]
+
+    if is_hip_:
+        capture_bs += [i * 8 for i in range(21, 33)]
+
    if max(capture_bs) > model_runner.req_to_token_pool.size:
        # In some cases (e.g., with a small GPU or --max-running-requests), the #max-running-requests
        # is very small. We add more values here to make sure we capture the maximum bs.
@@ -132,8 +136,6 @@ def get_batch_sizes_to_capture(model_runner: ModelRunner):
        if bs <= model_runner.req_to_token_pool.size
        and bs <= server_args.cuda_graph_max_bs
    ]
-    if is_hip_:
-        capture_bs += [i * 8 for i in range(21, 33)]
    compile_bs = (
        [bs for bs in capture_bs if bs <= server_args.torch_compile_max_bs]
        if server_args.enable_torch_compile