Unverified Commit 7ab84948 authored by Wen-Heng (Jack) Chung, committed by GitHub

[ROCm] Logic to decide whether to use manually unrolled kernel. (#3306)

parent 4885b908
@@ -22,7 +22,7 @@ import torch
 import triton
 import triton.language as tl
-from sglang.srt.utils import get_device_name, is_hip
+from sglang.srt.utils import get_device_core_count, get_device_name, is_hip

 is_hip_ = is_hip()

 fp8_type_ = torch.float8_e4m3fnuz if is_hip_ else torch.float8_e4m3fn
@@ -450,9 +450,16 @@ def w8a8_block_fp8_matmul(
         triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]),
     )

-    # Use manually unrolledx4 kernel on AMD GPU.
+    # Use the manually unrolled (x4) kernel on AMD GPU when the grid size is small.
+    # Empirical testing shows the sweet spot is when the grid is smaller than the
+    # number of compute units available on the device.
+    num_workgroups = triton.cdiv(M, config["BLOCK_SIZE_M"]) * triton.cdiv(
+        N, config["BLOCK_SIZE_N"]
+    )
     kernel = (
-        _w8a8_block_fp8_matmul_unrolledx4 if is_hip_ == True else _w8a8_block_fp8_matmul
+        _w8a8_block_fp8_matmul_unrolledx4
+        if (is_hip_ == True and num_workgroups <= get_device_core_count())
+        else _w8a8_block_fp8_matmul
     )
     kernel[grid](
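For context on the selection logic above: the launch grid assigns one workgroup per (BLOCK_SIZE_M, BLOCK_SIZE_N) output tile, so the unrolled kernel is chosen only when the whole grid can be resident on the device's compute units at once. A minimal standalone sketch of that heuristic follows; the should_use_unrolled_kernel helper, the local cdiv function, and the example shapes are illustrative and not part of this commit.

def cdiv(a: int, b: int) -> int:
    # Same rounding-up division as triton.cdiv.
    return (a + b - 1) // b

def should_use_unrolled_kernel(
    M: int, N: int, block_m: int, block_n: int, core_count: int, on_hip: bool
) -> bool:
    # One workgroup per (BLOCK_SIZE_M, BLOCK_SIZE_N) output tile, mirroring the
    # grid computed in w8a8_block_fp8_matmul.
    num_workgroups = cdiv(M, block_m) * cdiv(N, block_n)
    # The manually unrolled (x4) kernel is only picked on AMD GPUs, and only
    # when the grid is no larger than the number of compute units, so every
    # workgroup can run concurrently.
    return on_hip and num_workgroups <= core_count

# Example: M=256, N=4096 with 128x128 tiles -> 2 * 32 = 64 workgroups.
# core_count=304 stands in for an MI300X-class device; 64 <= 304, so the
# unrolled kernel would be selected there.
print(should_use_unrolled_kernel(256, 4096, 128, 128, core_count=304, on_hip=True))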
@@ -1046,6 +1046,13 @@ def get_device_name(device_id: int = 0) -> str:
         return torch.hpu.get_device_name(device_id)


+def get_device_core_count(device_id: int = 0) -> int:
+    if hasattr(torch, "cuda") and torch.cuda.is_available():
+        return torch.cuda.get_device_properties(device_id).multi_processor_count
+
+    return 0
+
+
 def get_device_capability(device_id: int = 0) -> Tuple[int, int]:
     major, minor = None, None
     if hasattr(torch, "cuda") and torch.cuda.is_available():
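The new helper reads the multi_processor_count reported by PyTorch, which corresponds to the SM count on CUDA devices and the CU count on ROCm devices; when no GPU is visible it returns 0, so the num_workgroups <= get_device_core_count() check can never pass (a grid always has at least one workgroup) and the regular kernel is used. A small usage sketch, assuming sglang is installed and importable; the values in the comment are illustrative.

from sglang.srt.utils import get_device_core_count, is_hip

# On an AMD data-center GPU this prints the CU count (e.g. 304 on MI300X);
# on an NVIDIA GPU it prints the SM count; on a CPU-only machine it prints 0,
# which disables the unrolled-kernel path entirely.
print(f"is_hip={is_hip()}, core_count={get_device_core_count()}")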