Merge branch 'v0.7.2-dev-wm' into 'v0.7.2-dev'

[fix] 修复 0.7.2 版本 benchmark_moe 因新增 rocm 参数报错的问题。 See merge request dcutoolkit/deeplearing/vllm!68

Merge branch 'v0.7.2-dev-wm' into 'v0.7.2-dev'
[fix] 修复 0.7.2 版本 benchmark_moe 因新增 rocm 参数报错的问题。 See merge request dcutoolkit/deeplearing/vllm!68
49ff8ab5 · zhuwenwen · a3ac00e0 · e753189f · 49ff8ab5
Commit 49ff8ab5 authored Feb 18, 2025 by zhuwenwen
Show whitespace changes
Inline Side-by-side

Showing with 32 additions and 51 deletions

benchmarks/kernels/benchmark_moe.py benchmarks/kernels/benchmark_moe.py +32 -51

No files found.
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -183,7 +183,8 @@ def benchmark_config(


 def get_rocm_tuning_space(use_fp16, nn_moe: Optional[bool] = False):
-    block_mn_range = [16, 32, 64, 128, 256]
+    block_m_range = [16, 32, 64, 128, 256]
+    block_n_range = [32, 64, 128, 256]
    block_k_range = [16, 32, 64, 128, 256]
    if not use_fp16:
        block_k_range.remove(16)  # BLOCK_K=16 not supported for fp8
@@ -195,8 +196,8 @@ def get_rocm_tuning_space(use_fp16, nn_moe: Optional[bool] = False):
    kpack_range = [1, 2] if use_fp16 else []

    param_ranges = {
-        "BLOCK_SIZE_M": block_mn_range,
-        "BLOCK_SIZE_N": block_mn_range,
+        "BLOCK_SIZE_M": block_m_range,
+        "BLOCK_SIZE_N": block_n_range,
        "BLOCK_SIZE_K": block_k_range,
        "GROUP_SIZE_M": group_m_range,
        "num_warps": num_warps_range,
@@ -204,11 +205,12 @@ def get_rocm_tuning_space(use_fp16, nn_moe: Optional[bool] = False):
        "waves_per_eu": waves_per_eu_range,
    }
    if nn_moe:
-        param_ranges["num_ldmatrixes"] = 1
+        param_ranges["num_ldmatrixes"] = [1]
    
-    if use_fp16:
-        param_ranges["matrix_instr_nonkdim"] = matrix_instr_nonkdim_range
-        param_ranges["kpack"] = kpack_range
+    # DCU currently does not support the following parameters
+    # if use_fp16:
+    #     param_ranges["matrix_instr_nonkdim"] = matrix_instr_nonkdim_range
+    #     param_ranges["kpack"] = kpack_range

    return param_ranges

@@ -277,10 +279,11 @@ def prune_rocm_configs(M, N, K, configs, is_fp16=True):
        BLOCK_SIZE_K = config.get("BLOCK_SIZE_K")
        num_warps = config.get("num_warps")

-        if is_fp16:
-            matrix_instr_nonkdim = config.get("matrix_instr_nonkdim")
-            if matrix_instr_nonkdim > mfma:
-                continue
+        # DCU currently does not support matrix_instr_nonkdim param
+        # if is_fp16:
+        #     matrix_instr_nonkdim = config.get("matrix_instr_nonkdim")
+        #     if matrix_instr_nonkdim > mfma:
+        #         continue
        if mfma == 4 and BLOCK_SIZE_K < 64:
            continue
        # some layouts could not work properly in case
@@ -289,16 +292,18 @@ def prune_rocm_configs(M, N, K, configs, is_fp16=True):
            continue
        SPLIT_K = config.get("SPLIT_K", 1)
        GROUP_M = config.get("GROUP_SIZE_M")
-        if is_fp16:
-            if (matrix_instr_nonkdim > BLOCK_SIZE_M
-                    or matrix_instr_nonkdim > BLOCK_SIZE_N):
-                continue
-            if (matrix_instr_nonkdim >= M
-                    and matrix_instr_nonkdim != BLOCK_SIZE_M):
-                continue
-            if (matrix_instr_nonkdim >= N
-                    and matrix_instr_nonkdim != BLOCK_SIZE_N):
-                continue
+
+        # DCU currently does not support matrix_instr_nonkdim param
+        # if is_fp16:
+        #     if (matrix_instr_nonkdim > BLOCK_SIZE_M
+        #             or matrix_instr_nonkdim > BLOCK_SIZE_N):
+        #         continue
+        #     if (matrix_instr_nonkdim >= M
+        #             and matrix_instr_nonkdim != BLOCK_SIZE_M):
+        #         continue
+        #     if (matrix_instr_nonkdim >= N
+        #             and matrix_instr_nonkdim != BLOCK_SIZE_N):
+        #         continue
        # Skip BLOCK_SIZE that is too large compare to M/N
        # unless BLOCK_SIZE is already small enough
        if M * 2 < BLOCK_SIZE_M and BLOCK_SIZE_M != 16:
@@ -452,7 +457,6 @@ class BenchmarkWorker:

 def sort_config(config: BenchmarkConfig) -> BenchmarkConfig:

-    if "num_ldmatrixes" not in config:
    return {
            "BLOCK_SIZE_M": 
            config["BLOCK_SIZE_M"],
@@ -467,31 +471,8 @@ def sort_config(config: BenchmarkConfig) -> BenchmarkConfig:
            "num_stages": 
            config["num_stages"],
            **({
-            "waves_per_eu": config["waves_per_eu"]
-            } if "waves_per_eu" in config else {}),
-            **({
-                "matrix_instr_nonkdim": config["matrix_instr_nonkdim"]
-            } if "matrix_instr_nonkdim" in config else {}),
-            **({
-                "kpack": config["kpack"]
-            } if "kpack" in config else {}),
-        }
-    else:
-        return {
-            "BLOCK_SIZE_M": 
-            config["BLOCK_SIZE_M"],
-            "BLOCK_SIZE_N": 
-            config["BLOCK_SIZE_N"],
-            "BLOCK_SIZE_K": 
-            config["BLOCK_SIZE_K"],
-            "GROUP_SIZE_M": 
-            config["GROUP_SIZE_M"],
-            "num_warps": 
-            config["num_warps"],
-            "num_stages": 
-            config["num_stages"],
-            "num_ldmatrixes": 
-            config["num_ldmatrixes"],
+            "num_ldmatrixes": config["num_ldmatrixes"]
+            } if "num_ldmatrixes" in config else {}),
            **({
            "waves_per_eu": config["waves_per_eu"]
            } if "waves_per_eu" in config else {}),
@@ -643,7 +624,7 @@ if __name__ == "__main__":
    parser.add_argument("--tune", action="store_true")
    parser.add_argument("--nn-moe", action='store_true', default=False)
    parser.add_argument("--trust-remote-code", action="store_true")
-    parser.add_argument("--moe-ep-size", type=int, default=1)
+    parser.add_argument("--moe-ep-size", "-ep", type=int, default=1)
    parser.add_argument("--num-gpus", type=int, default=1)
    args = parser.parse_args()