Unverified Commit 5d182420 authored by PanZezhong1725, committed by GitHub

Merge pull request #82 from InfiniTensor/issue/80

Fix the attention prefill timing method and restructure the test directory
parents 469f2884 02676be8
test/models/qwen3_moe/attention_test.py
@@ -60,6 +60,20 @@ def get_args():
return parser.parse_args()
def torch_synchronize(_device):
if _device == "cuda":
torch.cuda.synchronize()
elif _device == "musa":
torch.musa.synchronize()
def torch_empty_cache(_device):
if _device == "cuda":
torch.cuda.empty_cache()
elif _device == "musa":
torch.musa.empty_cache()
def create_Qwen3attention_torch(dir_path, *, device, dtype=torch.bfloat16):
config = AutoConfig.from_pretrained(dir_path)
config.num_hidden_layers = 1
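The torch_synchronize / torch_empty_cache helpers added above dispatch on the backend string, so the benchmarks no longer hard-code torch.cuda.*. They matter for timing because CUDA and MUSA kernels launch asynchronously: time.time() only reflects real execution time if the device queue is drained on both sides of the timed region. A minimal sketch of that pattern, where timed_run is a hypothetical helper rather than part of this patch:

import time

def timed_run(fn, device, runs):
    # drain pending kernels so the clock starts after all prior work
    torch_synchronize(device)
    start = time.time()
    for _ in range(runs):
        fn()
    # wait for the last kernel to finish before reading the clock again
    torch_synchronize(device)
    return (time.time() - start) * 1000 / runs  # average latency in ms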
@@ -128,12 +142,16 @@ def generate_attention_input_torch(
return req_list
def benchmark_Qwen3attention_prefill_torch(model, rotary_emb, req_list, test_cases):
def benchmark_Qwen3attention_prefill_torch(
model, rotary_emb, test_cases, device, dtype=torch.bfloat16
):
"""
Test Qwen3attention.
"""
req_list = generate_attention_input_torch(
model, rotary_emb, test_cases, device, dtype=dtype
)
req_out_list = []
for req in req_list:
# ----------------------------------------- #
@@ -172,9 +190,16 @@ def benchmark_Qwen3attention_prefill_torch(model, rotary_emb, req_list, test_cas
output_host = output_device.to("cpu")
req_out_list.append(output_host)
torch.cuda.synchronize()
torch_synchronize(device)
for _ in range(WARMUPS):
for i, req in enumerate(req_list):
# ----------------------------------------- #
# Restore the KV cache to its original length
# ----------------------------------------- #
origin_len = test_cases["pastlens"][i]
req["past_key_values"].crop(origin_len)
for req in req_list:
# ----------------------------------------- #
# Fetch this req's data
@@ -216,9 +241,13 @@ def benchmark_Qwen3attention_prefill_torch(model, rotary_emb, req_list, test_cas
origin_len = test_cases["pastlens"][i]
req["past_key_values"].crop(origin_len)
torch.cuda.synchronize()
start_time = time.time()
torch_synchronize(device)
# ----------------------------------------- #
# Important: every req is timed from the start time of the whole batch
# ----------------------------------------- #
start_time = time.time()
for i, req in enumerate(req_list):
# ----------------------------------------- #
# Fetch this req's data
# ----------------------------------------- #
@@ -249,26 +278,32 @@ def benchmark_Qwen3attention_prefill_torch(model, rotary_emb, req_list, test_cas
past_key_values=past_key_values,
)
torch.cuda.synchronize()
torch_synchronize(device)
end_time = time.time()
time_consuming += (end_time - start_time) * 1000
# Record, for each req, the time from when all reqs enter inference until this req finishes
time_consuming += end_time - start_time
out_token_count = RUNS * len(req_list)
latency = time_consuming / out_token_count
latency = time_consuming * 1000 / out_token_count
print(
f"\t WARMUPS={WARMUPS} RUNS={RUNS}, Attention Torch, average latency: {round(latency, 2)} ms\n"
f"\t WARMUPS={WARMUPS} RUNS={RUNS}, Attention Torch, average TTFT: {round(latency, 2)} ms\n"
)
return req_out_list
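The reworked prefill timer above is the core of this change: per run, every req's KV cache is first restored to its original length with past_key_values.crop(), the clock is started once before the batch loop, and each req accumulates the time from that shared start to its own completion. Averaged over RUNS and the batch size, the reported number is a per-request TTFT rather than a per-forward latency. A condensed sketch of the structure, with run_prefill as a hypothetical stand-in for the attention forward pass:

time_consuming = 0.0
for _ in range(RUNS):
    # (each req's KV cache is restored to its original length here via crop())
    torch_synchronize(device)
    start_time = time.time()              # one start time for the whole batch
    for req in req_list:
        run_prefill(req)
        torch_synchronize(device)
        # each req is charged from the batch start to its own completion
        time_consuming += time.time() - start_time
# reported as average TTFT in milliseconds
latency = time_consuming * 1000 / (RUNS * len(req_list))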
def benchmark_Qwen3attention_decode_torch(model, rotary_emb, req_list, test_cases):
def benchmark_Qwen3attention_decode_torch(
model, rotary_emb, test_cases, device, dtype=torch.bfloat16
):
"""
Test Qwen3attention_decode.
"""
req_list = generate_attention_input_torch(
model, rotary_emb, test_cases, device, dtype=dtype
)
req_out_list = []
for req in req_list:
# ----------------------------------------- #
@@ -302,7 +337,7 @@ def benchmark_Qwen3attention_decode_torch(model, rotary_emb, req_list, test_case
req_out_list.append(output_host)
torch.cuda.synchronize()
torch_synchronize(device)
for req in req_list:
for _ in range(WARMUPS):
@@ -341,7 +376,7 @@ def benchmark_Qwen3attention_decode_torch(model, rotary_emb, req_list, test_case
origin_len = test_cases["pastlens"][i]
req["past_key_values"].crop(origin_len)
torch.cuda.synchronize()
torch_synchronize(device)
start_time = time.time()
for i, req in enumerate(req_list):
@@ -381,7 +416,7 @@ def benchmark_Qwen3attention_decode_torch(model, rotary_emb, req_list, test_case
# -------------------------------------------------------------- #
req["hidden_states"] = output_device
torch.cuda.synchronize()
torch_synchronize(device)
end_time = time.time()
time_consuming = end_time - start_time
@@ -390,7 +425,7 @@ def benchmark_Qwen3attention_decode_torch(model, rotary_emb, req_list, test_case
throughput = out_token_count / time_consuming
print(
f"\t WARMUPS={WARMUPS} RUNS={RUNS} Attention Torch average throughput: {round(throughput, 2)} /s \n"
f"\t WARMUPS={WARMUPS} RUNS={RUNS}, Attention Torch, average throughput: {round(throughput, 2)} tok/s \n"
)
return req_out_list
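In the decode benchmark, every req emits exactly one token per run, so the figure printed above is a plain tokens-per-second throughput over the timed region. A worked example with illustrative values:

RUNS, batch = 100, 8                            # illustrative, not measured
time_consuming = 2.0                            # seconds for the timed region
out_token_count = RUNS * batch                  # one decode token per req per run
throughput = out_token_count / time_consuming   # 800 / 2.0 = 400.0 tok/s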
@@ -413,11 +448,12 @@ if __name__ == "__main__":
device = "cuda"
elif args.moore:
device = "musa"
import torch_musa
elif args.iluvatar:
device = "cuda"
else:
print(
"Usage: python test/qwen3_atteniton_test.py [--cpu | --nvidia | --metax | --moore | --iluvatar] --model_path=<path/to/model_path>"
"Usage: python test/models/qwen3_moe/attention_test.py [--cpu | --nvidia | --metax | --moore | --iluvatar] --model_path=<path/to/model_path>"
)
sys.exit(1)
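The new import torch_musa on the Moore Threads path is what makes the rest of the patch work there: loading the extension registers the "musa" device type and the torch.musa namespace that torch_synchronize and torch_empty_cache dispatch to. A quick sanity check, assuming torch_musa is installed:

import torch
import torch_musa  # registers the "musa" device and the torch.musa.* helpers

x = torch.ones(2, 2, device="musa")  # allocate on a Moore Threads GPU
torch.musa.synchronize()             # same role as torch.cuda.synchronize()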
@@ -432,26 +468,17 @@ if __name__ == "__main__":
print("Test Qwen3attention ")
print("*" * 130)
print(f"Test Case PREFILL_TESTCASES : {PREFILL_TESTCASES}")
req_list = generate_attention_input_torch(
model, rotary_emb, PREFILL_TESTCASES, device, dtype=dtype
)
output_prefill = benchmark_Qwen3attention_prefill_torch(
model, rotary_emb, req_list, PREFILL_TESTCASES
model, rotary_emb, PREFILL_TESTCASES, device, dtype=dtype
)
print("\n")
print("-" * 130)
print(f"\nTest DECODE_TESTCASES: {DECODE_TESTCASES}")
#
req_list = generate_attention_input_torch(
model, rotary_emb, DECODE_TESTCASES, device, dtype=dtype
)
output_decode = benchmark_Qwen3attention_decode_torch(
model, rotary_emb, req_list, DECODE_TESTCASES
model, rotary_emb, DECODE_TESTCASES, device, dtype=dtype
)
# clean up device memory
del model
torch.cuda.empty_cache()
torch_empty_cache(device)
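Cleanup at the end follows the usual two-step pattern: del drops the last Python reference so the caching allocator can reclaim the tensors, and the empty-cache call then returns the cached blocks to the driver. A small check of the effect on the CUDA path (torch.cuda.memory_allocated is a standard API; a MUSA equivalent is assumed to mirror it):

del model                  # drop the last reference; tensors become collectable
torch_empty_cache(device)  # hand cached blocks back to the device driver
if device == "cuda":
    # should report (near) zero bytes still held by live tensors
    print(torch.cuda.memory_allocated())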
test/models/qwen3_moe/moe_test.py
@@ -60,6 +60,20 @@ def get_args():
return parser.parse_args()
def torch_synchronize(_device):
if _device == "cuda":
torch.cuda.synchronize()
elif _device == "musa":
torch.musa.synchronize()
def torch_empty_cache(_device):
if _device == "cuda":
torch.cuda.empty_cache()
elif _device == "musa":
torch.musa.empty_cache()
def create_moe_torch(dir_path, device, dtype=torch.bfloat16):
config = AutoConfig.from_pretrained(dir_path)
moe = qwen3_moe.modeling_qwen3_moe.Qwen3MoeSparseMoeBlock(config).to(
@@ -85,9 +99,9 @@ def generate_moe_input_torch(testcase, dtype=torch.bfloat16):
return input_tensor
def benchmark_moe_torch(moe, input_host, device, dtype):
def benchmark_moe_torch(moe, testcase, device, dtype):
""""""
input_host = generate_moe_input_torch(testcase, dtype=dtype)
input_device = input_host.to(device=device)
output_device, _ = moe(input_device)
@@ -95,15 +109,19 @@ def benchmark_moe_torch(moe, input_host, device, dtype):
for _ in range(WARMUPS):
moe(input_device)
torch.cuda.synchronize()
torch_synchronize(device)
start_time = time.time()
for _ in range(RUNS):
moe(input_device)
torch.cuda.synchronize()
torch_synchronize(device)
end_time = time.time()
print(f" MoE Torch average latency: {(end_time - start_time) * 1000 / RUNS} ms")
total_time = end_time - start_time
total_tokens = sum(testcase["seqlens"]) * RUNS
print(
f"\t WARMUPS={WARMUPS} RUNS={RUNS}, MoE Torch average latency: {round(total_time * 1000 / RUNS, 2)} ms throughput: {round(total_tokens / total_time, 2)} tok/s"
)
return output_host
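The new MoE report derives both numbers from one timed region: average latency divides the wall-clock time by RUNS, and throughput divides the total routed tokens (the sum of the case's seqlens, once per run) by the same time. A worked example with illustrative values:

seqlens, RUNS, total_time = [128, 256], 10, 0.5   # illustrative, not measured
latency_ms = total_time * 1000 / RUNS             # 0.5 * 1000 / 10 = 50.0 ms
throughput = sum(seqlens) * RUNS / total_time     # 384 * 10 / 0.5 = 7680.0 tok/s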
@@ -123,11 +141,12 @@ if __name__ == "__main__":
device = "cuda"
elif args.moore:
device = "musa"
import torch_musa
elif args.iluvatar:
device = "cuda"
else:
print(
"Usage: python test/qwen3_moe_test.py [--cpu | --nvidia | --metax | --moore | --iluvatar] --model_path=<path/to/model_path>"
"Usage: python test/models/qwen3_moe/moe_test.py [--cpu | --nvidia | --metax | --moore | --iluvatar] --model_path=<path/to/model_path>"
)
sys.exit(1)
@@ -141,16 +160,17 @@ if __name__ == "__main__":
print("Test Qwen3 MoE")
print("*" * 130)
print(f"Test Case PREFILL_TESTCASES : {PREFILL_TESTCASES}")
input_prefill = generate_moe_input_torch(PREFILL_TESTCASES)
output_prefill = benchmark_moe_torch(moe, input_prefill, device=device, dtype=dtype)
output_prefill = benchmark_moe_torch(
moe, PREFILL_TESTCASES, device=device, dtype=dtype
)
print("\n")
print("-" * 130)
print(f"\nTest DECODE_TESTCASES: {DECODE_TESTCASES}")
input_decode = generate_moe_input_torch(DECODE_TESTCASES)
output_decode = benchmark_moe_torch(moe, input_decode, device=device, dtype=dtype)
output_decode = benchmark_moe_torch(
moe, DECODE_TESTCASES, device=device, dtype=dtype
)
# clean up device memory
del moe
torch.cuda.empty_cache()
torch_empty_cache(device)