issue/80 修复attention prefill计时方式，重构目录

d7d0889d · PanZezhong · 469f2884 · d7d0889d · d7d0889d
Commit d7d0889d authored Nov 22, 2025 by PanZezhong
Show whitespace changes
Inline Side-by-side

Showing with 31 additions and 14 deletions

test/models/qwen3_moe/atteniton_test.py test/models/qwen3_moe/atteniton_test.py +18 -6

test/models/qwen3_moe/moe_test.py test/models/qwen3_moe/moe_test.py +13 -8

No files found.
--- a/test/qwen3_atteniton_test.py
+++ b/test/qwen3_atteniton_test.py
@@ -175,6 +175,13 @@ def benchmark_Qwen3attention_prefill_torch(model, rotary_emb, req_list, test_cas
    torch.cuda.synchronize()

    for _ in range(WARMUPS):
+        for i, req in enumerate(req_list):
+            # ----------------------------------------- #
+            #          恢复 kv cache 的长度
+            # ----------------------------------------- #
+            origin_len = test_cases["pastlens"][i]
+            req["past_key_values"].crop(origin_len)
+
        for req in req_list:
            # ----------------------------------------- #
            #         获得每个req的数据
@@ -217,8 +224,12 @@ def benchmark_Qwen3attention_prefill_torch(model, rotary_emb, req_list, test_cas
            req["past_key_values"].crop(origin_len)

        torch.cuda.synchronize()
+        # ----------------------------------------- #
+        #       重要：每个req都按整个batch的起始时间计算
+        # ----------------------------------------- #
        start_time = time.time()

+        for i, req in enumerate(req_list):
            # ----------------------------------------- #
            #         获得每个req的数据
            # ----------------------------------------- #
@@ -252,14 +263,15 @@ def benchmark_Qwen3attention_prefill_torch(model, rotary_emb, req_list, test_cas
            torch.cuda.synchronize()
            end_time = time.time()

-        time_consuming += (end_time - start_time) * 1000
+            # 记录每个req从整个batch开始推理到该req自身结束的时间
+            time_consuming += end_time - start_time

    out_token_count = RUNS * len(req_list)

-    latency = time_consuming / out_token_count
+    latency = time_consuming * 1000 / out_token_count

    print(
-        f"\t WARMUPS={WARMUPS} RUNS={RUNS}, Attention Torch, average latency: {round(latency, 2)} ms\n"
+        f"\t WARMUPS={WARMUPS} RUNS={RUNS}, Attention Torch, average TTFT: {round(latency, 2)} ms\n"
    )

    return req_out_list
@@ -390,7 +402,7 @@ def benchmark_Qwen3attention_decode_torch(model, rotary_emb, req_list, test_case
    throughput = out_token_count / time_consuming

    print(
-        f"\t WARMUPS={WARMUPS} RUNS={RUNS}  Attention Torch average throughput: {round(throughput, 2)} /s \n"
+        f"\t WARMUPS={WARMUPS} RUNS={RUNS}, Attention Torch, average throughput: {round(throughput, 2)} tok/s \n"
    )

    return req_out_list

--- a/test/qwen3_moe_test.py
+++ b/test/qwen3_moe_test.py
@@ -85,9 +85,9 @@ def generate_moe_input_torch(testcase, dtype=torch.bfloat16):
    return input_tensor


-def benchmark_moe_torch(moe, input_host, device, dtype):
+def benchmark_moe_torch(moe, testcase, device, dtype):
    """"""
-
+    input_host = generate_moe_input_torch(testcase, dtype=dtype)
    input_device = input_host.to(device=device)

    output_device, _ = moe(input_device)
@@ -103,7 +103,11 @@ def benchmark_moe_torch(moe, input_host, device, dtype):
    torch.cuda.synchronize()
    end_time = time.time()

-    print(f"    MoE Torch average latency: {(end_time - start_time) * 1000 / RUNS} ms")
+    total_time = end_time - start_time
+    total_tokens = sum(testcase["seqlens"]) * RUNS
+    print(
+        f"\t WARMUPS={WARMUPS} RUNS={RUNS}, MoE Torch average latency: {round(total_time * 1000 / RUNS, 2)} ms   throughput: {round(total_tokens / total_time, 2)} tok/s"
+    )
    return output_host


@@ -141,15 +145,16 @@ if __name__ == "__main__":
    print("Test Qwen3 MoE")
    print("*" * 130)
    print(f"Test Case PREFILL_TESTCASES : {PREFILL_TESTCASES}")
-
-    input_prefill = generate_moe_input_torch(PREFILL_TESTCASES)
-    output_prefill = benchmark_moe_torch(moe, input_prefill, device=device, dtype=dtype)
+    output_prefill = benchmark_moe_torch(
+        moe, PREFILL_TESTCASES, device=device, dtype=dtype
+    )

    print("\n")
    print("-" * 130)
    print(f"\nTest DECODE_TESTCASES: {DECODE_TESTCASES}")
-    input_decode = generate_moe_input_torch(DECODE_TESTCASES)
-    output_decode = benchmark_moe_torch(moe, input_decode, device=device, dtype=dtype)
+    output_decode = benchmark_moe_torch(
+        moe, DECODE_TESTCASES, device=device, dtype=dtype
+    )

    # clean up device memory
    del moe