Unverified Commit 5d182420 authored by PanZezhong1725, committed by GitHub

Merge pull request #82 from InfiniTensor/issue/80

Fix the attention prefill timing method and restructure the test directory
parents 469f2884 02676be8
test/models/qwen3_moe/attention_test.py
@@ -60,6 +60,20 @@ def get_args():
return parser.parse_args()
def torch_synchronize(_device):
if _device == "cuda":
torch.cuda.synchronize()
elif _device == "musa":
torch.musa.synchronize()
def torch_empty_cache(_device):
if _device == "cuda":
torch.cuda.empty_cache()
elif _device == "musa":
torch.musa.empty_cache()
def create_Qwen3attention_torch(dir_path, *, device, dtype=torch.bfloat16):
config = AutoConfig.from_pretrained(dir_path)
config.num_hidden_layers = 1
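The torch_synchronize / torch_empty_cache helpers added above dispatch on the backend string, so the benchmarks no longer hard-code torch.cuda.*. They matter for timing because CUDA and MUSA kernels launch asynchronously: time.time() only reflects real execution time if the device queue is drained on both sides of the timed region. A minimal sketch of that pattern, where timed_run is a hypothetical helper rather than part of this patch:

import time

def timed_run(fn, device, runs):
    # drain pending kernels so the clock starts after all prior work
    torch_synchronize(device)
    start = time.time()
    for _ in range(runs):
        fn()
    # wait for the last kernel to finish before reading the clock again
    torch_synchronize(device)
    return (time.time() - start) * 1000 / runs  # average latency in ms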
@@ -128,12 +142,16 @@ def generate_attention_input_torch(
return req_list
def benchmark_Qwen3attention_prefill_torch(model, rotary_emb, req_list, test_cases):
def benchmark_Qwen3attention_prefill_torch(
model, rotary_emb, test_cases, device, dtype=torch.bfloat16
):
"""
Test Qwen3attention.
"""
req_list = generate_attention_input_torch(
model, rotary_emb, test_cases, device, dtype=dtype
)
req_out_list = []
for req in req_list:
# ----------------------------------------- #
@@ -172,9 +190,16 @@ def benchmark_Qwen3attention_prefill_torch(model, rotary_emb, req_list, test_cas
output_host = output_device.to("cpu")
req_out_list.append(output_host)
torch.cuda.synchronize()
torch_synchronize(device)
for _ in range(WARMUPS):
for i, req in enumerate(req_list):
# ----------------------------------------- #
# Restore the KV cache to its original length
# ----------------------------------------- #
origin_len = test_cases["pastlens"][i]
req["past_key_values"].crop(origin_len)
for req in req_list:
# ----------------------------------------- #
# Fetch this req's data
@@ -216,9 +241,13 @@ def benchmark_Qwen3attention_prefill_torch(model, rotary_emb, req_list, test_cas
origin_len = test_cases["pastlens"][i]
req["past_key_values"].crop(origin_len)
torch.cuda.synchronize()
start_time = time.time()
torch_synchronize(device)
# ----------------------------------------- #
# Important: every req is timed from the start time of the whole batch
# ----------------------------------------- #
start_time = time.time()
for i, req in enumerate(req_list):
# ----------------------------------------- #
# Fetch this req's data
# ----------------------------------------- #
@@ -249,26 +278,32 @@ def benchmark_Qwen3attention_prefill_torch(model, rotary_emb, req_list, test_cas
past_key_values=past_key_values,
)
torch.cuda.synchronize()
torch_synchronize(device)
end_time = time.time()
time_consuming += (end_time - start_time) * 1000
# Record, for each req, the time from when all reqs enter inference until this req finishes
time_consuming += end_time - start_time
out_token_count = RUNS * len(req_list)
latency = time_consuming / out_token_count
latency = time_consuming * 1000 / out_token_count
print(
f"\t WARMUPS={WARMUPS} RUNS={RUNS}, Attention Torch, average latency: {round(latency, 2)} ms\n"
f"\t WARMUPS={WARMUPS} RUNS={RUNS}, Attention Torch, average TTFT: {round(latency, 2)} ms\n"
)
return req_out_list
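The reworked prefill timer above is the core of this change: per run, every req's KV cache is first restored to its original length with past_key_values.crop(), the clock is started once before the batch loop, and each req accumulates the time from that shared start to its own completion. Averaged over RUNS and the batch size, the reported number is a per-request TTFT rather than a per-forward latency. A condensed sketch of the structure, with run_prefill as a hypothetical stand-in for the attention forward pass:

time_consuming = 0.0
for _ in range(RUNS):
    # (each req's KV cache is restored to its original length here via crop())
    torch_synchronize(device)
    start_time = time.time()              # one start time for the whole batch
    for req in req_list:
        run_prefill(req)
        torch_synchronize(device)
        # each req is charged from the batch start to its own completion
        time_consuming += time.time() - start_time
# reported as average TTFT in milliseconds
latency = time_consuming * 1000 / (RUNS * len(req_list))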
def benchmark_Qwen3attention_decode_torch(model, rotary_emb, req_list, test_cases):
def benchmark_Qwen3attention_decode_torch(
model, rotary_emb, test_cases, device, dtype=torch.bfloat16
):
"""
Test Qwen3attention_decode.
"""
req_list = generate_attention_input_torch(
model, rotary_emb, test_cases, device, dtype=dtype
)
req_out_list = []
for req in req_list:
# ----------------------------------------- #
@@ -302,7 +337,7 @@ def benchmark_Qwen3attention_decode_torch(model, rotary_emb, req_list, test_case
req_out_list.append(output_host)
torch.cuda.synchronize()
torch_synchronize(device)
for req in req_list:
for _ in range(WARMUPS):
@@ -341,7 +376,7 @@ def benchmark_Qwen3attention_decode_torch(model, rotary_emb, req_list, test_case
origin_len = test_cases["pastlens"][i]
req["past_key_values"].crop(origin_len)
torch.cuda.synchronize()
torch_synchronize(device)
start_time = time.time()
for i, req in enumerate(req_list):
@@ -381,7 +416,7 @@ def benchmark_Qwen3attention_decode_torch(model, rotary_emb, req_list, test_case
# -------------------------------------------------------------- #
req["hidden_states"] = output_device
torch.cuda.synchronize()
torch_synchronize(device)
end_time = time.time()
time_consuming = end_time - start_time
@@ -390,7 +425,7 @@ def benchmark_Qwen3attention_decode_torch(model, rotary_emb, req_list, test_case
throughput = out_token_count / time_consuming
print(
f"\t WARMUPS={WARMUPS} RUNS={RUNS} Attention Torch average throughput: {round(throughput, 2)} /s \n"
f"\t WARMUPS={WARMUPS} RUNS={RUNS}, Attention Torch, average throughput: {round(throughput, 2)} tok/s \n"
)
return req_out_list
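In the decode benchmark, every req emits exactly one token per run, so the figure printed above is a plain tokens-per-second throughput over the timed region. A worked example with illustrative values:

RUNS, batch = 100, 8                            # illustrative, not measured
time_consuming = 2.0                            # seconds for the timed region
out_token_count = RUNS * batch                  # one decode token per req per run
throughput = out_token_count / time_consuming   # 800 / 2.0 = 400.0 tok/s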
@@ -413,11 +448,12 @@ if __name__ == "__main__":
device = "cuda"
elif args.moore:
device = "musa"
import torch_musa
elif args.iluvatar:
device = "cuda"
else:
print(
"Usage: python test/qwen3_atteniton_test.py [--cpu | --nvidia | --metax | --moore | --iluvatar] --model_path=<path/to/model_path>"
"Usage: python test/models/qwen3_moe/attention_test.py [--cpu | --nvidia | --metax | --moore | --iluvatar] --model_path=<path/to/model_path>"
)
sys.exit(1)
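The new import torch_musa on the Moore Threads path is what makes the rest of the patch work there: loading the extension registers the "musa" device type and the torch.musa namespace that torch_synchronize and torch_empty_cache dispatch to. A quick sanity check, assuming torch_musa is installed:

import torch
import torch_musa  # registers the "musa" device and the torch.musa.* helpers

x = torch.ones(2, 2, device="musa")  # allocate on a Moore Threads GPU
torch.musa.synchronize()             # same role as torch.cuda.synchronize()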
@@ -432,26 +468,17 @@ if __name__ == "__main__":
print("Test Qwen3attention ")
print("*" * 130)
print(f"Test Case PREFILL_TESTCASES : {PREFILL_TESTCASES}")
req_list = generate_attention_input_torch(
model, rotary_emb, PREFILL_TESTCASES, device, dtype=dtype
)
output_prefill = benchmark_Qwen3attention_prefill_torch(
model, rotary_emb, req_list, PREFILL_TESTCASES
model, rotary_emb, PREFILL_TESTCASES, device, dtype=dtype
)
print("\n")
print("-" * 130)
print(f"\nTest DECODE_TESTCASES: {DECODE_TESTCASES}")
#
req_list = generate_attention_input_torch(
model, rotary_emb, DECODE_TESTCASES, device, dtype=dtype
)
output_decode = benchmark_Qwen3attention_decode_torch(
model, rotary_emb, req_list, DECODE_TESTCASES
model, rotary_emb, DECODE_TESTCASES, device, dtype=dtype
)
# clean up device memory
del model
torch.cuda.empty_cache()
torch_empty_cache(device)
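Cleanup at the end follows the usual two-step pattern: del drops the last Python reference so the caching allocator can reclaim the tensors, and the empty-cache call then returns the cached blocks to the driver. A small check of the effect on the CUDA path (torch.cuda.memory_allocated is a standard API; a MUSA equivalent is assumed to mirror it):

del model                  # drop the last reference; tensors become collectable
torch_empty_cache(device)  # hand cached blocks back to the device driver
if device == "cuda":
    # should report (near) zero bytes still held by live tensors
    print(torch.cuda.memory_allocated())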
test/models/qwen3_moe/moe_test.py
@@ -60,6 +60,20 @@ def get_args():
return parser.parse_args()
def torch_synchronize(_device):
if _device == "cuda":
torch.cuda.synchronize()
elif _device == "musa":
torch.musa.synchronize()
def torch_empty_cache(_device):
if _device == "cuda":
torch.cuda.empty_cache()
elif _device == "musa":
torch.musa.empty_cache()
def create_moe_torch(dir_path, device, dtype=torch.bfloat16):
config = AutoConfig.from_pretrained(dir_path)
moe = qwen3_moe.modeling_qwen3_moe.Qwen3MoeSparseMoeBlock(config).to(
@@ -85,9 +99,9 @@ def generate_moe_input_torch(testcase, dtype=torch.bfloat16):
return input_tensor
def benchmark_moe_torch(moe, input_host, device, dtype):
def benchmark_moe_torch(moe, testcase, device, dtype):
""""""
input_host = generate_moe_input_torch(testcase, dtype=dtype)
input_device = input_host.to(device=device)
output_device, _ = moe(input_device)
@@ -95,15 +109,19 @@ def benchmark_moe_torch(moe, input_host, device, dtype):
for _ in range(WARMUPS):
moe(input_device)
torch.cuda.synchronize()
torch_synchronize(device)
start_time = time.time()
for _ in range(RUNS):
moe(input_device)
torch.cuda.synchronize()
torch_synchronize(device)
end_time = time.time()
print(f" MoE Torch average latency: {(end_time - start_time) * 1000 / RUNS} ms")
total_time = end_time - start_time
total_tokens = sum(testcase["seqlens"]) * RUNS
print(
f"\t WARMUPS={WARMUPS} RUNS={RUNS}, MoE Torch average latency: {round(total_time * 1000 / RUNS, 2)} ms throughput: {round(total_tokens / total_time, 2)} tok/s"
)
return output_host
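The new MoE report derives both numbers from one timed region: average latency divides the wall-clock time by RUNS, and throughput divides the total routed tokens (the sum of the case's seqlens, once per run) by the same time. A worked example with illustrative values:

seqlens, RUNS, total_time = [128, 256], 10, 0.5   # illustrative, not measured
latency_ms = total_time * 1000 / RUNS             # 0.5 * 1000 / 10 = 50.0 ms
throughput = sum(seqlens) * RUNS / total_time     # 384 * 10 / 0.5 = 7680.0 tok/s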
@@ -123,11 +141,12 @@ if __name__ == "__main__":
device = "cuda"
elif args.moore:
device = "musa"
import torch_musa
elif args.iluvatar:
device = "cuda"
else:
print(
"Usage: python test/qwen3_moe_test.py [--cpu | --nvidia | --metax | --moore | --iluvatar] --model_path=<path/to/model_path>"
"Usage: python test/models/qwen3_moe/moe_test.py [--cpu | --nvidia | --metax | --moore | --iluvatar] --model_path=<path/to/model_path>"
)
sys.exit(1)
@@ -141,16 +160,17 @@ if __name__ == "__main__":
print("Test Qwen3 MoE")
print("*" * 130)
print(f"Test Case PREFILL_TESTCASES : {PREFILL_TESTCASES}")
input_prefill = generate_moe_input_torch(PREFILL_TESTCASES)
output_prefill = benchmark_moe_torch(moe, input_prefill, device=device, dtype=dtype)
output_prefill = benchmark_moe_torch(
moe, PREFILL_TESTCASES, device=device, dtype=dtype
)
print("\n")
print("-" * 130)
print(f"\nTest DECODE_TESTCASES: {DECODE_TESTCASES}")
input_decode = generate_moe_input_torch(DECODE_TESTCASES)
output_decode = benchmark_moe_torch(moe, input_decode, device=device, dtype=dtype)
output_decode = benchmark_moe_torch(
moe, DECODE_TESTCASES, device=device, dtype=dtype
)
# clean up device memory
del moe
torch.cuda.empty_cache()
torch_empty_cache(device)