# SPDX-License-Identifier: MIT

import torch
import aiter
import pandas as pd
from aiter import dtypes

# from ater.test_common import checkAllclose, perftest
from torch.profiler import profile, ProfilerActivity

# input shape: torch.Size([4096, 64, 160]) (20480, 1, 128)
# other shape: torch.Size([4096, 64, 160]) (10240, 160, 1)

# input shape: torch.Size([4096, 64, 160]) (47360, 1, 296)
# other shape: torch.Size([4096, 64, 160]) (10240, 160, 1)

# Shape and strides of the benchmark input (a contiguous 2-D layout).
shape0 = (4096, 880)
stride0 = (880, 1)
# shape0 = (16,16)
# stride0 = (16, 1)

# Allocate the input on the GPU with the explicit strides above, then fill it
# with uniform random values generated on the host.
tensor0 = torch.empty_strided(shape0, stride0, dtype=dtypes.fp16, device="cuda")
random_data0 = torch.rand(shape0)
tensor0.copy_(random_data0)
# tensor0.fill_(1)
print("shape0", shape0)
print("stride0:", stride0)

# Footer lines of the profiler table mapped to the keys used in the report.
_TOTAL_LINE_PREFIXES = {
    "Self CPU time total:": "CPU total time",
    "Self CUDA time total:": "CUDA total time",
}


def _parse_profiler_totals(table):
    """Extract the "Self ... time total" footer values from a profiler table.

    Args:
        table: The text returned by ``prof.key_averages().table(...)``.

    Returns:
        A dict that may contain the keys ``"CPU total time"`` and
        ``"CUDA total time"``; each value is the text that follows the first
        colon on the matching line, stripped of surrounding whitespace. A key
        is absent when the table has no matching line.
    """
    totals = {}
    for line in table.splitlines():
        for prefix, key in _TOTAL_LINE_PREFIXES.items():
            if line.startswith(prefix):
                totals[key] = line.split(":", 1)[1].strip()
                break
    return totals


def get_profiler_totals(prof):
    """Return the CPU/CUDA total-time strings reported by a finished profiler.

    Args:
        prof: A ``torch.profiler.profile`` object whose context has exited.

    Returns:
        See ``_parse_profiler_totals``.
    """
    table = prof.key_averages().table(sort_by="cuda_time_total", row_limit=10)
    return _parse_profiler_totals(table)


def _profile_op(op, tensor, iterations=100):
    """Profile ``iterations`` calls of ``op(tensor)`` and print the summary table.

    Args:
        op: Callable taking a single tensor (e.g. ``torch.sigmoid``).
        tensor: Input passed to every call.
        iterations: Number of profiled calls.

    Returns:
        A ``(last_output, totals)`` tuple, where ``totals`` is the dict
        produced by ``_parse_profiler_totals``.
    """
    # Warm-up: run the op once outside the profiled region so that any
    # one-time cost of the first call is not attributed to the measurement.
    op(tensor)
    torch.cuda.synchronize()

    last_output = None
    with profile(
        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
        profile_memory=True,
        with_stack=True,
        with_modules=True,
        record_shapes=True,
    ) as prof:
        for _ in range(iterations):
            # cache_flush1 = torch.randn(10000, 10000, requires_grad=True, device="cuda", dtype=dtypes.fp32).to(dtypes.i32)
            last_output = op(tensor)

    # Build the table once; it is both printed and parsed for the totals.
    table = prof.key_averages().table(sort_by="cuda_time_total", row_limit=10)
    print(table)
    return last_output, _parse_profiler_totals(table)


# Reference implementation first, then the aiter implementation, on the same input.
result, torch_totals = _profile_op(torch.sigmoid, tensor0)
output, aiter_totals = _profile_op(aiter.sigmoid, tensor0)
# Exact comparison (same size and same elements) of the last output produced
# by each implementation.
result_equal = torch.equal(result, output)

# One row per profiler metric, one column per implementation. A metric that
# was not found in a profiler table shows up as a missing value.
metric_names = ("CPU total time", "CUDA total time")
summary = pd.DataFrame(
    [
        {
            "metric": metric_name,
            "torch.sigmoid": torch_totals.get(metric_name),
            "aiter.sigmoid": aiter_totals.get(metric_name),
        }
        for metric_name in metric_names
    ]
)
equal_msg = f"Whether the two outputs are equal: {result_equal}"

# Write the table first, then append the equality verdict as a trailing
# free-text line in the same file.
report_path = "test_aiter_sigmoid.csv"
summary.to_csv(report_path, index=False)
with open(report_path, "a", encoding="utf-8") as report_file:
    report_file.write(f"{equal_msg}\n")