# test_aiter_sigmoid.py

# SPDX-License-Identifier: MIT
import torch
import aiter
import pandas as pd
from aiter import dtypes

# from ater.test_common import checkAllclose, perftest
from torch.profiler import profile, ProfilerActivity

# input shape: torch.Size([4096, 64, 160]) (20480, 1, 128)
# other shape: torch.Size([4096, 64, 160]) (10240, 160, 1)

# input shape: torch.Size([4096, 64, 160]) (47360, 1, 296)
# other shape: torch.Size([4096, 64, 160]) (10240, 160, 1)

# Benchmark input layout: a 2-D tensor whose strides (880, 1) make it
# contiguous in row-major order for shape (4096, 880).
shape0 = (4096, 880)
stride0 = (880, 1)

# Smaller alternative configuration (disabled):
# shape0 = (16,16)
# stride0 = (16, 1)

# Allocate on the GPU with explicit strides, then fill it with uniform
# random samples generated on the CPU; copy_ performs the device transfer
# and the conversion to the tensor's dtype.
tensor0 = torch.empty_strided(shape0, stride0, dtype=dtypes.fp16, device="cuda")
random_data0 = torch.rand(shape0)
tensor0.copy_(random_data0)
# tensor0.fill_(1)

# Echo the configuration so profiler output can be matched to its input.
print("shape0", shape0)
print("stride0:", stride0)

def get_profiler_totals(prof):
    """Extract the total-time footer values from a profiler summary table.

    The table is rendered via ``prof.key_averages().table(...)`` (sorted by
    ``cuda_time_total``, limited to 10 rows) and scanned line by line for
    the ``Self CPU time total:`` and ``Self CUDA time total:`` footers.

    Args:
        prof: Object exposing ``key_averages().table(sort_by=..., row_limit=...)``
            that returns the summary as a string.

    Returns:
        dict: Maps ``"CPU total time"`` and/or ``"CUDA total time"`` to the
        text following the first colon of the matching footer line, with
        surrounding whitespace stripped. A key is absent when no line starts
        with its footer prefix.
    """
    # Footer prefix -> key used in the returned dictionary.
    footer_keys = {
        "Self CPU time total:": "CPU total time",
        "Self CUDA time total:": "CUDA total time",
    }
    report = prof.key_averages().table(sort_by="cuda_time_total", row_limit=10)

    totals = {}
    for row in report.splitlines():
        for prefix, key in footer_keys.items():
            if row.startswith(prefix):
                # Split on the first colon only so the value may contain colons.
                totals[key] = row.split(":", 1)[1].strip()
                break
    return totals

# Baseline run: profile 100 consecutive torch.sigmoid calls on tensor0,
# recording CPU and CUDA activity plus memory, stack, module and shape data.
# NOTE(review): no warm-up call precedes the profiled loop, so any one-time
# cost of the first call is included in the totals -- confirm this is intended.
with profile(
    record_shapes=True,
    with_modules=True,
    with_stack=True,
    profile_memory=True,
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
) as prof:
    for _ in range(100):
        # cache_flush1 = torch.randn(10000, 10000, requires_grad=True, device="cuda", dtype=dtypes.fp32).to(dtypes.i32)
        # `result` keeps the output of the last iteration.
        result = torch.sigmoid(tensor0)

# Show the ten most expensive entries by total CUDA time, then keep the
# footer totals for the summary.
torch_report = prof.key_averages().table(sort_by="cuda_time_total", row_limit=10)
print(torch_report)
torch_totals = get_profiler_totals(prof)

# Candidate run: profile 100 consecutive aiter.sigmoid calls on tensor0,
# recording CPU and CUDA activity plus memory, stack, module and shape data.
# NOTE(review): no warm-up call precedes the profiled loop, so any one-time
# cost of the first call is included in the totals -- confirm this is intended.
with profile(
    record_shapes=True,
    with_modules=True,
    with_stack=True,
    profile_memory=True,
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
) as prof:
    for _ in range(100):
        # cache_flush1 = torch.randn(10000, 10000, requires_grad=True, device="cuda", dtype=dtypes.fp32).to(dtypes.i32)
        # `output` keeps the output of the last iteration.
        output = aiter.sigmoid(tensor0)

# Show the ten most expensive entries by total CUDA time, then keep the
# footer totals for the summary.
aiter_report = prof.key_averages().table(sort_by="cuda_time_total", row_limit=10)
print(aiter_report)
aiter_totals = get_profiler_totals(prof)

# Compare the last outputs of both implementations.
# NOTE(review): torch.equal is an exact element-wise comparison; confirm that
# a tolerance-based check is not what is wanted for fp16 results.
result_equal = torch.equal(result, output)

# One row per metric, with the torch and aiter totals side by side. A total
# that was not found in the profiler table shows up as None.
summary = pd.DataFrame(
    [
        {
            "metric": metric,
            "torch.sigmoid": torch_totals.get(metric),
            "aiter.sigmoid": aiter_totals.get(metric),
        }
        for metric in ("CPU total time", "CUDA total time")
    ]
)

equal_msg = f"Whether the two outputs are equal: {result_equal}"

# Write the table first, then append the comparison verdict as a trailing
# free-text line (the file is therefore not strictly CSV after this step).
summary.to_csv("test_aiter_sigmoid.csv", index=False)
with open("test_aiter_sigmoid.csv", "a", encoding="utf-8") as csv_file:
    csv_file.write(f"{equal_msg}\n")