"""Profile one forward/backward pass of ``hipdnn.TorchPReLU`` under float16 autocast."""

import hipdnn
import torch
from torch.profiler import profile, ProfilerActivity


def main() -> None:
    """Run a single profiled forward/backward pass and print the op table.

    A random float32 tensor is created on the ``"cuda"`` device, passed
    through ``hipdnn.TorchPReLU`` under float16 autocast, and back-propagated.
    The profiler's averaged results, grouped by input shape and sorted by
    ``self_cuda_time_total``, are printed to stdout.
    """
    # Input dimensions
    batch = 128  # Batch size
    channels = 64  # Number of input channels
    height = 112  # Height
    width = 112  # Width

    model = hipdnn.TorchPReLU()

    input_type = torch.float32
    x = torch.rand(batch, channels, height, width, dtype=input_type, device="cuda")

    with profile(
        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
        record_shapes=True,
    ) as prof:
        # Autocast wraps only the forward pass; running backward under
        # autocast is discouraged by the PyTorch documentation.
        with torch.autocast(device_type="cuda", dtype=torch.float16):
            y = model(x)

        # x is reused as the upstream gradient, which requires y to have the
        # same shape as x -- presumably the model is element-wise; TODO confirm.
        y.backward(x)

        # Wait for all queued GPU work before the profiler stops collecting,
        # so every launched kernel finishes inside the profiling window.
        torch.cuda.synchronize()

    print(prof.key_averages(group_by_input_shape=True).table(sort_by="self_cuda_time_total"))


if __name__ == "__main__":
    main()