from typing import Callable

import torch

import nerfacc


class Profiler:
    """Measure the average per-call cost of a callable with ``torch.profiler``.

    The callable is first invoked ``warmup`` times outside the profiler and
    then ``repeat`` times inside it. CUDA activity is recorded only when a
    CUDA device is available; on CPU-only hosts the CUDA figures are 0.

    Args:
        warmup: Number of un-profiled calls made before measuring.
        repeat: Number of profiled calls; must be at least 1 because the
            time totals are divided by it.
    """

    def __init__(self, warmup: int = 10, repeat: int = 1000):
        self.warmup = warmup
        self.repeat = repeat

    @staticmethod
    def _stat(event, *names):
        """Return the first attribute in ``names`` that ``event`` provides.

        Recent PyTorch releases expose the accelerator statistics under
        ``device`` names and deprecate the ``cuda`` ones, so callers pass the
        new name first and the legacy name as a fallback.

        Raises:
            AttributeError: If ``event`` has none of the attributes.
        """
        for name in names:
            if hasattr(event, name):
                return getattr(event, name)
        raise AttributeError(
            f"{type(event).__name__!r} object has none of the attributes "
            f"{names!r}"
        )

    def __call__(self, func: Callable) -> tuple:
        """Profile ``func`` and return its averaged statistics.

        Args:
            func: Zero-argument callable to benchmark.

        Returns:
            A tuple ``(self_cpu_time, self_cuda_time, self_cuda_memory)``:
            the summed self CPU time and self CUDA time of all recorded
            events divided by ``repeat`` (both in us), and the largest self
            CUDA memory usage among the aggregated events (in bytes).

        Raises:
            ValueError: If ``repeat`` is smaller than 1.
        """
        if self.repeat < 1:
            raise ValueError(f"repeat must be >= 1, got {self.repeat}")

        use_cuda = torch.cuda.is_available()

        # warmup
        for _ in range(self.warmup):
            func()
        if use_cuda:
            # Wait for queued warmup kernels so they are not attributed to
            # the profiled region.
            torch.cuda.synchronize()

        activities = [torch.profiler.ProfilerActivity.CPU]
        if use_cuda:
            activities.append(torch.profiler.ProfilerActivity.CUDA)

        # profile
        with torch.profiler.profile(
            activities=activities,
            profile_memory=True,
        ) as prof:
            for _ in range(self.repeat):
                func()
            if use_cuda:
                # Make sure every launched kernel finishes inside the
                # profiling window.
                torch.cuda.synchronize()

        # return
        events = prof.key_averages()
        self_cpu_time_total = (
            sum(event.self_cpu_time_total for event in events) / self.repeat
        )
        self_cuda_time_total = (
            sum(
                self._stat(
                    event, "self_device_time_total", "self_cuda_time_total"
                )
                for event in events
            )
            / self.repeat
        )
        # default=0 keeps an empty event list from raising in max().
        self_cuda_memory_usage = max(
            (
                self._stat(
                    event,
                    "self_device_memory_usage",
                    "self_cuda_memory_usage",
                )
                for event in events
            ),
            default=0,
        )
        return (
            self_cpu_time_total,  # in us
            self_cuda_time_total,  # in us
            self_cuda_memory_usage,  # in bytes
        )


def main():
    """Profile ``nerfacc.contract`` on random points and print the timings.

    Builds a ``[1024, 3]`` tensor of uniform random values on ``cuda:0``
    (seeded with 42), runs the contraction through ``Profiler`` and prints
    the CPU time, CUDA time and CUDA memory figure it returns.
    """
    device = "cuda:0"
    profiler = Profiler(warmup=10, repeat=1000)
    torch.manual_seed(42)

    # contract
    print("* contract")
    points = torch.rand([1024, 3], device=device)
    roi = torch.tensor(
        [0, 0, 0, 1, 1, 1],
        dtype=torch.float32,
        device=device,
    )

    def run_contract():
        # Workload under test; inputs are captured from the enclosing scope.
        return nerfacc.contract(
            points, roi=roi, type=nerfacc.ContractionType.UN_BOUNDED_TANH
        )

    cpu_t, cuda_t, cuda_bytes = profiler(run_contract)
    print(f"{cpu_t:.2f} us, {cuda_t:.2f} us, {cuda_bytes / 1024 / 1024:.2f} MB")


# Run the benchmark only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()