# run_profiler.py
from typing import Callable

import torch
import tqdm

import nerfacc

# timing
# https://github.com/pytorch/pytorch/commit/d2784c233bfc57a1d836d961694bcc8ec4ed45e4


class Profiler:
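    """Profile a callable with torch.profiler.

    Runs `warmup` untimed calls, then profiles `repeat` calls and returns the
    per-call average self CPU time (us), self CUDA time (us), and the largest
    per-operator self CUDA memory usage (bytes).
    """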
    def __init__(self, warmup=10, repeat=1000):
        self.warmup = warmup
        self.repeat = repeat

    def __call__(self, func: Callable):
        # warmup
        for _ in range(self.warmup):
            func()
        torch.cuda.synchronize()

        # profile
        with torch.profiler.profile(
            activities=[
                torch.profiler.ProfilerActivity.CPU,
                torch.profiler.ProfilerActivity.CUDA,
            ],
            profile_memory=True,
        ) as prof:
            for _ in range(self.repeat):
                func()
            torch.cuda.synchronize()

        # return
        events = prof.key_averages()
        # print(events.table(sort_by="self_cpu_time_total", row_limit=10))
        self_cpu_time_total = (
            sum(event.self_cpu_time_total for event in events) / self.repeat
        )
        self_cuda_time_total = (
            sum(event.self_cuda_time_total for event in events) / self.repeat
        )
        self_cuda_memory_usage = max(
            event.self_cuda_memory_usage for event in events
        )
        return (
            self_cpu_time_total,  # in us
            self_cuda_time_total,  # in us
            self_cuda_memory_usage,  # in bytes
        )


def main():
    device = "cuda:0"
    torch.manual_seed(42)
    profiler = Profiler(warmup=10, repeat=100)

    # # contract
    # print("* contract")
    # x = torch.rand([1024, 3], device=device)
    # roi = torch.tensor([0, 0, 0, 1, 1, 1], dtype=torch.float32, device=device)
    # fn = lambda: nerfacc.contract(
    #     x, roi=roi, type=nerfacc.ContractionType.UN_BOUNDED_TANH
    # )
    # cpu_t, cuda_t, cuda_bytes = profiler(fn)
    # print(f"{cpu_t:.2f} us, {cuda_t:.2f} us, {cuda_bytes / 1024 / 1024:.2f} MB")

    # rendering
    print("* rendering")
    batch_size = 81920
    rays_o = torch.rand((batch_size, 3), device=device)
    rays_d = torch.randn((batch_size, 3), device=device)
    rays_d = rays_d / rays_d.norm(dim=-1, keepdim=True)

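    # March rays with a fixed step size to get packed samples: per-sample ray
    # indices plus the start/end distance of each segment along its ray.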
    ray_indices, t_starts, t_ends = nerfacc._ray_marching(
        rays_o,
        rays_d,
        near_plane=0.1,
        far_plane=1.0,
        render_step_size=1e-1,
    )
    sigmas = torch.randn_like(t_starts, requires_grad=True)
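    # Profile forward + backward of nerfacc.render_weight_from_density on the
    # packed samples.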
    fn = (
        lambda: nerfacc.render_weight_from_density(
            ray_indices, t_starts, t_ends, sigmas
        )
        .sum()
        .backward()
    )
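    # Quick wall-clock pass (tqdm reports it/s) before the detailed profiler run.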
    fn()
    torch.cuda.synchronize()
    for _ in tqdm.tqdm(range(100)):
        fn()
        torch.cuda.synchronize()

    cpu_t, cuda_t, cuda_bytes = profiler(fn)
    print(f"{cpu_t:.2f} us, {cuda_t:.2f} us, {cuda_bytes / 1024 / 1024:.2f} MB")

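    # Same forward + backward, but through the private _RenderingDensity autograd
    # Function, which consumes packed_info directly.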
    packed_info = nerfacc.pack_info(ray_indices, n_rays=batch_size)
    fn = (
        lambda: nerfacc._vol_rendering._RenderingDensity.apply(
            packed_info, t_starts, t_ends, sigmas, 0
        )
        .sum()
        .backward()
    )
    fn()
    torch.cuda.synchronize()
    for _ in tqdm.tqdm(range(100)):
        fn()
        torch.cuda.synchronize()
    cpu_t, cuda_t, cuda_bytes = profiler(fn)
    print(f"{cpu_t:.2f} us, {cuda_t:.2f} us, {cuda_bytes / 1024 / 1024:.2f} MB")


if __name__ == "__main__":
    main()