profiler.py 6.68 KB
Newer Older
PengGao's avatar
PengGao committed
1
import asyncio
2
import threading
PengGao's avatar
PengGao committed
3
import time
PengGao's avatar
PengGao committed
4
from functools import wraps
PengGao's avatar
PengGao committed
5
6

import torch
helloyongyang's avatar
helloyongyang committed
7
import torch.distributed as dist
root's avatar
root committed
8
from loguru import logger
9

PengGao's avatar
PengGao committed
10
from lightx2v.utils.envs import *
11
12
13
from lightx2v_platform.base.global_var import AI_DEVICE

# Backend namespace of torch selected by AI_DEVICE (looked up as an attribute
# of the torch package). Its synchronize() is called before every timestamp
# taken in this module.
# NOTE(review): presumably AI_DEVICE is a device-type string such as "cuda" —
# confirm against lightx2v_platform.base.global_var.
torch_device_module = getattr(torch, AI_DEVICE)
14
15
16
17
18
19
20
# Thread-local holder for the excluded-time stack: each thread that touches the
# profiler gets its own independent ``stack`` attribute.
_excluded_time_local = threading.local()


def _get_excluded_time_stack():
    """Return the calling thread's excluded-time stack.

    The list is created lazily the first time a given thread asks for it and
    the very same list object is returned on every later call from that thread.
    """
    try:
        return _excluded_time_local.stack
    except AttributeError:
        # First access from this thread: install an empty stack.
        _excluded_time_local.stack = []
        return _excluded_time_local.stack
PengGao's avatar
PengGao committed
21

22

PengGao's avatar
PengGao committed
23
class _ProfilingContext:
    """Timer usable as a sync/async context manager or as a decorator.

    On entry the device is synchronized, the start time is recorded and a
    fresh ``0.0`` accumulator is pushed onto the per-thread excluded-time
    stack. On exit the device is synchronized again, the accumulator is popped
    and subtracted from the raw duration, and the result is optionally handed
    to ``metrics_func`` and/or written to the log.

    The start time is stored on the instance (``self.start_time``), so one
    instance tracks a single open region at a time.
    """

    def __init__(self, name, recorder_mode=0, metrics_func=None, metrics_labels=None):
        """
        Args:
            name: Label printed in the log line.
            recorder_mode: Controls recorder and logger:
                0 - disable recorder
                1 - enable recorder
                2 - enable recorder and force disable logger
            metrics_func: Optional object receiving the measurement through
                ``observe(elapsed)``; when ``metrics_labels`` is truthy,
                ``labels(*metrics_labels).observe(elapsed)`` is used instead.
            metrics_labels: Optional iterable of label values, unpacked into
                ``metrics_func.labels``.
        """
        self.name = name
        self.rank_info = self._current_rank_info()
        self.enable_recorder = recorder_mode > 0
        self.enable_logger = recorder_mode <= 1
        self.metrics_func = metrics_func
        self.metrics_labels = metrics_labels

    @staticmethod
    def _current_rank_info():
        """Describe the current process: its rank, or "Single GPU" if dist is not initialized."""
        if dist.is_initialized():
            return f"Rank {dist.get_rank()}"
        return "Single GPU"

    def _start(self):
        """Begin a timed region (shared by the sync and async entry points)."""
        torch_device_module.synchronize()
        self.start_time = time.perf_counter()
        # Accumulator that excluded contexts add to while this region is open.
        _get_excluded_time_stack().append(0.0)

    def _finish(self):
        """Close the timed region, then record and/or log the net duration."""
        torch_device_module.synchronize()
        total_elapsed = time.perf_counter() - self.start_time
        excluded = _get_excluded_time_stack().pop()
        elapsed = total_elapsed - excluded

        if self.enable_recorder and self.metrics_func:
            if self.metrics_labels:
                self.metrics_func.labels(*self.metrics_labels).observe(elapsed)
            else:
                self.metrics_func.observe(elapsed)

        if self.enable_logger:
            # Re-resolve the rank here: when the instance is used as a
            # decorator it is constructed at definition time, which may be
            # before torch.distributed has been initialized.
            self.rank_info = self._current_rank_info()
            logger.info(f"[Profile] {self.rank_info} - {self.name} cost {elapsed:.6f} seconds")

    def __enter__(self):
        self._start()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self._finish()
        # Never suppress an exception raised inside the timed region.
        return False

    async def __aenter__(self):
        self._start()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        self._finish()
        return False

    def __call__(self, func):
        """Decorate ``func`` so that every call runs inside this context.

        Coroutine functions are wrapped with an ``async with`` wrapper, all
        other callables with a plain ``with`` wrapper.
        """
        if asyncio.iscoroutinefunction(func):

            @wraps(func)
            async def async_wrapper(*args, **kwargs):
                async with self:
                    return await func(*args, **kwargs)

            return async_wrapper
        else:

            @wraps(func)
            def sync_wrapper(*args, **kwargs):
                with self:
                    return func(*args, **kwargs)

            return sync_wrapper


class _NullContext:
    """No-op stand-in with the same usage surface as the profiling contexts.

    It accepts and ignores any constructor arguments, works with both ``with``
    and ``async with``, and when used as a decorator returns the function
    unchanged, so call sites need no branching when profiling is turned off.
    """

    # Context manager without decision branch logic overhead
    def __init__(self, *args, **kwargs):
        pass

    def __enter__(self):
        return self

    def __exit__(self, *args):
        # False: exceptions from the body propagate untouched.
        return False

    async def __aenter__(self):
        return self

    async def __aexit__(self, *args):
        return False

    def __call__(self, func):
        # Decorator form: hand back the original callable, no wrapper added.
        return func

119

120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
class _ExcludedProfilingContext:
    """Mark a time span that should be excluded from the enclosing profiling contexts.

    On exit, the duration of the span is added to every accumulator currently
    on the calling thread's excluded-time stack. Usable as a sync/async
    context manager or as a decorator.

    NOTE(review): every instance adds its full duration to all open
    accumulators, so if excluded contexts are nested the inner span appears to
    be counted twice — confirm whether nesting occurs in practice.
    """

    def __init__(self, name=None):
        """
        Args:
            name: Optional label; a log line is emitted on exit only when it
                is truthy and ``CHECK_PROFILING_DEBUG_LEVEL(1)`` holds.
        """
        self.name = name
        self.rank_info = self._current_rank_info()

    @staticmethod
    def _current_rank_info():
        """Describe the current process: its rank, or "Single GPU" if dist is not initialized."""
        if dist.is_initialized():
            return f"Rank {dist.get_rank()}"
        return "Single GPU"

    def _start(self):
        """Begin the excluded span (shared by the sync and async entry points)."""
        torch_device_module.synchronize()
        self.start_time = time.perf_counter()

    def _finish(self):
        """Close the excluded span and charge it to every open accumulator."""
        torch_device_module.synchronize()
        elapsed = time.perf_counter() - self.start_time
        stack = _get_excluded_time_stack()
        # Update in place so the shared per-thread list object is preserved.
        for index, accumulated in enumerate(stack):
            stack[index] = accumulated + elapsed
        if self.name and CHECK_PROFILING_DEBUG_LEVEL(1):
            # Re-resolve the rank here: a decorator instance is built at
            # definition time, possibly before torch.distributed is initialized.
            self.rank_info = self._current_rank_info()
            logger.info(f"[Profile-Excluded] {self.rank_info} - {self.name} cost {elapsed:.6f} seconds (excluded from outer profiling)")

    def __enter__(self):
        self._start()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self._finish()
        # Never suppress an exception raised inside the span.
        return False

    async def __aenter__(self):
        self._start()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        self._finish()
        return False

    def __call__(self, func):
        """Decorate ``func`` so that every call runs inside this context.

        Coroutine functions are wrapped with an ``async with`` wrapper, all
        other callables with a plain ``with`` wrapper.
        """
        if asyncio.iscoroutinefunction(func):

            @wraps(func)
            async def async_wrapper(*args, **kwargs):
                async with self:
                    return await func(*args, **kwargs)

            return async_wrapper
        else:

            @wraps(func)
            def sync_wrapper(*args, **kwargs):
                with self:
                    return func(*args, **kwargs)

            return sync_wrapper


179
180
181
class _ProfilingContextL1(_ProfilingContext):
    """Level 1 profiling context with Level1_Log prefix."""

    def __init__(self, name, recorder_mode=0, metrics_func=None, metrics_labels=None):
        """Prefix ``name`` with ``Level1_Log`` and forward every argument to the base class."""
        super().__init__(f"Level1_Log {name}", recorder_mode, metrics_func, metrics_labels)
184
185
186
187
188


class _ProfilingContextL2(_ProfilingContext):
    """Level 2 profiling context with Level2_Log prefix."""

    def __init__(self, name, recorder_mode=0, metrics_func=None, metrics_labels=None):
        """Prefix ``name`` with ``Level2_Log`` and forward every argument to the base class."""
        super().__init__(f"Level2_Log {name}", recorder_mode, metrics_func, metrics_labels)
191
192
193
194
195
196
197
198
199


"""
PROFILING_DEBUG_LEVEL=0: [Default] disable all profiling
PROFILING_DEBUG_LEVEL=1: enable ProfilingContext4DebugL1
PROFILING_DEBUG_LEVEL=2: enable ProfilingContext4DebugL1 and ProfilingContext4DebugL2
"""
# Public aliases, chosen once when this module is imported. A name whose level
# check fails is bound to _NullContext, so its call sites become no-ops.
ProfilingContext4DebugL1 = _ProfilingContextL1 if CHECK_PROFILING_DEBUG_LEVEL(1) else _NullContext  # if user >= 1, enable profiling
ProfilingContext4DebugL2 = _ProfilingContextL2 if CHECK_PROFILING_DEBUG_LEVEL(2) else _NullContext  # if user >= 2, enable profiling
# Exclusion markers are enabled under the same check as level-1 profiling.
ExcludedProfilingContext = _ExcludedProfilingContext if CHECK_PROFILING_DEBUG_LEVEL(1) else _NullContext