import math as _math
import time as _time

import numpy as _numpy

import cupy as _cupy
from cupy_backends.cuda.api import runtime


class _PerfCaseResult:
    """ An obscure object encompassing timing results recorded by
    :func:`~cupyx.profiler.benchmark`. Simple statistics can be obtained by
    converting an instance of this class to a string.

    .. warning::
        This API is currently experimental and subject to change in future
        releases.

    """

    def __init__(self, name, ts, devices):
        assert ts.ndim == 2
        assert ts.shape[0] == len(devices) + 1
        assert ts.shape[1] > 0
        self.name = name
        self._ts = ts
        self._devices = devices

    def __repr__(self) -> str:
        """ Returns a string representation of the object.

        Returns:
            str: A string representation of the object.
        """
        return self.to_str(show_gpu=True)

    @property
    def cpu_times(self) -> _numpy.ndarray:
        """A :class:`numpy.ndarray` of shape ``(n_repeat,)``, holding times spent
        on CPU in seconds.

        These values are delta of the host-side performance counter
        (:func:`time.perf_counter`) between each repeat step.
        """  # NOQA
        return self._ts[0]

    @property
    def gpu_times(self) -> _numpy.ndarray:
        """A :class:`numpy.ndarray` of shape ``(len(devices), n_repeat)``,
        holding times spent on GPU in seconds.

        These values are measured using ``cudaEventElapsedTime`` with events
        recoreded before/after each repeat step.
        """
        return self._ts[1:]

    @staticmethod
    def _to_str_per_item(device_name, t):
        assert t.ndim == 1
        assert t.size > 0
        t_us = t * 1e6

        s = '    {}: {:9.03f} us'.format(device_name, t_us.mean())
        if t.size > 1:
            s += '   +/- {:6.03f} (min: {:9.03f} / max: {:9.03f}) us'.format(
                t_us.std(), t_us.min(), t_us.max())
        return s

    def to_str(self, show_gpu=False):
        results = [self._to_str_per_item('CPU', self._ts[0])]
        if show_gpu:
            for i, d in enumerate(self._devices):
                results.append(
                    self._to_str_per_item('GPU-{}'.format(d),
                                          self._ts[1 + i]))
        return '{:<20s}:{}'.format(self.name, ' '.join(results))

    def __str__(self):
        return self.to_str(show_gpu=True)


def benchmark(
        func, args=(), kwargs=None, n_repeat=10000, *,
        name=None, n_warmup=10, max_duration=_math.inf, devices=None):
    """ Timing utility for measuring time spent by both CPU and GPU.

    This function is a very convenient helper for setting up a timing test. The
    GPU time is properly recorded by synchronizing internal streams. As a
    result, to time a multi-GPU function all participating devices must be
    passed as the ``devices`` argument so that this helper knows which devices
    to record. A simple example is given as follows:

    .. code-block:: py

        import cupy as cp
        from cupyx.profiler import benchmark

        def f(a, b):
            return 3 * cp.sin(-a) * b

        a = 0.5 - cp.random.random((100,))
        b = cp.random.random((100,))
        print(benchmark(f, (a, b), n_repeat=1000))


    Args:
        func (callable): a callable object to be timed.
        args (tuple): positional arguments to be passed to the callable.
        kwargs (dict): keyword arguments to be passed to the callable. If not
            given, no keyword arguments are passed.
        n_repeat (int): number of times the callable is called. Increasing
            this value would improve the collected statistics at the cost
            of longer test time.
        name (str): the function name to be reported. If not given, the
            callable's ``__name__`` attribute is used.
        n_warmup (int): number of times the callable is called before the
            timed runs. The warm-up runs are not timed.
        max_duration (float): the maximum time (in seconds) that the entire
            test can use. If the taken time is longer than this limit, the test
            is stopped and the statistics collected up to the breakpoint is
            reported.
        devices (tuple): a tuple of device IDs (int) that will be timed during
            the timing test. If not given, the current device is used.

    Returns:
        :class:`~cupyx.profiler._time._PerfCaseResult`:
            an object collecting all test results.

    Raises:
        ValueError: if any argument has an unexpected type or value.

    """

    # Validate ``func`` before touching ``func.__name__``; otherwise a
    # non-callable without that attribute would raise AttributeError
    # instead of the documented ValueError.
    if not callable(func):
        raise ValueError('`func` should be a callable object.')

    if name is None:
        name = func.__name__

    # ``None`` sentinel avoids a mutable default argument; an explicitly
    # passed dict is used unchanged.
    if kwargs is None:
        kwargs = {}

    if not isinstance(args, tuple):
        raise ValueError('`args` should be of tuple type.')
    if not isinstance(kwargs, dict):
        raise ValueError('`kwargs` should be of dict type.')
    if not isinstance(n_repeat, int):
        raise ValueError('`n_repeat` should be an integer.')
    if not isinstance(name, str):
        raise ValueError('`name` should be a string.')
    if not isinstance(n_warmup, int):
        raise ValueError('`n_warmup` should be an integer.')
    if not _numpy.isreal(max_duration):
        raise ValueError('`max_duration` should be given in seconds')

    # Resolve the device default only after the cheap validations, so bad
    # arguments fail fast without touching the CUDA runtime.
    if devices is None:
        devices = (_cupy.cuda.get_device_id(),)
    if not isinstance(devices, tuple):
        raise ValueError('`devices` should be of tuple type')

    return _repeat(
        func, args, kwargs, n_repeat, name, n_warmup, max_duration, devices)


def _repeat(
        func, args, kwargs, n_repeat, name, n_warmup, max_duration, devices):
    """Core timing loop behind :func:`benchmark`.

    Runs ``func`` ``n_warmup`` untimed times, then up to ``n_repeat`` timed
    times (stopping early once ``max_duration`` seconds have elapsed), and
    collects per-repeat CPU and per-device GPU timings.

    Returns:
        _PerfCaseResult: collected CPU and GPU times in seconds.
    """

    def _with_device(device, action):
        # Run ``action()`` with ``device`` current, restoring the previously
        # current device even if ``action`` raises.
        prev_device = runtime.getDevice()
        try:
            runtime.setDevice(device)
            action()
        finally:
            runtime.setDevice(prev_device)

    # One start/stop event pair per participating device.
    events_1 = []
    events_2 = []
    for device in devices:
        def _make_events():
            events_1.append(_cupy.cuda.stream.Event())
            events_2.append(_cupy.cuda.stream.Event())
        _with_device(device, _make_events)

    # Warm-up runs are executed but never timed.
    for _ in range(n_warmup):
        func(*args, **kwargs)

    # Ensure all warm-up work has drained on every device before timing.
    for event, device in zip(events_1, devices):
        _with_device(device, event.record)
        event.synchronize()

    cpu_times = []
    gpu_times = [[] for _ in events_1]
    duration = 0
    for _ in range(n_repeat):
        # Record the "start" event on each device.
        for event, device in zip(events_1, devices):
            _with_device(device, event.record)

        t1 = _time.perf_counter()

        func(*args, **kwargs)

        t2 = _time.perf_counter()
        cpu_times.append(t2 - t1)

        # Record the "stop" event on each device, then wait for all of them.
        for event, device in zip(events_2, devices):
            _with_device(device, event.record)
        for event, device in zip(events_2, devices):
            _with_device(device, event.synchronize)
        for idx, (ev1, ev2) in enumerate(zip(events_1, events_2)):
            # cudaEventElapsedTime reports milliseconds; convert to seconds.
            gpu_time = _cupy.cuda.get_elapsed_time(ev1, ev2) * 1e-3
            gpu_times[idx].append(gpu_time)

        duration += _time.perf_counter() - t1
        if duration > max_duration:
            # Time budget exhausted; report what was collected so far.
            break

    ts = _numpy.asarray([cpu_times] + gpu_times, dtype=_numpy.float64)
    return _PerfCaseResult(name, ts, devices=devices)