utils.py 9.44 KB
Newer Older
Chenggang Zhao's avatar
Chenggang Zhao committed
1
import inspect
2
3
4
5
import json
import tempfile
from pathlib import Path

Chenggang Zhao's avatar
Chenggang Zhao committed
6
import numpy as np
Chenggang Zhao's avatar
Chenggang Zhao committed
7
8
9
10
import os
import sys
import torch
import torch.distributed as dist
Chenggang Zhao's avatar
Chenggang Zhao committed
11
from typing import Optional, Union
Chenggang Zhao's avatar
Chenggang Zhao committed
12
13
14
15
16
17
18
19


def init_dist(local_rank: int, num_local_ranks: int):
    """Initialize the NCCL process group for one local rank.

    The rendezvous settings are read from the environment:
    ``MASTER_ADDR`` (default ``127.0.0.1``), ``MASTER_PORT`` (default ``8361``),
    ``WORLD_SIZE`` (default ``1``) and ``RANK`` (default ``0``). Note that
    ``WORLD_SIZE`` and ``RANK`` are used here as the number of nodes and the
    node index; the global world size and rank are derived from them together
    with ``num_local_ranks`` and ``local_rank``.

    As side effects, the default dtype becomes ``torch.bfloat16``, the default
    device becomes ``'cuda'`` and the current CUDA device is set to
    ``local_rank``.

    Args:
        local_rank: index of this process within its node; also used as the
            CUDA device index.
        num_local_ranks: number of processes launched per node.

    Returns:
        A tuple ``(rank, world_size, group)`` where ``group`` is a new process
        group containing every rank.
    """
    # NOTES: you may rewrite this function with your own cluster settings
    ip = os.getenv('MASTER_ADDR', '127.0.0.1')
    port = int(os.getenv('MASTER_PORT', '8361'))
    num_nodes = int(os.getenv('WORLD_SIZE', 1))
    node_rank = int(os.getenv('RANK', 0))

    # `device_id` is only passed when the installed PyTorch accepts it
    sig = inspect.signature(dist.init_process_group)
    params = {
        'backend': 'nccl',
        'init_method': f'tcp://{ip}:{port}',
        'world_size': num_nodes * num_local_ranks,
        'rank': node_rank * num_local_ranks + local_rank,
    }
    if 'device_id' in sig.parameters:
        # noinspection PyTypeChecker
        params['device_id'] = torch.device(f'cuda:{local_rank}')
    dist.init_process_group(**params)
    torch.set_default_dtype(torch.bfloat16)
    torch.set_default_device('cuda')
    torch.cuda.set_device(local_rank)

    return dist.get_rank(), dist.get_world_size(), dist.new_group(list(range(num_local_ranks * num_nodes)))


def calc_diff(x: torch.Tensor, y: torch.Tensor):
    """Return a scalar dissimilarity between two tensors as a Python float.

    Both inputs are converted to float64 and shifted by one, then the result
    is ``1 - 2 * sum(x * y) / sum(x * x + y * y)``. Identical inputs give
    ``0.0``.
    """
    shifted_x = x.double() + 1
    shifted_y = y.double() + 1
    numerator = 2 * (shifted_x * shifted_y).sum()
    denominator = (shifted_x * shifted_x + shifted_y * shifted_y).sum()
    similarity = numerator / denominator
    return (1 - similarity).item()


46
47
48
49
def align_up(x, y):
    """Round ``x`` up to a multiple of ``y`` (for positive integer ``y``)."""
    num_chunks = (x + y - 1) // y
    return num_chunks * y


Chenggang Zhao's avatar
Chenggang Zhao committed
50
def per_token_cast_to_fp8(x: torch.Tensor):
    """Quantize a 2D tensor to ``float8_e4m3fn`` with one scale per 128 columns.

    Each row is zero-padded to a multiple of 128 columns and split into groups
    of 128. Every group is scaled so that its largest absolute value (clamped
    to at least ``1e-4``) maps to ``448.0``, the largest finite
    ``float8_e4m3fn`` value.

    Args:
        x: tensor of shape ``(m, n)``.

    Returns:
        A tuple ``(x_fp8, scales)``: the quantized values with the padding
        removed, shape ``(m, n)``, and the float32 dequantization scales
        (``amax / 448``), shape ``(m, ceil(n / 128))``.
    """
    assert x.dim() == 2
    m, n = x.shape
    aligned_n = align_up(n, 128)
    # Use an explicit group count: `-1` is ambiguous when the tensor is empty
    num_groups = aligned_n // 128
    x_padded = torch.nn.functional.pad(x, (0, aligned_n - n), mode='constant', value=0)
    x_padded_view = x_padded.view(m, num_groups, 128)
    x_amax = x_padded_view.abs().float().amax(dim=2).view(m, num_groups).clamp(1e-4)
    x_fp8 = (x_padded_view * (448.0 / x_amax.unsqueeze(2))).to(torch.float8_e4m3fn)
    return x_fp8.view(m, aligned_n)[:, :n].contiguous(), (x_amax / 448.0).view(m, num_groups)
Chenggang Zhao's avatar
Chenggang Zhao committed
58
59


lijian6's avatar
lijian6 committed
60
61
62
def per_token_cast_pg_back(x: torch.Tensor, x_scales: torch.Tensor):
    """Dequantize a 2D tensor using one scale per group of 128 columns.

    Args:
        x: quantized tensor of shape ``(m, n)``.
        x_scales: per-group scales, reshaped to ``(m, -1, 1)``. If the dtype
            is ``torch.int``, every byte is treated as a float32 exponent
            field: it is shifted left by 23 bits and reinterpreted as float32.

    Returns:
        A contiguous bfloat16 tensor of shape ``(m, n)``. An empty input is
        only converted to bfloat16.
    """
    if x.numel() == 0:
        return x.to(torch.bfloat16)

    assert x.dim() == 2
    m, n = x.shape
    aligned_n = align_up(n, 128)
    x_padded = torch.nn.functional.pad(x, (0, aligned_n - n), mode='constant', value=0)
    if x_scales.dtype == torch.int:
        # Unpack four byte-sized exponents per int32 and place each one in the float32 exponent bits
        x_scales = x_scales.view(dtype=torch.uint8).to(torch.int) << 23
        x_scales = x_scales.view(dtype=torch.float)
    x_fp32_padded = x_padded.to(torch.float32).view(m, -1, 128)
    x_scales = x_scales.view(m, -1, 1)
    return (x_fp32_padded * x_scales).view(x_padded.shape).to(torch.bfloat16)[:, :n].contiguous()
Chenggang Zhao's avatar
Chenggang Zhao committed
74

lijian6's avatar
lijian6 committed
75
def per_token_cast_pc_back(x_int8: torch.Tensor, x_scales: torch.Tensor):
    """Dequantize a 2D tensor by multiplying it elementwise with broadcast scales.

    Args:
        x_int8: quantized tensor of shape ``(m, n)``.
        x_scales: scales, reshaped to ``(m, -1, 1)`` and broadcast against the
            values padded to a multiple of 128 columns (presumably one scale
            per row; verify against the caller).

    Returns:
        A contiguous bfloat16 tensor of shape ``(m, n)``. An empty input is
        only converted to bfloat16.
    """
    if x_int8.numel() == 0:
        return x_int8.to(torch.bfloat16)

    assert x_int8.dim() == 2
    m, n = x_int8.shape
    aligned_n = align_up(n, 128)

    x_int8_padded = torch.nn.functional.pad(x_int8, (0, aligned_n - n), mode='constant', value=0)
    # One trailing unit dimension per element so the scales broadcast over it
    x_fp32_padded = x_int8_padded.to(torch.float32).view(m, -1, 1)
    x_scales = x_scales.view(m, -1, 1).to(torch.float32)
    x_deq = (x_fp32_padded * x_scales).view(m, aligned_n)
    return x_deq[:, :n].to(torch.bfloat16).contiguous()


Chenggang Zhao's avatar
Chenggang Zhao committed
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
def inplace_unique(x: torch.Tensor, num_slots: int):
    """Deduplicate every row of ``x`` in place.

    After the call each row holds its distinct non-negative values in
    descending order, followed by ``-1`` padding. Negative entries in the
    input are ignored. Values are expected to be indices below ``num_slots``.
    """
    assert x.dim() == 2
    num_rows, num_cols = x.shape

    # Send negative entries to an extra bin that is dropped after counting
    negative = x < 0
    slot_ids = x.masked_fill(negative, num_slots)
    counts = torch.zeros((num_rows, num_slots + 1), dtype=x.dtype, device=x.device)
    counts.scatter_add_(1, slot_ids, torch.ones_like(slot_ids))
    counts = counts[:, :num_slots]

    # Keep the indices of the non-empty bins and mark the rest with -1
    sorted_counts, sorted_slots = torch.sort(counts, dim=-1, descending=True)
    sorted_slots.masked_fill_(sorted_counts == 0, -1)
    unique_slots = torch.sort(sorted_slots, descending=True, dim=-1).values

    # Overwrite the input: reset to -1, then copy as many slots as fit
    x.fill_(-1)
    keep = min(num_slots, num_cols)
    x[:, :keep] = unique_slots[:, :keep]


def create_grouped_scores(scores: torch.Tensor, group_idx: torch.Tensor, num_groups: int):
    """Zero out the scores of every group not listed in ``group_idx``.

    The ``num_experts`` columns of ``scores`` are split into ``num_groups``
    equal groups; for each row only the groups whose indices appear in that
    row of ``group_idx`` keep their values. The result has the same shape as
    ``scores``.
    """
    num_tokens, num_experts = scores.shape
    grouped = scores.view(num_tokens, num_groups, -1)

    # Boolean (token, group) table of the selected groups
    selected = torch.zeros((num_tokens, num_groups), dtype=torch.bool, device=grouped.device)
    selected.scatter_(1, group_idx, True)

    keep = selected.unsqueeze(-1).expand_as(grouped)
    return (grouped * keep).view(num_tokens, num_experts)


113
def bench(fn, num_warmups: int = 50, num_tests: int = 50, post_fn=None):
    """Time ``fn`` on the current CUDA device using CUDA events.

    Args:
        fn: callable to benchmark, invoked without arguments.
        num_warmups: number of untimed calls made before measuring.
        num_tests: number of timed calls; the first sample is discarded, so
            this must be at least 2.
        post_fn: optional callable run after every timed call, outside the
            timed region.

    Returns:
        A tuple ``(average, minimum, maximum)`` of the kept samples, in seconds.

    Raises:
        ValueError: if ``num_tests`` is smaller than 2.
    """
    # The first timed sample is dropped below, so a single test would leave nothing to aggregate
    if num_tests < 2:
        raise ValueError(f'num_tests must be at least 2, got {num_tests}')

    # Allocate 256 MB of data used to flush the L2 cache
    torch.cuda.synchronize()
    cache = torch.empty(int(256e6 // 4), dtype=torch.int, device='cuda')

    # Warmup
    for _ in range(num_warmups):
        fn()

    # Flush L2
    cache.zero_()

    # Testing
    start_events = [torch.cuda.Event(enable_timing=True) for _ in range(num_tests)]
    end_events = [torch.cuda.Event(enable_timing=True) for _ in range(num_tests)]
    for i in range(num_tests):
        # Record
        start_events[i].record()
        fn()
        end_events[i].record()
        if post_fn is not None:
            post_fn()
    torch.cuda.synchronize()

    # `elapsed_time` reports milliseconds; convert to seconds and skip the first sample
    times = np.array([s.elapsed_time(e) / 1e3 for s, e in zip(start_events, end_events)])[1:]
    return np.average(times), np.min(times), np.max(times)


class empty_suppress:
    """No-op context manager: suppresses neither output nor exceptions."""

    def __enter__(self):
        return self

    def __exit__(self, *_):
        # Returning None lets any exception raised in the body propagate
        return None


class suppress_stdout_stderr:
    """Context manager that redirects stdout and stderr to the null device.

    The redirection happens both at the file-descriptor level (via
    ``os.dup2``) and at the Python level (``sys.stdout`` / ``sys.stderr``),
    so writes that bypass the Python stream objects are discarded as well.
    Everything is restored on exit.
    """

    def __enter__(self):
        # One handle on the null device per stream
        self.outnull_file = open(os.devnull, 'w')
        self.errnull_file = open(os.devnull, 'w')

        # Descriptors currently backing the Python streams (the originals, not duplicates)
        self.old_stdout_fileno_undup = sys.stdout.fileno()
        self.old_stderr_fileno_undup = sys.stderr.fileno()

        # Duplicates that keep the original targets alive so `__exit__` can restore them
        self.old_stdout_fileno = os.dup(sys.stdout.fileno())
        self.old_stderr_fileno = os.dup(sys.stderr.fileno())

        # Python-level stream objects to put back on exit
        self.old_stdout = sys.stdout
        self.old_stderr = sys.stderr

        # Point the original descriptors at the null device
        os.dup2(self.outnull_file.fileno(), self.old_stdout_fileno_undup)
        os.dup2(self.errnull_file.fileno(), self.old_stderr_fileno_undup)

        # Python-level writes go to the null device too
        sys.stdout = self.outnull_file
        sys.stderr = self.errnull_file
        return self

    def __exit__(self, *_):
        # Restore the Python-level streams first
        sys.stdout = self.old_stdout
        sys.stderr = self.old_stderr

        # Point the original descriptors back at their saved targets
        os.dup2(self.old_stdout_fileno, self.old_stdout_fileno_undup)
        os.dup2(self.old_stderr_fileno, self.old_stderr_fileno_undup)

        # The duplicates are no longer needed
        os.close(self.old_stdout_fileno)
        os.close(self.old_stderr_fileno)

        self.outnull_file.close()
        self.errnull_file.close()


Chenggang Zhao's avatar
Chenggang Zhao committed
184
def bench_kineto(fn, kernel_names: Union[str, tuple], num_tests: int = 30, suppress_kineto_output: bool = False,
                 trace_path: Optional[str] = None, barrier_comm_profiling: bool = False,
                 num_kernels_per_period: int = 1):
    """Measure average CUDA kernel durations of ``fn`` with the PyTorch profiler.

    ``fn`` is called ``num_tests`` times in each of two profiler steps; with
    the schedule used here the first step is skipped and the second one is
    recorded. Durations are then read from the profiler summary table.

    Args:
        fn: callable to profile, invoked without arguments.
        kernel_names: one kernel name or a tuple of names; each must match
            exactly one row of the summary table.
        num_tests: number of calls to ``fn`` per profiler step.
        suppress_kineto_output: silence stdout/stderr while profiling.
        trace_path: if given, a Chrome trace is exported to this path.
        barrier_comm_profiling: launch a large matmul and an all-reduce before
            each step to even out CPU launch overhead across ranks.
        num_kernels_per_period: if greater than 1, the recorded launches of
            each kernel are treated as a repeating pattern of this length and
            one average per position in the pattern is returned.

    Returns:
        Durations in seconds. A single value (or a list of per-position
        values) when ``kernel_names`` is a string, otherwise a list with one
        entry per kernel name.

    Raises:
        ValueError: if a duration in the summary table has an unknown unit.
    """
    # Profile
    suppress = suppress_stdout_stderr if suppress_kineto_output else empty_suppress
    with suppress():
        schedule = torch.profiler.schedule(wait=1, warmup=0, active=1, repeat=1)
        with torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CUDA], schedule=schedule) as prof:
            for _ in range(2):
                # NOTES: use a large kernel and a barrier to eliminate the unbalanced CPU launch overhead
                if barrier_comm_profiling:
                    lhs = torch.randn((8192, 8192), dtype=torch.float, device='cuda')
                    rhs = torch.randn((8192, 8192), dtype=torch.float, device='cuda')
                    lhs @ rhs
                    dist.all_reduce(torch.ones(1, dtype=torch.float, device='cuda'))
                for _ in range(num_tests):
                    fn()
                torch.cuda.synchronize()
                prof.step()

    # Parse the profiling table
    assert isinstance(kernel_names, (str, tuple))
    is_tuple = isinstance(kernel_names, tuple)
    prof_lines = prof.key_averages().table(sort_by='cuda_time_total', max_name_column_width=100).split('\n')
    kernel_names = kernel_names if is_tuple else (kernel_names, )
    assert all(isinstance(name, str) for name in kernel_names)
    for name in kernel_names:
        assert sum(name in line for line in prof_lines) == 1, f'Errors of the kernel {name} in the profiling table'

    # Save chrome traces
    if trace_path is not None:
        prof.export_chrome_trace(trace_path)

    # Return average kernel durations
    # NOTES: `s` must come last, as it is also a substring of `ms` and `us`
    units = {'ms': 1e3, 'us': 1e6, 's': 1.0}
    kernel_durations = []
    for name in kernel_names:
        # The assertion above guarantees exactly one matching row
        line = next(line for line in prof_lines if name in line)
        # The second-to-last column holds the average CUDA time
        time_str = line.split()[-2]
        for unit, scale in units.items():
            if unit in time_str:
                kernel_durations.append(float(time_str.replace(unit, '')) / scale)
                break
        else:
            # Never skip silently: the results would no longer line up with `kernel_names`
            raise ValueError(f'Unknown time unit in {time_str!r} for the kernel {name}')

    # Expand the kernels by periods
    if num_kernels_per_period > 1:
        with tempfile.NamedTemporaryFile(suffix='.json') as tmp:
            prof.export_chrome_trace(tmp.name)
            profile_data = json.loads(Path(tmp.name).read_text())

        for i, kernel_name in enumerate(kernel_names):
            events = [event for event in profile_data['traceEvents'] if f'::{kernel_name}' in event['name']]
            events = sorted(events, key=lambda event: event['ts'])
            # Trace durations are in microseconds
            durations = [event['dur'] / 1e6 for event in events]
            assert len(durations) > 0, f'No trace events found for the kernel {kernel_name}'
            assert len(durations) % num_kernels_per_period == 0
            num_kernel_patterns = len(durations) // num_kernels_per_period
            kernel_durations[i] = [sum(durations[j::num_kernels_per_period]) / num_kernel_patterns
                                   for j in range(num_kernels_per_period)]

    # Return execution durations
    return kernel_durations if is_tuple else kernel_durations[0]
Chenggang Zhao's avatar
Chenggang Zhao committed
247

Chenggang Zhao's avatar
Chenggang Zhao committed
248
249

def hash_tensor(t: torch.Tensor):
    """Return a cheap integer checksum of a tensor.

    The tensor's data is reinterpreted as 32-bit integers and summed, so the
    result depends on the raw bits rather than on the numeric values.
    """
    return t.view(torch.int).sum().item()