import inspect
import json
import tempfile
from pathlib import Path

import numpy as np
import os
import sys
import torch
import torch.distributed as dist
from typing import Optional, Union


def init_dist(local_rank: int, num_local_ranks: int):
    """Initialize the NCCL process group for this process and select its GPU.

    Rendezvous settings are read from the environment: ``MASTER_ADDR``
    (default ``127.0.0.1``), ``MASTER_PORT`` (default ``8361``), ``WORLD_SIZE``
    (default ``1``) and ``RANK`` (default ``0``).  Note that ``WORLD_SIZE`` and
    ``RANK`` are interpreted here as the number of *nodes* and the *node*
    index; the global world size and rank are derived from them together with
    the two arguments.

    Besides joining the process group, this sets the default dtype to
    ``bfloat16``, the default device to ``'cuda'`` and the current CUDA device
    to ``local_rank``.

    Args:
        local_rank: index of this process within its node; also used as the
            CUDA device index.
        num_local_ranks: number of processes per node.

    Returns:
        A tuple ``(rank, world_size, group)`` where ``group`` is a new process
        group containing every rank.
    """
    # NOTES: you may rewrite this function with your own cluster settings
    ip = os.getenv('MASTER_ADDR', '127.0.0.1')
    port = int(os.getenv('MASTER_PORT', '8361'))
    num_nodes = int(os.getenv('WORLD_SIZE', 1))
    node_rank = int(os.getenv('RANK', 0))

    sig = inspect.signature(dist.init_process_group)
    params = {
        'backend': 'nccl',
        'init_method': f'tcp://{ip}:{port}',
        'world_size': num_nodes * num_local_ranks,
        'rank': node_rank * num_local_ranks + local_rank,
    }
    # Only pass `device_id` when the installed PyTorch accepts that keyword
    if 'device_id' in sig.parameters:
        # noinspection PyTypeChecker
        params['device_id'] = torch.device(f'cuda:{local_rank}')
    dist.init_process_group(**params)
    torch.set_default_dtype(torch.bfloat16)
    torch.set_default_device('cuda')
    torch.cuda.set_device(local_rank)

    return dist.get_rank(), dist.get_world_size(), dist.new_group(list(range(num_local_ranks * num_nodes)))


def calc_diff(x: torch.Tensor, y: torch.Tensor):
    """Return a scalar dissimilarity between two tensors (0.0 when they are equal).

    Both inputs are promoted to float64 and shifted by +1; the result is
    ``1 - 2 * sum(x * y) / sum(x * x + y * y)`` computed on the shifted values,
    returned as a Python float.
    """
    lhs = x.double() + 1
    rhs = y.double() + 1
    cross_term = (lhs * rhs).sum()
    square_term = (lhs * lhs + rhs * rhs).sum()
    similarity = 2 * cross_term / square_term
    return (1 - similarity).item()


def align_up(x, y):
    """Round ``x`` up to the nearest multiple of ``y``."""
    num_chunks = (x + y - 1) // y
    return num_chunks * y


def per_token_cast_to_fp8(x: torch.Tensor):
    """Quantize a 2D tensor to ``float8_e4m3fn`` with one scale per 128-column group.

    Each row is zero-padded to a multiple of 128 columns and split into groups
    of 128.  Every group is rescaled so that its largest magnitude (clamped to
    at least ``1e-4``) maps to 448 before the cast to FP8.

    Args:
        x: a tensor of shape ``(m, n)``.

    Returns:
        A tuple ``(x_fp8, x_scales)``: ``x_fp8`` has shape ``(m, n)`` and dtype
        ``torch.float8_e4m3fn``; ``x_scales`` has shape ``(m, ceil(n / 128))``
        and holds ``amax / 448`` per group, i.e. the factor to multiply by when
        casting back.
    """
    assert x.dim() == 2
    m, n = x.shape
    aligned_n = align_up(n, 128)
    # Use an explicit group count rather than `-1`: `view` cannot infer a
    # dimension for zero-element tensors, so `-1` would fail on empty input
    num_groups = aligned_n // 128
    x_padded = torch.nn.functional.pad(x, (0, aligned_n - n), mode='constant', value=0)
    x_padded_view = x_padded.view(m, num_groups, 128)
    x_amax = x_padded_view.abs().float().amax(dim=2).clamp(1e-4)
    x_fp8 = (x_padded_view * (448.0 / x_amax.unsqueeze(2))).to(torch.float8_e4m3fn)
    # Drop the padding columns again before returning
    return x_fp8.view(m, aligned_n)[:, :n].contiguous(), x_amax / 448.0


def per_token_cast_back(x_fp8: torch.Tensor, x_scales: torch.Tensor):
    """Dequantize a per-128-column scaled FP8 tensor back to ``bfloat16``.

    Args:
        x_fp8: quantized values of shape ``(m, n)``.
        x_scales: one scale per row and per group of 128 columns.  When its
            dtype is ``torch.int`` it is treated as packed bytes: each byte is
            shifted into the float32 exponent field (``<< 23``) and the result
            is reinterpreted as float32.

    Returns:
        A contiguous ``bfloat16`` tensor of shape ``(m, n)``.  Empty input is
        returned converted to ``bfloat16`` without further processing.
    """
    if x_fp8.numel() == 0:
        return x_fp8.to(torch.bfloat16)

    assert x_fp8.dim() == 2
    m, n = x_fp8.shape
    aligned_n = align_up(n, 128)
    x_fp8_padded = torch.nn.functional.pad(x_fp8, (0, aligned_n - n), mode='constant', value=0)
    if x_scales.dtype == torch.int:
        # Unpack byte-encoded scales into float32 values
        x_scales = x_scales.view(dtype=torch.uint8).to(torch.int) << 23
        x_scales = x_scales.view(dtype=torch.float)
    x_fp32_padded = x_fp8_padded.to(torch.float32).view(m, -1, 128)
    # One scale per 128-column group, broadcast across the group
    x_scales = x_scales.view(m, -1, 1)
    return (x_fp32_padded * x_scales).view(x_fp8_padded.shape).to(torch.bfloat16)[:, :n].contiguous()


def inplace_unique(x: torch.Tensor, num_slots: int):
    """Deduplicate each row of ``x`` in place.

    After the call every row holds the distinct non-negative values it
    contained (values are expected to lie in ``[0, num_slots)``), sorted in
    descending order, followed by ``-1`` padding.  Negative entries of the
    input are ignored.  Nothing is returned.
    """
    assert x.dim() == 2
    num_rows, num_cols = x.shape

    # Count occurrences per slot; negatives are routed to an extra bin that is dropped
    slot_ids = x.masked_fill(x < 0, num_slots)
    counts = torch.zeros((num_rows, num_slots + 1), dtype=x.dtype, device=x.device)
    counts.scatter_add_(1, slot_ids, torch.ones_like(slot_ids))
    counts = counts[:, :num_slots]

    # Keep the ids of slots that occurred, mark the rest as -1, then order descending
    ordered_counts, ordered_ids = torch.sort(counts, dim=-1, descending=True)
    ordered_ids.masked_fill_(ordered_counts == 0, -1)
    ordered_ids = torch.sort(ordered_ids, descending=True, dim=-1).values

    # Overwrite the input: -1 everywhere, then the unique ids at the front
    x.fill_(-1)
    num_kept = min(num_slots, num_cols)
    x[:, :num_kept] = ordered_ids[:, :num_kept]


def create_grouped_scores(scores: torch.Tensor, group_idx: torch.Tensor, num_groups: int):
    """Zero out the scores of every expert group not listed in ``group_idx``.

    ``scores`` has shape ``(num_tokens, num_experts)``; its columns are split
    into ``num_groups`` equal consecutive groups.  For each token, only the
    groups whose indices appear in that token's row of ``group_idx`` keep
    their scores.  The result has the same shape as ``scores``.
    """
    num_tokens, num_experts = scores.shape
    grouped = scores.view(num_tokens, num_groups, -1)

    # Boolean (token, group) table of the groups to keep
    selected = torch.zeros((num_tokens, num_groups), dtype=torch.bool, device=grouped.device)
    selected.scatter_(1, group_idx, True)

    keep = selected.unsqueeze(-1).expand_as(grouped)
    return (grouped * keep).view(num_tokens, num_experts)


def bench(fn, num_warmups: int = 50, num_tests: int = 50, post_fn=None):
    """Time ``fn`` on the GPU with CUDA events.

    Args:
        fn: zero-argument callable to benchmark.
        num_warmups: number of untimed calls made before measuring.
        num_tests: number of timed calls.
        post_fn: optional zero-argument callable run after each timed call,
            outside the timed region.

    Returns:
        A tuple ``(average, minimum, maximum)`` of the per-call durations in
        seconds.  The first timed call is discarded whenever more than one
        call was measured.
    """
    # Flush L2 cache with 256 MB data
    torch.cuda.synchronize()
    cache = torch.empty(int(256e6 // 4), dtype=torch.int, device='cuda')

    # Warmup
    for _ in range(num_warmups):
        fn()

    # Flush L2
    cache.zero_()

    # Testing
    start_events = [torch.cuda.Event(enable_timing=True) for _ in range(num_tests)]
    end_events = [torch.cuda.Event(enable_timing=True) for _ in range(num_tests)]
    for start_event, end_event in zip(start_events, end_events):
        # Record
        start_event.record()
        fn()
        end_event.record()
        if post_fn is not None:
            post_fn()
    torch.cuda.synchronize()

    # `elapsed_time` reports milliseconds; convert to seconds
    times = np.array([s.elapsed_time(e) / 1e3 for s, e in zip(start_events, end_events)])
    # Drop the first sample, but keep it when it is the only one so that the
    # statistics below are not taken over an empty array
    if len(times) > 1:
        times = times[1:]
    return np.average(times), np.min(times), np.max(times)


class empty_suppress:
    """Context manager that does nothing; a stand-in where no output suppression is wanted."""

    def __enter__(self):
        return self

    def __exit__(self, *exc_info):
        # Returning None lets any exception raised in the body propagate
        return None


class suppress_stdout_stderr:
    """Context manager that silences stdout and stderr.

    Output is redirected to ``os.devnull`` both at the file-descriptor level
    (via ``os.dup2``) and at the Python level (by replacing ``sys.stdout`` and
    ``sys.stderr``).  Everything is restored on exit.
    """

    def __enter__(self):
        """Redirect both streams to the null device and return ``self``."""
        # Separate write handles on the null device, one per stream
        self.outnull_file = open(os.devnull, 'w')
        self.errnull_file = open(os.devnull, 'w')

        # Descriptor numbers currently behind sys.stdout / sys.stderr; these are the redirect targets
        self.old_stdout_fileno_undup = sys.stdout.fileno()
        self.old_stderr_fileno_undup = sys.stderr.fileno()

        # Duplicates of the original descriptors, kept so they can be restored in `__exit__`
        self.old_stdout_fileno = os.dup(sys.stdout.fileno())
        self.old_stderr_fileno = os.dup(sys.stderr.fileno())

        # Remember the original Python-level stream objects
        self.old_stdout = sys.stdout
        self.old_stderr = sys.stderr

        # Point the original descriptor numbers at the null device
        os.dup2(self.outnull_file.fileno(), self.old_stdout_fileno_undup)
        os.dup2(self.errnull_file.fileno(), self.old_stderr_fileno_undup)

        # Replace the Python-level streams as well
        sys.stdout = self.outnull_file
        sys.stderr = self.errnull_file
        return self

    def __exit__(self, *_):
        """Restore the original streams and descriptors; exceptions are not suppressed."""
        # Put the original Python-level stream objects back
        sys.stdout = self.old_stdout
        sys.stderr = self.old_stderr

        # Point the original descriptor numbers back at their saved duplicates
        os.dup2(self.old_stdout_fileno, self.old_stdout_fileno_undup)
        os.dup2(self.old_stderr_fileno, self.old_stderr_fileno_undup)

        # The saved duplicates are no longer needed
        os.close(self.old_stdout_fileno)
        os.close(self.old_stderr_fileno)

        # Release the null-device handles
        self.outnull_file.close()
        self.errnull_file.close()


def bench_kineto(fn, kernel_names: Union[str, tuple], num_tests: int = 30, suppress_kineto_output: bool = False,
                 trace_path: Optional[str] = None, barrier_comm_profiling: bool = False,
                 num_kernels_per_period: int = 1):
    """Measure GPU kernel durations of ``fn`` with the PyTorch profiler.

    ``fn`` is called ``num_tests`` times in each of two profiler steps; with
    the schedule used here only the second step is recorded.

    Args:
        fn: zero-argument callable that launches the kernels of interest.
        kernel_names: a kernel name, or a tuple of names, to look up in the
            profiler table.  Each name must match exactly one table line.
        num_tests: number of ``fn`` calls per profiler step.
        suppress_kineto_output: silence stdout/stderr while profiling.
        trace_path: if given, a Chrome trace is exported to this path.
        barrier_comm_profiling: run a large matmul and an all-reduce before
            each batch of calls (see the note in the loop below).
        num_kernels_per_period: when greater than 1, the occurrences of each
            kernel are treated as repeating with this period and averaged
            separately per position within the period.

    Returns:
        Average durations in seconds.  A single value (or, when
        ``num_kernels_per_period > 1``, a list with one value per position in
        the period) if ``kernel_names`` is a string; a list of such entries,
        one per name, if it is a tuple.

    Raises:
        ValueError: if a matched table entry uses a time unit other than
            ``ms`` or ``us``.
    """
    # Profile
    suppress = suppress_stdout_stderr if suppress_kineto_output else empty_suppress
    with suppress():
        schedule = torch.profiler.schedule(wait=1, warmup=0, active=1, repeat=1)
        with torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CUDA], schedule=schedule) as prof:
            for _step in range(2):
                # NOTES: use a large kernel and a barrier to eliminate the unbalanced CPU launch overhead
                if barrier_comm_profiling:
                    lhs = torch.randn((8192, 8192), dtype=torch.float, device='cuda')
                    rhs = torch.randn((8192, 8192), dtype=torch.float, device='cuda')
                    lhs @ rhs
                    dist.all_reduce(torch.ones(1, dtype=torch.float, device='cuda'))
                for _ in range(num_tests):
                    fn()
                torch.cuda.synchronize()
                prof.step()

    # Parse the profiling table
    assert isinstance(kernel_names, (str, tuple))
    is_tuple = isinstance(kernel_names, tuple)
    prof_lines = prof.key_averages().table(sort_by='cuda_time_total', max_name_column_width=100).split('\n')
    kernel_names = (kernel_names, ) if isinstance(kernel_names, str) else kernel_names
    assert all([isinstance(name, str) for name in kernel_names])
    for name in kernel_names:
        assert sum([name in line for line in prof_lines]) == 1, f'Errors of the kernel {name} in the profiling table'

    # Save chrome traces
    if trace_path is not None:
        prof.export_chrome_trace(trace_path)

    # Return average kernel durations
    units = {'ms': 1e3, 'us': 1e6}
    kernel_durations = []
    for name in kernel_names:
        for line in prof_lines:
            if name in line:
                # The second-to-last column of the table line holds the time that is parsed
                time_str = line.split()[-2]
                for unit, scale in units.items():
                    if unit in time_str:
                        kernel_durations.append(float(time_str.replace(unit, '')) / scale)
                        break
                else:
                    # Skipping silently would misalign the results with `kernel_names`
                    raise ValueError(f'Unrecognized time unit in {time_str!r} for the kernel {name}')
                break

    # Expand the kernels by periods
    if num_kernels_per_period > 1:
        with tempfile.NamedTemporaryFile(suffix='.json') as tmp:
            prof.export_chrome_trace(tmp.name)
            profile_data = json.loads(Path(tmp.name).read_text())

        for i, kernel_name in enumerate(kernel_names):
            events = [event for event in profile_data['traceEvents'] if f'::{kernel_name}' in event['name']]
            events = sorted(events, key=lambda event: event['ts'])
            durations = [event['dur'] / 1e6 for event in events]
            assert len(durations) % num_kernels_per_period == 0
            num_kernel_patterns = len(durations) // num_kernels_per_period
            # Average the j-th occurrence of every period separately
            kernel_durations[i] = [sum(durations[j::num_kernels_per_period]) / num_kernel_patterns
                                   for j in range(num_kernels_per_period)]

    # Return execution durations
    return kernel_durations if is_tuple else kernel_durations[0]

def hash_tensor(t: torch.Tensor):
    """Return a simple integer checksum of ``t``.

    The tensor's storage is reinterpreted as 32-bit integers (``torch.int``)
    and summed; the sum is returned as a Python ``int``.
    """
    return t.view(torch.int).sum().item()