import inspect
import json
import tempfile
from pathlib import Path

import numpy as np
import os
import sys
import torch
import torch.distributed as dist
from typing import Optional, Union


def init_dist(local_rank: int, num_local_ranks: int):
    """Initialize the NCCL process group for this local rank.

    Cluster settings are read from the environment:
    ``MASTER_ADDR`` (default ``127.0.0.1``), ``MASTER_PORT`` (default ``8361``),
    ``WORLD_SIZE`` (used here as the number of nodes, default 1) and
    ``RANK`` (used here as the node rank, default 0).

    Besides creating the process group, this also sets the default dtype to
    ``torch.bfloat16``, the default device to ``'cuda'`` and the current CUDA
    device to ``local_rank``.

    Args:
        local_rank: index of this process within its node.
        num_local_ranks: number of processes per node.

    Returns:
        A tuple ``(rank, world_size, group)`` where ``group`` is a new process
        group spanning every rank.
    """
    # NOTES: you may rewrite this function with your own cluster settings
    ip = os.getenv('MASTER_ADDR', '127.0.0.1')
    port = int(os.getenv('MASTER_PORT', '8361'))
    num_nodes = int(os.getenv('WORLD_SIZE', 1))
    node_rank = int(os.getenv('RANK', 0))
    world_size = num_nodes * num_local_ranks

    # `device_id` is only passed when the installed `init_process_group` accepts it
    sig = inspect.signature(dist.init_process_group)
    params = {
        'backend': 'nccl',
        'init_method': f'tcp://{ip}:{port}',
        'world_size': world_size,
        'rank': node_rank * num_local_ranks + local_rank,
    }
    if 'device_id' in sig.parameters:
        # noinspection PyTypeChecker
        params['device_id'] = torch.device(f'cuda:{local_rank}')
    dist.init_process_group(**params)
    torch.set_default_dtype(torch.bfloat16)
    torch.set_default_device('cuda')
    torch.cuda.set_device(local_rank)

    return dist.get_rank(), dist.get_world_size(), dist.new_group(list(range(world_size)))


def calc_diff(x: torch.Tensor, y: torch.Tensor):
    """Return a scalar dissimilarity between two tensors.

    Both inputs are promoted to float64 and shifted by +1, then the result is
    ``1 - 2 * sum(x * y) / sum(x * x + y * y)``; identical inputs give ``0.0``.

    Returns:
        The difference as a Python float.
    """
    lhs = x.double() + 1
    rhs = y.double() + 1
    denominator = (lhs * lhs + rhs * rhs).sum()
    numerator = 2 * (lhs * rhs).sum()
    similarity = numerator / denominator
    return (1 - similarity).item()


def per_token_cast_to_fp8(x: torch.Tensor):
    assert x.dim() == 2 and x.size(1) % 128 == 0
    m, n = x.shape
    x_view = x.view(m, -1, 128)
    x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4)
    return (x_view * (448.0 / x_amax.unsqueeze(2))).to(torch.float8_e4m3fn).view(m, n), (x_amax / 448.0).view(m, -1)


def per_token_cast_back(x_fp8: torch.Tensor, x_scales: torch.Tensor):
55
56
    if x_fp8.numel() == 0:
        return x_fp8.to(torch.bfloat16)
Shifang Xu's avatar
Shifang Xu committed
57
    if x_scales.dtype == torch.int:
58
        x_scales = x_scales.view(dtype=torch.uint8).to(torch.int) << 23
Shifang Xu's avatar
Shifang Xu committed
59
        x_scales = x_scales.view(dtype=torch.float)
Chenggang Zhao's avatar
Chenggang Zhao committed
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
    x_fp32 = x_fp8.to(torch.float32).view(x_fp8.size(0), -1, 128)
    x_scales = x_scales.view(x_fp8.size(0), -1, 1)
    return (x_fp32 * x_scales).view(x_fp8.shape).to(torch.bfloat16)


def inplace_unique(x: torch.Tensor, num_slots: int):
    """Overwrite each row of ``x`` with its distinct non-negative values.

    Negative entries are treated as padding. After the call, every row holds
    the distinct values it contained (each expected in ``[0, num_slots)``) in
    descending order, followed by ``-1`` padding. At most
    ``min(num_slots, x.size(1))`` values are kept per row.

    Args:
        x: 2D integer tensor, modified in place.
        num_slots: number of distinct slot ids that may appear in ``x``.
    """
    assert x.dim() == 2

    # Route padding entries to an extra bin so they can be dropped after counting
    padded = x.masked_fill(x < 0, num_slots)
    counts = torch.zeros((x.size(0), num_slots + 1), dtype=x.dtype, device=x.device)
    counts.scatter_add_(1, padded, torch.ones_like(padded))
    counts = counts[:, :num_slots]

    # Slots that never occur become -1, then a descending sort pushes them to the end
    sorted_counts, slot_ids = torch.sort(counts, dim=-1, descending=True)
    slot_ids.masked_fill_(sorted_counts == 0, -1)
    slot_ids = torch.sort(slot_ids, descending=True, dim=-1).values

    keep = min(num_slots, x.size(1))
    x.fill_(-1)
    x[:, :keep] = slot_ids[:, :keep]


def create_grouped_scores(scores: torch.Tensor, group_idx: torch.Tensor, num_groups: int):
    """Zero out the scores of every expert group that is not selected.

    Args:
        scores: ``(num_tokens, num_experts)`` tensor; the expert dimension is
            split evenly into ``num_groups`` contiguous groups.
        group_idx: ``(num_tokens, k)`` indices of the groups to keep per token.
        num_groups: number of expert groups.

    Returns:
        A tensor shaped like ``scores`` where entries of unselected groups are 0.
    """
    num_tokens, num_experts = scores.shape
    grouped = scores.view(num_tokens, num_groups, -1)

    # Boolean selection per (token, group), broadcast over the experts of each group
    selected = torch.zeros((num_tokens, num_groups), dtype=torch.bool, device=grouped.device)
    selected.scatter_(1, group_idx, True)
    selected = selected.unsqueeze(-1).expand_as(grouped)

    return (grouped * selected).view(num_tokens, num_experts)


def bench(fn, num_warmups: int = 50, num_tests: int = 50, post_fn=None):
    """Time ``fn`` on the current CUDA device using CUDA events.

    Args:
        fn: zero-argument callable to benchmark.
        num_warmups: number of untimed calls issued before measuring.
        num_tests: number of timed calls. The first sample is discarded, so
            this should be at least 2.
        post_fn: optional zero-argument callable run after every timed call,
            outside the timed region.

    Returns:
        ``(average, minimum, maximum)`` of the measured durations in seconds.
    """
    # Flush L2 cache with 256 MB data
    torch.cuda.synchronize()
    cache = torch.empty(int(256e6 // 4), dtype=torch.int, device='cuda')

    # Warmup
    for _ in range(num_warmups):
        fn()

    # Flush L2
    cache.zero_()

    # Testing
    start_events = [torch.cuda.Event(enable_timing=True) for _ in range(num_tests)]
    end_events = [torch.cuda.Event(enable_timing=True) for _ in range(num_tests)]
    for i in range(num_tests):
        # Record
        start_events[i].record()
        fn()
        end_events[i].record()
        if post_fn is not None:
            post_fn()
    torch.cuda.synchronize()

    # `elapsed_time` reports milliseconds; convert to seconds and drop the first sample
    times = np.array([s.elapsed_time(e) / 1e3 for s, e in zip(start_events, end_events)])[1:]
    return np.average(times), np.min(times), np.max(times)


class empty_suppress:
    """No-op context manager, used where output suppression is not wanted."""

    def __enter__(self):
        return self

    def __exit__(self, *_):
        # Returning None lets any exception raised in the `with` body propagate
        return None


class suppress_stdout_stderr:
    """Context manager that silences stdout and stderr while active.

    Both the Python-level ``sys.stdout``/``sys.stderr`` objects and the
    underlying OS file descriptors are redirected to ``os.devnull``
    (presumably so that output written directly to the descriptors, e.g. by
    native code, is silenced as well). Everything is restored on exit.
    """

    def __enter__(self):
        # Sinks that will receive everything written while suppressed
        self.outnull_file = open(os.devnull, 'w')
        self.errnull_file = open(os.devnull, 'w')

        # Descriptor numbers currently backing stdout/stderr (these get overwritten below)
        self.old_stdout_fileno_undup = sys.stdout.fileno()
        self.old_stderr_fileno_undup = sys.stderr.fileno()

        # Duplicates of the original descriptors, kept so they can be restored in `__exit__`
        self.old_stdout_fileno = os.dup(sys.stdout.fileno())
        self.old_stderr_fileno = os.dup(sys.stderr.fileno())

        # Original Python stream objects, restored in `__exit__`
        self.old_stdout = sys.stdout
        self.old_stderr = sys.stderr

        # Point the stdout/stderr descriptors at the null device
        os.dup2(self.outnull_file.fileno(), self.old_stdout_fileno_undup)
        os.dup2(self.errnull_file.fileno(), self.old_stderr_fileno_undup)

        # Redirect the Python-level streams too
        sys.stdout = self.outnull_file
        sys.stderr = self.errnull_file
        return self

    def __exit__(self, *_):
        # Restore the Python-level streams first
        sys.stdout = self.old_stdout
        sys.stderr = self.old_stderr

        # Put the saved descriptors back onto the original descriptor numbers
        os.dup2(self.old_stdout_fileno, self.old_stdout_fileno_undup)
        os.dup2(self.old_stderr_fileno, self.old_stderr_fileno_undup)

        # The saved duplicates are no longer needed
        os.close(self.old_stdout_fileno)
        os.close(self.old_stderr_fileno)

        self.outnull_file.close()
        self.errnull_file.close()


def bench_kineto(fn, kernel_names: Union[str, tuple], num_tests: int = 30, suppress_kineto_output: bool = False,
                 trace_path: Optional[str] = None, barrier_comm_profiling: bool = False,
                 num_kernels_per_period: int = 1):
    """Measure CUDA kernel durations of ``fn`` with the PyTorch profiler.

    ``fn`` is run ``num_tests`` times in each of two profiler steps (one warmup
    step, one active step). The per-kernel time is then read from the
    profiler's summary table.

    Args:
        fn: zero-argument callable launching the kernels of interest.
        kernel_names: a kernel name, or a tuple of names, matched as substrings
            of the profiling table rows. Each name must match exactly one row.
        num_tests: number of ``fn`` calls per profiler step.
        suppress_kineto_output: silence stdout/stderr while profiling.
        trace_path: if given, a Chrome trace is exported to this path.
        barrier_comm_profiling: run a large matmul and an all-reduce before the
            timed calls to even out CPU launch overhead across ranks.
        num_kernels_per_period: when greater than 1, each kernel's launches are
            read from the exported trace and averaged separately for every
            position within a period of this many launches.

    Returns:
        Durations in seconds (trace ``dur`` values are presumably microseconds).
        A single entry if ``kernel_names`` is a string, otherwise a list with
        one entry per name. Each entry is a float, or a list of
        ``num_kernels_per_period`` floats when that argument is greater than 1.
    """
    # Profile
    suppress = suppress_stdout_stderr if suppress_kineto_output else empty_suppress
    with suppress():
        schedule = torch.profiler.schedule(wait=0, warmup=1, active=1, repeat=1)
        with torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CUDA], schedule=schedule) as prof:
            for i in range(2):
                # NOTES: use a large kernel and a barrier to eliminate the unbalanced CPU launch overhead
                if barrier_comm_profiling:
                    lhs = torch.randn((8192, 8192), dtype=torch.float, device='cuda')
                    rhs = torch.randn((8192, 8192), dtype=torch.float, device='cuda')
                    lhs @ rhs
                    dist.all_reduce(torch.ones(1, dtype=torch.float, device='cuda'))
                for _ in range(num_tests):
                    fn()
                torch.cuda.synchronize()
                prof.step()

    # Parse the profiling table
    assert isinstance(kernel_names, (str, tuple))
    is_tuple = isinstance(kernel_names, tuple)
    prof_lines = prof.key_averages().table(sort_by='cuda_time_total', max_name_column_width=100).split('\n')
    kernel_names = (kernel_names, ) if isinstance(kernel_names, str) else kernel_names
    assert all(isinstance(name, str) for name in kernel_names)
    for name in kernel_names:
        assert sum(name in line for line in prof_lines) == 1, f'Errors of the kernel {name} in the profiling table'

    # Save chrome traces
    if trace_path is not None:
        prof.export_chrome_trace(trace_path)

    # Return average kernel durations
    units = {'ms': 1e3, 'us': 1e6}
    kernel_durations = []
    for name in kernel_names:
        for line in prof_lines:
            if name in line:
                # Second-to-last column of the row (presumably the average CUDA time)
                time_str = line.split()[-2]
                for unit, scale in units.items():
                    if unit in time_str:
                        kernel_durations.append(float(time_str.replace(unit, '')) / scale)
                        break
                break

    # Expand the kernels by periods
    if num_kernels_per_period > 1:
        with tempfile.NamedTemporaryFile(suffix='.json') as tmp:
            prof.export_chrome_trace(tmp.name)
            profile_data = json.loads(Path(tmp.name).read_text())

        for i, kernel_name in enumerate(kernel_names):
            events = [event for event in profile_data['traceEvents'] if f'::{kernel_name}' in event['name']]
            events = sorted(events, key=lambda event: event['ts'])
            durations = [event['dur'] / 1e6 for event in events]
            assert len(durations) % num_kernels_per_period == 0
            num_kernel_patterns = len(durations) // num_kernels_per_period
            # Average the j-th launch of every period separately
            kernel_durations[i] = [sum(durations[j::num_kernels_per_period]) / num_kernel_patterns
                                   for j in range(num_kernels_per_period)]

    # Return execution durations
    return kernel_durations if is_tuple else kernel_durations[0]


def hash_tensor(t: torch.Tensor):
    """Return a simple checksum of a tensor's raw contents.

    The tensor's storage is reinterpreted as 32-bit integers and summed, so
    the element size of ``t`` must be compatible with a ``torch.int`` view.

    Returns:
        The sum as a Python int.
    """
    return t.view(torch.int).sum().item()