test_low_latency.py 22.2 KB
Newer Older
lishen's avatar
lishen committed
1
import argparse
Chenggang Zhao's avatar
Chenggang Zhao committed
2
3
4
5
import random
import torch
import torch.distributed as dist
from functools import partial
lishen's avatar
lishen committed
6
from typing import Literal, Set
Chenggang Zhao's avatar
Chenggang Zhao committed
7
8

import deep_ep
lishen's avatar
lishen committed
9
from utils import init_dist, bench, bench_kineto, calc_diff, hash_tensor, per_token_cast_pg_back, per_token_cast_pc_back
Chenggang Zhao's avatar
Chenggang Zhao committed
10
11


lishen's avatar
lishen committed
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
def simulate_failure_and_skip(rank: int, api: Literal["dispatch", "combine", "clean"], expected_masked_ranks: Set[int]):
    """Decide whether ``rank`` should skip ``api`` because of a simulated failure.

    Each known API has one designated rank that "fails" the first time that
    API is called. The designated rank is recorded in ``expected_masked_ranks``
    (mutated in place) on every call that names a known API, regardless of
    which rank is asking.

    Args:
        rank: The rank making the call.
        api: Name of the communication API about to be invoked.
        expected_masked_ranks: Set of ranks considered failed so far; updated
            in place.

    Returns:
        True if ``rank`` has already failed or fails on this call, else False.
    """
    # A rank that failed earlier keeps skipping every later API call.
    if rank in expected_masked_ranks:
        return True

    # API -> rank that fails when it first calls that communication API.
    rank_failing_at = {'dispatch': 1, 'combine': 3, 'clean': 5}
    doomed_rank = rank_failing_at.get(api)
    if doomed_rank is None:
        # Unknown API: nothing fails and the set is left untouched.
        return False

    expected_masked_ranks.add(doomed_rank)
    if doomed_rank != rank:
        return False

    print(f"Rank {rank} failed when first calling {api} communication API, exit...", flush=True)
    return True


def query_mask_buffer_and_check(api: Literal["dispatch", "combine", "clean"], buffer: deep_ep.Buffer, mask_status: torch.Tensor,
                                expected_masked_ranks: Set[int]):
    """Query the buffer's rank mask and assert it matches the expected failed ranks.

    Args:
        api: Name of the API being checked (not used by the check itself).
        buffer: Buffer whose ``low_latency_query_mask_buffer`` is called with
            ``mask_status``.
        mask_status: Tensor handed to the query; presumably filled in place
            with one entry per rank -- TODO confirm against the deep_ep API.
        expected_masked_ranks: Ranks that are expected to be marked.

    Raises:
        AssertionError: If the indices of the non-zero entries of
            ``mask_status`` differ from ``expected_masked_ranks``.
    """
    buffer.low_latency_query_mask_buffer(mask_status)
    # Indices of the non-zero entries are the ranks reported as masked.
    flagged_indices = mask_status.nonzero().squeeze(-1)
    observed_masked_ranks = set(flagged_indices.tolist())
    assert observed_masked_ranks == expected_masked_ranks


37
38
39
40
def ceil_div(a, b):
    """Return ``a`` divided by ``b``, rounded up, using floor division only."""
    # Adding b - 1 before the floor division bumps any partial chunk up by one.
    padded = a + b - 1
    return padded // b


lishen's avatar
lishen committed
41
42
43
44
45
46
47
48
def test_main(num_tokens: int,
              hidden: int,
              num_experts: int,
              num_topk: int,
              rank: int,
              num_ranks: int,
              group: dist.ProcessGroup,
              buffer: deep_ep.Buffer,
              enable_dispatch_ll_layered: bool = False,
              enable_combine_overlap: bool = False,
              use_logfmt: bool = False,
              seed: int = 0):
    """Run the low-latency dispatch/combine correctness checks, then benchmark.

    The correctness phase sweeps input tensors, ``return_recv_hook``,
    quantization type, scale rounding and quantization group size, calling
    ``buffer.low_latency_dispatch`` followed by ``buffer.low_latency_combine``
    and asserting on the results. Hashes of the received and combined tensors
    are XOR-ed into a single value so that repeated runs can be compared.

    When ``enable_dispatch_ll_layered`` or ``enable_combine_overlap`` is set,
    the function returns right after the correctness phase; otherwise it also
    benchmarks dispatch + combine and prints bandwidth figures.

    Args:
        num_tokens: Number of tokens generated on this rank.
        hidden: Hidden size of every token.
        num_experts: Total number of experts; must be divisible by ``num_ranks``.
        num_topk: Number of experts selected per token.
        rank: Rank of the calling process.
        num_ranks: Total number of ranks.
        group: Process group used for the all-gather and barriers.
        buffer: DeepEP buffer under test.
        enable_dispatch_ll_layered: Selects the layered-dispatch handling of
            ``recv_src_info`` and the early return.
        enable_combine_overlap: Passes compute signals to the combine call and
            skips the ``return_recv_hook=False`` configurations.
        use_logfmt: Adds extra random inputs and forwards ``use_logfmt`` to the
            combine call; must not be combined with the two flags above.
        seed: Base seed; ``seed + rank`` seeds both ``torch`` and ``random``.

    Returns:
        The XOR of ``hash_tensor`` over all checked received/combined tensors.

    Raises:
        AssertionError: If any configuration or correctness check fails.
    """
    torch.manual_seed(seed + rank)
    random.seed(seed + rank)
    if rank == 0:
        print(f"enable_dispatch_ll_layered={enable_dispatch_ll_layered}, enable_combine_overlap={enable_combine_overlap}, use_logfmt={use_logfmt}")

    assert not (use_logfmt and (enable_dispatch_ll_layered or enable_combine_overlap)), \
        "use_logfmt=True and enable_dispatch_ll_layered/enable_combine_overlap conflict"
    assert num_experts % num_ranks == 0
    num_local_experts = num_experts // num_ranks

    # NOTES: the integers greater than 256 exceed the BF16 precision limit
    rank_offset = 128
    assert num_ranks - rank_offset < 257, 'Too many ranks (exceeding test precision limit)'

    # Every token carries `rank - rank_offset` everywhere except the last 128
    # channels, which carry the token index; this lets the receiver identify
    # both the source rank and the source token.
    x = torch.ones((num_tokens, hidden), dtype=torch.bfloat16, device='cuda') * (rank - rank_offset)
    x[:, -128:] = torch.arange(num_tokens, device='cuda').to(torch.bfloat16).view(-1, 1)
    x_list = [x]
    for _ in range(4 if use_logfmt else 0):
        # NOTES: make more LogFMT casts and also with some BF16
        x_list.append(torch.randn((num_tokens, hidden), dtype=torch.bfloat16, device='cuda') * 0.5 * random.random())
    # NOTES: the last one is for performance testing
    # Most of the values in the perf case is lower than the threshold, casting most channels
    x_list.append(torch.randn((num_tokens, hidden), dtype=torch.bfloat16, device='cuda') * 0.1)

    scores = torch.randn((num_tokens, num_experts), dtype=torch.float32, device='cuda').abs() + 1
    topk_idx = torch.topk(scores, num_topk, dim=-1, largest=True, sorted=True)[1]
    topk_weights = torch.randn((num_tokens, num_topk), dtype=torch.float32, device='cuda').abs()

    # Randomly mask some positions
    for _ in range(10):
        topk_idx[random.randint(0, num_tokens - 1), random.randint(0, num_topk - 1)] = -1

    all_topk_idx = torch.empty((num_ranks, num_tokens, num_topk), dtype=topk_idx.dtype, device='cuda')
    dist.all_gather_into_tensor(all_topk_idx, topk_idx, group=group)

    # For failure simulation and shrink testing
    mask_status = torch.zeros((num_ranks,), dtype=torch.int, device='cuda')
    expected_masked_ranks = set()

    # Low 32 bits of a packed 64-bit value (count in `recv_layout_range`)
    int_mask = (2 ** 32) - 1

    # Check dispatch correctness
    do_check = True
    hash_value, num_times = 0, 0
    for x_i, current_x in enumerate(x_list):
        for return_recv_hook in (False, True):
            # Overlap cannot be enabled when `return_recv_hook` is False
            if enable_combine_overlap and (not return_recv_hook):
                continue

            # 0: no quantization, 1: int8, 2: FP8_E4M3,
            # 3: FP8_UE8M0 (only supports round_scale=True), 4: FP8_E5M2 (not swept here)
            for quant_type in (0, 1, 2, 3,):
                dispatch_use_quant = quant_type > 0
                for fp8_round_scale in (False, True) if quant_type != 3 else (True,):
                    for quant_group_size in (0, 128,) if quant_type >= 2 else (0,):
                        if quant_type == 3 and (not fp8_round_scale or quant_group_size == 0):
                            continue

                        num_times += 1
                        # Alternate between one and two back-to-back dispatches
                        for _ in range((num_times % 2) + 1):
                            packed_recv_x, packed_recv_count, handle, event, hook = \
                                buffer.low_latency_dispatch(current_x, topk_idx, num_tokens, num_experts,
                                                            quant_type=quant_type, fp8_round_scale=fp8_round_scale, quant_group_size=quant_group_size,
                                                            async_finish=not return_recv_hook, return_recv_hook=return_recv_hook)
                            hook() if return_recv_hook else event.current_stream_wait()
                        packed_recv_x = (packed_recv_x[0], packed_recv_x[1].contiguous()) if dispatch_use_quant else packed_recv_x
                        # Dequantize the received data to stand in for the expert GEMM output
                        if not dispatch_use_quant:
                            simulated_gemm_x = packed_recv_x.clone()
                        elif quant_group_size == 0:
                            simulated_gemm_x = per_token_cast_pc_back(packed_recv_x[0].view(-1, hidden), packed_recv_x[1].reshape(-1)).view(packed_recv_x[0].shape)
                        elif quant_group_size == 128:
                            simulated_gemm_x = per_token_cast_pg_back(packed_recv_x[0].view(-1, hidden), packed_recv_x[1].view(-1, hidden // 128)).view(packed_recv_x[0].shape)
                        for i in range(num_local_experts if do_check else 0):
                            expert_id = rank * num_local_experts + i
                            if not dispatch_use_quant:
                                recv_x = packed_recv_x[i]
                            elif quant_group_size == 0:
                                recv_x = per_token_cast_pc_back(packed_recv_x[0][i], packed_recv_x[1][i])
                            elif quant_group_size == 128:
                                recv_x = per_token_cast_pg_back(packed_recv_x[0][i], packed_recv_x[1][i])
                            recv_count, recv_src_info, recv_layout_range = packed_recv_count[i], handle[0][i], handle[1][i]

                            # Check expert indices
                            num_valid_tokens = recv_count.item()
                            layout_token_count = (recv_layout_range & int_mask).sum().item()
                            assert num_valid_tokens == layout_token_count, f'{num_valid_tokens} != {layout_token_count}'
                            expected_token_count = (all_topk_idx == expert_id).sum(dim=[1, 2])[mask_status == 0].sum().item()
                            assert num_valid_tokens == expected_token_count, f'{num_valid_tokens} != {expected_token_count}'

                            if num_valid_tokens == 0:
                                continue
                            # Check received data
                            if current_x is x:
                                recv_x = recv_x[:num_valid_tokens]
                                recv_x_amin = recv_x[:, :-128].amin(dim=-1)
                                recv_x_amax = recv_x[:, :-128].amax(dim=-1)

                                if enable_dispatch_ll_layered or enable_combine_overlap:
                                    # Mask off the extra information, keeping the low 32 bits
                                    recv_src_info = recv_src_info[:num_valid_tokens] & int_mask
                                else:
                                    recv_src_info = recv_src_info[:num_valid_tokens]

                                assert torch.equal(recv_x_amin, recv_x_amax)

                                if dispatch_use_quant:
                                    assert calc_diff(recv_x[:, -1], recv_src_info.view(-1)) < 0.007

                                assert torch.equal(recv_x_amin, recv_x[:, :-128].amax(dim=-1))
                                if quant_group_size != 0:
                                    if fp8_round_scale:
                                        assert calc_diff(recv_x[:, -1], recv_src_info.view(-1)) < 0.007
                                    else:
                                        assert (recv_x[:, -128:] - recv_src_info.view(-1, 1) % num_tokens).sum().item() == 0
                                    for j in range(num_ranks):
                                        # High 32 bits: start offset, low 32 bits: token count from rank `j`
                                        begin_idx, count = (recv_layout_range[j] >> 32).item(), (recv_layout_range[j] & int_mask).item()
                                        if not fp8_round_scale:
                                            assert (recv_x_amin == j - rank_offset).sum().item() == (all_topk_idx[j] == expert_id).sum().item()
                                            assert (recv_x[begin_idx:begin_idx + count, :-128] - j + rank_offset).sum().item() == 0

                            if dispatch_use_quant:
                                hash_value ^= hash_tensor(packed_recv_x[0][i, :num_valid_tokens])
                                hash_value ^= hash_tensor(packed_recv_x[1][i, :num_valid_tokens])
                            else:
                                hash_value ^= hash_tensor(packed_recv_x[i, :num_valid_tokens])

                        # Check combine correctness
                        for zero_copy in (False,) if use_logfmt else (False, True,):
                            if zero_copy:
                                buffer.get_next_low_latency_combine_buffer(handle)[:, :, :] = simulated_gemm_x
                            out = torch.empty((num_tokens, hidden), dtype=torch.bfloat16, device='cuda')
                            if enable_combine_overlap:
                                block_m, threshold, num_sms = 64, 10, 3
                                # Total number of signals per local expert
                                total_num_per_expert = ceil_div(num_tokens * num_ranks, block_m)
                                comp_signal = torch.zeros(num_local_experts * total_num_per_expert, dtype=torch.int32, device='cuda')

                                # Mark every block that holds received tokens as already computed
                                for i in range(num_local_experts):
                                    valid_num = ceil_div(packed_recv_count[i], block_m)
                                    comp_signal[i * total_num_per_expert:i * total_num_per_expert + valid_num] = threshold
                                combined_x, event, hook = buffer.low_latency_combine(simulated_gemm_x,
                                                                                     topk_idx,
                                                                                     topk_weights,
                                                                                     handle,
                                                                                     packed_recv_count=packed_recv_count,
                                                                                     comp_signal=comp_signal,
                                                                                     block_m=block_m,
                                                                                     threshold=threshold,
                                                                                     num_sms=num_sms,
                                                                                     async_finish=not return_recv_hook,
                                                                                     zero_copy=zero_copy,
                                                                                     return_recv_hook=return_recv_hook,
                                                                                     out=out)
                            else:
                                combined_x, event, hook = buffer.low_latency_combine(simulated_gemm_x,
                                                                                     topk_idx,
                                                                                     topk_weights,
                                                                                     handle,
                                                                                     use_logfmt=use_logfmt,
                                                                                     async_finish=not return_recv_hook,
                                                                                     zero_copy=zero_copy,
                                                                                     return_recv_hook=return_recv_hook,
                                                                                     out=out)

                            hook() if return_recv_hook else event.current_stream_wait()
                            if do_check:
                                diff = calc_diff(current_x * topk_weights.masked_fill(topk_idx == -1, 0).sum(dim=1).view(-1, 1), combined_x)
                                assert torch.isnan(combined_x).sum().item() == 0
                                assert diff < (9e-4 if dispatch_use_quant else 1e-5), f'Error: diff={diff}, dispatch_use_quant={dispatch_use_quant}, zero_copy={zero_copy}'
                                hash_value ^= hash_tensor(combined_x)

                        if rank == 0:
                            print(f"data:{x_i}, return_recv_hook:{return_recv_hook}, quant_type:{quant_type}, ",
                                  f"fp8_round_scale:{fp8_round_scale}, quant_group_size:{quant_group_size} pass")

    print("deep_ep 全部正确性测试完成")
    # The layered / overlap modes only run the correctness phase
    if enable_dispatch_ll_layered or enable_combine_overlap:
        return hash_value

    # noinspection PyShadowingNames
    def large_gemm_with_hook(hook):
        # NOTE(review): no `device` is given, so these matrices live on the
        # default device -- confirm this is the intended stand-in workload.
        mat_0 = torch.randn((8192, 8192), dtype=torch.float)
        mat_1 = torch.randn((8192, 8192), dtype=torch.float)
        mat_0 @ mat_1
        hook()

    # noinspection PyShadowingNames
    def test_func(return_recv_hook: bool):
        # `current_x` and `simulated_gemm_x` are read from the enclosing scope
        # and hold the values left by the last correctness iteration.
        recv_x, recv_count, handle, event, hook = \
            buffer.low_latency_dispatch(current_x, topk_idx, num_tokens, num_experts,
                                        quant_type=2, quant_group_size=0,
                                        async_finish=False, return_recv_hook=return_recv_hook)
        large_gemm_with_hook(hook) if return_recv_hook else None
        combined_x, event, hook = buffer.low_latency_combine(simulated_gemm_x,
                                                             topk_idx,
                                                             topk_weights,
                                                             handle,
                                                             use_logfmt=use_logfmt,
                                                             return_recv_hook=return_recv_hook)
        large_gemm_with_hook(hook) if return_recv_hook else None

    # Calculate bandwidth
    num_fp8_bytes, num_bf16_bytes = (hidden + hidden / 128 * 4 + 16), hidden * 2
    num_logfmt10_bytes = hidden * 10 / 8 + hidden / 128 * 4
    # Combine traffic is LogFMT-10 sized when `use_logfmt` is on, BF16 otherwise
    num_combine_bytes_per_selection = num_logfmt10_bytes if use_logfmt else num_bf16_bytes
    num_dispatch_comm_bytes, num_combine_comm_bytes = 0, 0
    for i in range(num_tokens):
        num_selections = (topk_idx[i] != -1).sum().item()
        num_dispatch_comm_bytes += num_fp8_bytes * num_selections
        num_combine_comm_bytes += num_combine_bytes_per_selection * num_selections

    # Dispatch + combine testing
    avg_t, min_t, max_t = bench(partial(test_func, return_recv_hook=False))
    print(f'[rank {rank}] Dispatch + combine bandwidth: {(num_dispatch_comm_bytes + num_combine_comm_bytes) / 1e9 / avg_t:.2f} GB/s, '
          f'avg_t={avg_t * 1e6:.2f} us, min_t={min_t * 1e6:.2f} us, max_t={max_t * 1e6:.2f} us',
          flush=True)

    # Separate profiling
    for return_recv_hook in (False, True):
        group.barrier()
        dispatch_t, combine_t = bench_kineto(partial(test_func, return_recv_hook=return_recv_hook),
                                             kernel_names=('dispatch', 'combine'),
                                             barrier_comm_profiling=True,
                                             suppress_kineto_output=True,
                                             num_kernels_per_period=2 if return_recv_hook else 1)
        if not return_recv_hook:
            print(f'[rank {rank}] Dispatch bandwidth: {num_dispatch_comm_bytes / 1e9 / dispatch_t:.2f} GB/s, avg_t={dispatch_t * 1e6:.2f} us | '
                  f'Combine bandwidth: {num_combine_comm_bytes / 1e9 / combine_t:.2f} GB/s, avg_t={combine_t * 1e6:.2f} us',
                  flush=True)
        else:
            print(f'[rank {rank}] Dispatch send/recv time: {dispatch_t[0] * 1e6:.2f} + {dispatch_t[1] * 1e6:.2f} us | '
                  f'Combine send/recv time: {combine_t[0] * 1e6:.2f} + {combine_t[1] * 1e6:.2f} us',
                  flush=True)
    return hash_value


lishen's avatar
lishen committed
285
286
# noinspection PyUnboundLocalVariable,PyShadowingNames
def test_loop(local_rank: int, num_local_ranks: int, args: argparse.Namespace):
    """Per-process entry point: set up DeepEP, run the tests, then tear down.

    Initializes the distributed group, allocates a low-latency
    ``deep_ep.Buffer`` sized by ``get_low_latency_rdma_size_hint``, runs
    ``test_main`` once with ``seed=1`` and, when ``args.pressure_test`` is set,
    repeatedly re-runs it per seed asserting that the returned hash is stable.

    Args:
        local_rank: Index of this process among the spawned local processes.
        num_local_ranks: Number of local processes that were spawned.
        args: Parsed command-line options (see the ``__main__`` block).

    Raises:
        AssertionError: If a pressure-test run yields a hash different from the
            reference run of the same seed.
    """
    rank, num_ranks, group = init_dist(local_rank, num_local_ranks)
    num_tokens, hidden = args.num_tokens, args.hidden
    num_topk, num_experts = args.num_topk, args.num_experts

    enable_dispatch_ll_layered = args.enable_dispatch_ll_layered
    enable_combine_overlap = args.enable_combine_overlap
    # Layered dispatch always runs with combine overlap switched on
    if enable_dispatch_ll_layered:
        enable_combine_overlap = True

    num_rdma_bytes = deep_ep.Buffer.get_low_latency_rdma_size_hint(num_tokens, hidden, num_ranks, num_experts,
                                                                   enable_dispatch_ll_layered=enable_dispatch_ll_layered)
    if local_rank == 0:
        print(f'Allocating buffer size: {num_rdma_bytes / 1e6} MB ...', flush=True)
    buffer = deep_ep.Buffer(group,
                            num_rdma_bytes=num_rdma_bytes,
                            low_latency_mode=True,
                            num_qps_per_rank=num_experts // num_ranks,
                            allow_nvlink_for_low_latency_mode=not args.disable_nvlink,
                            explicitly_destroy=True,
                            allow_mnnvl=args.allow_mnnvl,
                            enable_dispatch_ll_layered=enable_dispatch_ll_layered,
                            enable_combine_overlap=enable_combine_overlap
                            )
    print("deep_ep 初始化完成")

    # Every invocation below differs only in the seed, so bind the rest once
    run_test = partial(test_main,
                       num_tokens,
                       hidden,
                       num_experts,
                       num_topk,
                       rank,
                       num_ranks,
                       group,
                       buffer,
                       use_logfmt=args.use_logfmt,
                       enable_dispatch_ll_layered=enable_dispatch_ll_layered,
                       enable_combine_overlap=enable_combine_overlap)
    run_test(seed=1)

    do_pressure_test = args.pressure_test
    for seed in range(int(1e9) if do_pressure_test else 0):
        if local_rank == 0:
            print(f'Testing with seed {seed} ...', flush=True)
        # The first run of a seed is the reference; later runs must reproduce its hash
        ref_hash = run_test(seed=seed)
        for _ in range(20):
            assert run_test(seed=seed) == ref_hash, f'Error: seed={seed}'

    # Destroy the buffer runtime and communication group
    buffer.destroy()
    dist.barrier()
    dist.destroy_process_group()

lishen's avatar
lishen committed
359

Chenggang Zhao's avatar
Chenggang Zhao committed
360
361
if __name__ == '__main__':
    # Parse the command line and spawn one `test_loop` process per requested rank.
    # TODO: you may modify NUMA binding for less CPU overhead
    # TODO: buggy with `num_tokens=512`
    parser = argparse.ArgumentParser(description='Test low-latency EP kernels')
    parser.add_argument('--num-processes', type=int, default=8, help='Number of processes to spawn (default: 8)')
    parser.add_argument('--num-tokens', type=int, default=128, help='Number of tokens (default: 128)')
    parser.add_argument('--hidden', type=int, default=7168, help='Hidden dimension size (default: 7168)')
    parser.add_argument('--num-topk', type=int, default=8, help='Number of top-k experts (default: 8)')
    parser.add_argument('--num-experts', type=int, default=288, help='Number of experts (default: 288)')
    parser.add_argument('--allow-mnnvl', action="store_true", help='Allow MNNVL for communication')
    parser.add_argument('--disable-nvlink', action='store_true', help='Whether to disable NVLink for testing')
    parser.add_argument("--pressure-test", action='store_true', help='Whether to do pressure test')
    # NOTE(review): `--shrink-test` is parsed but not read anywhere in the visible code -- confirm
    parser.add_argument("--shrink-test", action='store_true', help='Whether to simulate failure and test shrink mode')
    parser.add_argument('--use-logfmt', action='store_true', help='Whether to test LogFMT combine')
    # Options required by the new SBO version
    parser.add_argument('--enable-dispatch-ll-layered', action='store_true', help='Enable low-latency layered dispatch optimization')
    parser.add_argument("--enable-combine-overlap", action='store_true', help='Enable GEMM-compute/communication overlap in the combine phase')

    args = parser.parse_args()

    num_processes = args.num_processes
    # `spawn` passes the process index as the first argument of `test_loop`
    torch.multiprocessing.spawn(test_loop, args=(num_processes, args), nprocs=num_processes)