"vscode:/vscode.git/clone" did not exist on "2241085d4de7574b8a63327e75b8842997e0b147"
utils.py 5.82 KB
Newer Older
1
2
3
4
5
6
# SPDX-License-Identifier: Apache-2.0
"""
DeepEP test utilities
"""
import dataclasses
import importlib
7
import os
8
9
10
11
12
13
14
15
16
import traceback
from typing import Callable, Optional

import torch
from torch.distributed import ProcessGroup
from torch.multiprocessing import (
    spawn)  # pyright: ignore[reportPrivateImportUsage]
from typing_extensions import Concatenate, ParamSpec

17
18
from vllm.model_executor.layers.fused_moe.utils import find_free_port

19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
has_deep_ep = importlib.util.find_spec("deep_ep") is not None
if has_deep_ep:
    from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import (  # noqa: E501
        DeepEPHTPrepareAndFinalize)
    from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import (  # noqa: E501
        DeepEPLLPrepareAndFinalize)

## Parallel Processes Utils

P = ParamSpec("P")


@dataclasses.dataclass
class ProcessGroupInfo:
    world_size: int
    world_local_size: int
    rank: int
    node_rank: int
    local_rank: int
    device: torch.device


def _worker_parallel_launch(
    local_rank: int,
    world_size: int,
    world_local_size: int,
    node_rank: int,
    init_method: str,
    worker: Callable[Concatenate[ProcessGroupInfo, P], None],
    *args: P.args,
    **kwargs: P.kwargs,
) -> None:
    rank = node_rank * world_local_size + local_rank
    torch.cuda.set_device(local_rank)
    device = torch.device("cuda", local_rank)
    torch.distributed.init_process_group(
        backend="cpu:gloo,cuda:nccl",
        init_method=init_method,
        rank=rank,
        world_size=world_size,
        device_id=device,
    )
    barrier = torch.tensor([rank], device=device)
    torch.distributed.all_reduce(barrier)

    try:
        worker(
            ProcessGroupInfo(
                world_size=world_size,
                world_local_size=world_local_size,
                rank=rank,
                node_rank=node_rank,
                local_rank=local_rank,
                device=device,
            ),
            *args,
            **kwargs,
        )
    except Exception as ex:
        print(ex)
        traceback.print_exc()
        raise
    finally:
        torch.distributed.destroy_process_group()


def parallel_launch(
    world_size: int,
    worker: Callable[Concatenate[ProcessGroupInfo, P], None],
    *args: P.args,
    **kwargs: P.kwargs,
) -> None:
    assert not kwargs
    spawn(
        _worker_parallel_launch,
        args=(
            world_size,
            world_size,
            0,
98
            f"tcp://{os.getenv('LOCALHOST', 'localhost')}:{find_free_port()}",
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
            worker,
        ) + args,
        nprocs=world_size,
        join=True,
    )


## DeepEP specific utils


@dataclasses.dataclass
class DeepEPHTArgs:
    num_local_experts: int


@dataclasses.dataclass
class DeepEPLLArgs:
    max_tokens_per_rank: int
    hidden_size: int
    num_experts: int
    use_fp8_dispatch: bool


def make_deepep_ht_a2a(pg: ProcessGroup,
                       pgi: ProcessGroupInfo,
                       dp_size: int,
                       ht_args: DeepEPHTArgs,
                       q_dtype: Optional[torch.dtype] = None,
                       block_shape: Optional[list[int]] = None):

    import deep_ep

    # high throughput a2a
    num_nvl_bytes = 1024 * 1024 * 1024  # 1GB
    num_rdma_bytes, low_latency_mode, num_qps_per_rank = 0, False, 1
    buffer = deep_ep.Buffer(group=pg,
                            num_nvl_bytes=num_nvl_bytes,
                            num_rdma_bytes=num_rdma_bytes,
                            low_latency_mode=low_latency_mode,
                            num_qps_per_rank=num_qps_per_rank)
    return DeepEPHTPrepareAndFinalize(buffer=buffer,
                                      world_size=pgi.world_size,
                                      rank=pgi.rank,
                                      dp_size=dp_size,
                                      rank_expert_offset=pgi.rank *
                                      ht_args.num_local_experts,
                                      quant_dtype=q_dtype,
                                      block_shape=block_shape)


def make_deepep_ll_a2a(pg: ProcessGroup,
                       pgi: ProcessGroupInfo,
                       dp_size: int,
                       deepep_ll_args: DeepEPLLArgs,
                       q_dtype: Optional[torch.dtype] = None,
                       block_shape: Optional[list[int]] = None):

    import deep_ep

    # low-latency a2a
    num_rdma_bytes = deep_ep.Buffer.get_low_latency_rdma_size_hint(
        deepep_ll_args.max_tokens_per_rank, deepep_ll_args.hidden_size,
        pgi.world_size, deepep_ll_args.num_experts)

    buffer = deep_ep.Buffer(group=pg,
                            num_rdma_bytes=num_rdma_bytes,
                            low_latency_mode=True,
                            num_qps_per_rank=deepep_ll_args.num_experts //
                            pgi.world_size)
168

169
170
171
172
173
174
    return DeepEPLLPrepareAndFinalize(
        buffer=buffer,
        world_size=pgi.world_size,
        dp_size=dp_size,
        max_tokens_per_rank=deepep_ll_args.max_tokens_per_rank,
        quant_dtype=q_dtype,
175
        block_shape=block_shape,
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
        use_fp8_dispatch=deepep_ll_args.use_fp8_dispatch,
    )


def make_deepep_a2a(pg: ProcessGroup,
                    pgi: ProcessGroupInfo,
                    dp_size: int,
                    deepep_ht_args: Optional[DeepEPHTArgs],
                    deepep_ll_args: Optional[DeepEPLLArgs],
                    q_dtype: Optional[torch.dtype] = None,
                    block_shape: Optional[list[int]] = None):
    if deepep_ht_args is not None:
        assert deepep_ll_args is None
        return make_deepep_ht_a2a(pg, pgi, dp_size, deepep_ht_args, q_dtype,
                                  block_shape)

    assert deepep_ll_args is not None
193
194
    return make_deepep_ll_a2a(pg, pgi, dp_size, deepep_ll_args, q_dtype,
                              block_shape)