dp_utils.py 8.72 KB
Newer Older
1
2
3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

4

5
6
7
8
import numpy as np
import torch
import torch.distributed as dist

9
import vllm.envs as envs
10
from vllm.config import ParallelConfig
11
from vllm.distributed.parallel_state import get_dp_group
12
13
14
from vllm.logger import init_logger
from vllm.v1.worker.ubatch_utils import (
    check_ubatch_thresholds,
15
    is_last_ubatch_empty,
16
17
18
19
20
21
)

logger = init_logger(__name__)


def _get_device_and_group(parallel_config: ParallelConfig):
    """Pick the device and process group for the DP synchronization all-reduce.

    By default the DP group's own device (the actual device assigned to the
    group, not just a device type) and its device-side process group are used.
    When ``parallel_config.disable_nccl_for_dp_synchronization`` is set, the
    DP group's CPU process group is used instead and the device is ``"cpu"``.

    Args:
        parallel_config: Parallel config; only
            ``disable_nccl_for_dp_synchronization`` is read here.

    Returns:
        A ``(device, group)`` pair.
    """
    dp_group = get_dp_group()

    if not parallel_config.disable_nccl_for_dp_synchronization:
        return dp_group.device, dp_group.device_group

    # Reading the reduced tensor back from the GPU introduces a GPU sync
    # point, which can hurt performance when vLLM runs with async scheduling.
    # The config flag is a quick way to fall back to a CPU all-reduce if we
    # run into that case.
    logger.info_once(
        "Using CPU all reduce to synchronize DP padding between ranks."
    )
    return "cpu", dp_group.cpu_group


def _run_ar(
    should_ubatch: bool,
    should_dp_pad: bool,
    orig_num_tokens_per_ubatch: int,
    padded_num_tokens_per_ubatch: int,
    cudagraph_mode: int,
    parallel_config: ParallelConfig,
) -> torch.Tensor:
    """Exchange this rank's batch metadata with all other DP ranks.

    Builds an int32 tensor of shape ``(5, dp_size)`` that is zero everywhere
    except in this rank's column, then all-reduces it over the DP group.
    ``dist.all_reduce`` is called without an explicit op, so the default
    reduction applies; with every rank writing only its own column, each
    column of the result carries the values contributed by that rank.

    Row layout of the returned tensor:
        0: unpadded token count
        1: padded token count
        2: 1 if the rank wants to microbatch, else 0
        3: 1 if the rank wants DP padding, else 0
        4: the rank's cudagraph mode

    Args:
        should_ubatch: Whether this rank wants to microbatch.
        should_dp_pad: Whether this rank wants DP padding.
        orig_num_tokens_per_ubatch: Token count before padding.
        padded_num_tokens_per_ubatch: Token count after padding.
        cudagraph_mode: This rank's cudagraph mode as an int.
        parallel_config: Supplies ``data_parallel_size`` and
            ``data_parallel_rank`` and selects the device/group.

    Returns:
        The all-reduced ``(5, dp_size)`` int32 tensor.
    """
    num_ranks = parallel_config.data_parallel_size
    my_rank = parallel_config.data_parallel_rank
    device, group = _get_device_and_group(parallel_config)

    # One entry per row of the exchanged tensor, in row order.
    local_values = (
        orig_num_tokens_per_ubatch,
        padded_num_tokens_per_ubatch,
        1 if should_ubatch else 0,
        1 if should_dp_pad else 0,
        cudagraph_mode,
    )

    payload = torch.zeros(
        len(local_values), num_ranks, device=device, dtype=torch.int32
    )
    for row, value in enumerate(local_values):
        payload[row, my_rank] = value

    dist.all_reduce(payload, group=group)
    return payload


60
def _post_process_ubatch(tensor: torch.Tensor, num_ubatches: int) -> bool:
    """Decide from the all-reduced metadata whether every rank microbatches.

    Args:
        tensor: The synchronized metadata tensor. Row 0 is read as the
            unpadded token counts, row 1 as the padded token counts and
            row 2 as the per-rank "wants to microbatch" flags.
        num_ubatches: Number of microbatches, forwarded to
            ``is_last_ubatch_empty``.

    Returns:
        True only if every rank's flag in row 2 equals 1 and
        ``is_last_ubatch_empty`` does not report an empty last microbatch
        for the smallest unpadded / largest padded token counts.
    """
    # Microbatching is all-or-nothing: a single dissenting rank vetoes it.
    unanimous = bool((tensor[2] == 1).all().item())
    if not unanimous:
        return False

    # Every rank wants to microbatch; make sure that would not leave an
    # "empty" last microbatch.
    min_unpadded = int(tensor[0].min().item())
    max_padded = int(tensor[1].max().item())
    if not is_last_ubatch_empty(min_unpadded, max_padded, num_ubatches):
        return True

    logger.debug("Aborting ubatching %s %s", min_unpadded, max_padded)
    return False


80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
def _post_process_dp_padding(tensor: torch.Tensor, should_dp_pad: bool) -> torch.Tensor:
    """Compute the per-rank token counts after optional DP padding.

    Args:
        tensor: The synchronized metadata tensor; row 1 is read as the
            padded token count of each rank.
        should_dp_pad: Whether every rank must run the same number of tokens.

    Returns:
        A CPU tensor with one entry per rank. With ``should_dp_pad`` set,
        it is a new int32 tensor in which every entry is the maximum of
        row 1. Otherwise it is row 1 itself, moved to the CPU.
    """
    per_rank_tokens = tensor[1]

    if not should_dp_pad:
        return per_rank_tokens.cpu()

    # With DP padding every rank is padded up to the largest token count.
    largest = int(per_rank_tokens.max().item())
    return torch.full(
        (per_rank_tokens.shape[0],),
        largest,
        dtype=torch.int32,
        device="cpu",
    )


95
96
97
98
99
100
101
102
103
def _post_process_cudagraph_mode(tensor: torch.Tensor) -> int:
    """Reduce the per-rank cudagraph modes to a single mode shared by all.

    Row 4 of ``tensor`` is read as the cudagraph mode of each DP rank and
    the smallest value is returned. Consequently, if any rank reports
    NONE (0), every rank ends up with NONE, so all ranks send consistent
    values (all padded or all unpadded).

    Args:
        tensor: The synchronized metadata tensor.

    Returns:
        The minimum of row 4 as a Python int.
    """
    modes_per_rank = tensor[4]
    return int(torch.min(modes_per_rank).item())


104
105
106
107
def _synchronize_dp_ranks(
    num_tokens_unpadded: int,
    num_tokens_padded: int,
    should_attempt_ubatching: bool,
    should_attempt_dp_padding: bool,
    cudagraph_mode: int,
    parallel_config: ParallelConfig,
) -> tuple[bool, torch.Tensor | None, int]:
    """Agree on microbatching, token counts and cudagraph mode across DP ranks.

    A single all-reduce shares each rank's metadata; from the result this
    function:

    1. Decides whether the DP ranks microbatch. Either all ranks run with
       microbatching or none of them do.

    2. Determines the number of tokens each rank will run. When
       microbatching, or when ``should_attempt_dp_padding`` is True, all
       ranks are padded out so that they run with the same number of tokens.

    3. Synchronizes ``cudagraph_mode`` across ranks by taking the minimum.

    Args:
        num_tokens_unpadded: This rank's token count before padding.
        num_tokens_padded: This rank's token count after padding; must be
            at least ``num_tokens_unpadded``.
        should_attempt_ubatching: Whether this rank wants to microbatch.
        should_attempt_dp_padding: Whether this rank wants DP padding. All
            ranks are expected to pass the same value.
        cudagraph_mode: This rank's cudagraph mode.
        parallel_config: The parallel config.

    Returns: tuple[
        should_ubatch: Are all DP ranks going to microbatch
        num_tokens_after_padding: A tensor containing the total number of
        tokens per-microbatch for each DP rank including any DP padding.
        synced_cudagraph_mode: The synchronized cudagraph mode (min across ranks)
    ]

    """
    assert num_tokens_padded >= num_tokens_unpadded

    # One all-reduce carries everything the ranks need to agree on: the
    # token counts, the microbatching / DP padding votes and the cudagraph
    # mode.
    synced = _run_ar(
        should_ubatch=should_attempt_ubatching,
        should_dp_pad=should_attempt_dp_padding,
        orig_num_tokens_per_ubatch=num_tokens_unpadded,
        padded_num_tokens_per_ubatch=num_tokens_padded,
        cudagraph_mode=cudagraph_mode,
        parallel_config=parallel_config,
    )

    dp_pad = bool((synced[3] == 1).all().item())

    # DP ranks should all have the same value for should_attempt_dp_padding.
    assert should_attempt_dp_padding == dp_pad

    # Check conditions for microbatching
    should_ubatch = _post_process_ubatch(synced, parallel_config.num_ubatches)

    # Microbatching needs every rank to run the same number of tokens, so it
    # overrides a disabled DP padding setting.
    if should_ubatch and not dp_pad:
        logger.debug_once(
            "Microbatching has been triggered and requires DP padding. "
            "Enabling DP padding even though it has been explicitly "
            "disabled.",
            scope="global",
        )
        dp_pad = True

    return (
        should_ubatch,
        # Pads all DP ranks up to the maximum token count across ranks when
        # dp_pad is True.
        _post_process_dp_padding(synced, dp_pad),
        # Minimum cudagraph mode across ranks.
        _post_process_cudagraph_mode(synced),
    )
172
173
174
175
176


def coordinate_batch_across_dp(
    num_tokens_unpadded: int,
    allow_microbatching: bool,
    allow_dp_padding: bool,
    parallel_config: ParallelConfig,
    num_tokens_padded: int | None = None,
    uniform_decode: bool | None = None,
    num_scheduled_tokens_per_request: np.ndarray | None = None,
    cudagraph_mode: int = 0,
) -> tuple[bool, torch.Tensor | None, int]:
    """
    Coordinates amongst all DP ranks to determine if and how the full batch
    should be split into microbatches.

    No coordination happens when ``parallel_config.data_parallel_size`` is 1
    or when ``envs.VLLM_ALL2ALL_BACKEND`` is ``"deepep_low_latency"``; in that
    case ``(False, None, cudagraph_mode)`` is returned immediately.

    Args:
        num_tokens_unpadded: Number of tokens without accounting for padding
        allow_microbatching: If microbatching should be attempted
        allow_dp_padding: If all DP ranks should be padded up to the same value
        parallel_config: The parallel config
        num_tokens_padded: Number of tokens including any non-DP padding (CUDA
            graphs, TP, etc). Defaults to ``num_tokens_unpadded`` when None.
        uniform_decode: Must not be None if allow_microbatching is True. True
            if the batch only contains single token decodes
        num_scheduled_tokens_per_request: The number of tokens per request.
            Accepted as part of the signature but not read by this function.
        cudagraph_mode: The cudagraph mode for this rank (0=NONE, 1=PIECEWISE,
            2=FULL)

    Returns: tuple[
        should_ubatch: True if all DP ranks have agreed to microbatch
        num_tokens_after_padding: A tensor containing the total number of
        tokens per-microbatch for each DP rank including padding. Will be
        padded up to the max value across all DP ranks when allow_dp_padding
        is True. None on the early-exit path.
        synced_cudagraph_mode: The synchronized cudagraph mode (min across
        ranks); the caller's own cudagraph_mode on the early-exit path.
    ]

    """
    single_dp_rank = parallel_config.data_parallel_size == 1
    if single_dp_rank or envs.VLLM_ALL2ALL_BACKEND == "deepep_low_latency":
        # Early exit.
        return False, None, cudagraph_mode

    if allow_microbatching:
        # The caller has explicitly enabled microbatching: check its
        # preconditions for this rank.
        assert uniform_decode is not None
        wants_ubatch = check_ubatch_thresholds(
            parallel_config,
            num_tokens_unpadded,
            uniform_decode=uniform_decode,
        )
    else:
        wants_ubatch = False

    # Without an explicit padded count, treat the batch as unpadded.
    padded_tokens = (
        num_tokens_unpadded if num_tokens_padded is None else num_tokens_padded
    )

    should_ubatch, num_tokens_after_padding, synced_cudagraph_mode = (
        _synchronize_dp_ranks(
            num_tokens_unpadded=num_tokens_unpadded,
            num_tokens_padded=padded_tokens,
            should_attempt_ubatching=wants_ubatch,
            should_attempt_dp_padding=allow_dp_padding,
            cudagraph_mode=cudagraph_mode,
            parallel_config=parallel_config,
        )
    )
    return should_ubatch, num_tokens_after_padding, synced_cudagraph_mode