# symm_mem.py
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import torch
import torch.distributed as dist
from torch.distributed import ProcessGroup

from vllm.distributed.device_communicators.all_reduce_utils import (
    SYMM_MEM_ALL_REDUCE_MAX_SIZES,
)
from vllm.logger import init_logger
from vllm.model_executor.layers.batch_invariant import (
    vllm_is_batch_invariant,
)
from vllm.platforms import current_platform

try:
    import torch.distributed._symmetric_memory as torch_symm_mem

    symm_mem_available = True
except ImportError:
    symm_mem_available = False

logger = init_logger(__name__)


class SymmMemCommunicator:
    _WORLD_SIZES_MULTIMEM = {
        "9.0": [4, 6, 8],
        "10.0": [6, 8],
31
        "10.3": [6, 8],
32
33
    }

34
    def __init__(
35
36
        self,
        group: ProcessGroup,
37
        device: int | str | torch.device,
38
        # add options for testing
39
40
        force_multimem: bool | None = None,
        max_size_override: int | None = None,
41
    ):
42
43
44
45
46
47
        self.disabled = True

        if not symm_mem_available:
            return

        if not current_platform.is_cuda():
48
            logger.warning("SymmMemCommunicator: symmetric memory is not available.")
49
50
51
52
53
            return
        if isinstance(device, int):
            device = torch.device(f"cuda:{device}")
        elif isinstance(device, str):
            device = torch.device(device)
54
        torch.accelerator.set_device_index(device)
55
56
57
58
        self.dtype = torch.bfloat16
        self.device = device
        self.group = group
        self.world_size = dist.get_world_size(self.group)
59
60
61
62
63
64
65
66
        capability = current_platform.get_device_capability()
        if capability is None:
            logger.warning(
                "SymmMemCommunicator: device capability is unknown, "
                "communicator is not available."
            )
            return
        self.device_capability = capability.as_version_str()
67
68
69
70
71
72
73
        if self.device_capability not in SYMM_MEM_ALL_REDUCE_MAX_SIZES:
            logger.warning(
                "SymmMemCommunicator: Device capability %s not supported, "
                "communicator is not available.",
                self.device_capability,
            )
            return
74
        if self.world_size not in SYMM_MEM_ALL_REDUCE_MAX_SIZES[self.device_capability]:
75
76
77
78
79
80
            logger.warning(
                "SymmMemCommunicator: World size %d not supported, "
                "communicator is not available.",
                self.world_size,
            )
            return
81
82
83
84
85
86
87
88
        # Use override max_size if provided, otherwise use default
        if max_size_override is not None:
            self.max_size = max_size_override
            logger.info(
                "SymmMemCommunicator: Using override max_size: %s bytes",
                self.max_size,
            )
        else:
89
90
91
            self.max_size = SYMM_MEM_ALL_REDUCE_MAX_SIZES[self.device_capability][
                self.world_size
            ]
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
        try:
            self.buffer = torch_symm_mem.empty(
                self.max_size // self.dtype.itemsize,
                device=self.device,
                dtype=self.dtype,
            )
            handle = torch_symm_mem.rendezvous(self.buffer, self.group.group_name)
        except RuntimeError as e:
            logger.warning_once(
                "SymmMemCommunicator: symmetric memory initialization failed: %s "
                "Communicator is not available. To suppress this warning set "
                "VLLM_ALLREDUCE_USE_SYMM_MEM=0",
                str(e),
            )
            return
107
        if handle.multicast_ptr == 0:
108
109
110
111
            logger.warning(
                "SymmMemCommunicator: symmetric memory "
                "multicast operations are not supported."
            )
112
            return
113
        self.force_multimem = force_multimem
114
        self.disabled = False
115
        if vllm_is_batch_invariant():
116
            self.disabled = True
117
118
119
120
121
122
123
124
125
126
127
128

    def should_use_symm_mem(self, inp: torch.Tensor):
        if self.disabled:
            return False
        if inp.dtype != self.dtype:
            return False
        inp_size = inp.numel() * inp.element_size()
        if inp_size % 4 != 0:
            return False
        return inp_size < self.max_size

    def all_reduce(
129
130
        self, inp: torch.Tensor, *, out: torch.Tensor | None = None
    ) -> torch.Tensor | None:
131
132
133
134
        if not self.should_use_symm_mem(inp):
            return None
        if out is None:
            out = torch.empty_like(inp)
135
        self.buffer[: inp.numel()].copy_(inp.view(-1))
136
137
138
139
140
141
142
143

        # Determine which algorithm to use
        use_multimem = False
        if self.force_multimem is not None:
            # Test override: use forced setting
            use_multimem = self.force_multimem
        else:
            # Normal logic: use multimem for supported world sizes
144
145
146
            use_multimem = (
                self.world_size in self._WORLD_SIZES_MULTIMEM[self.device_capability]
            )
147
148

        if use_multimem:
149
150
151
            torch.ops.symm_mem.multimem_all_reduce_(
                self.buffer[: inp.numel()], "sum", self.group.group_name
            )
152
        else:
153
154
155
156
            torch.ops.symm_mem.two_shot_all_reduce_(
                self.buffer[: inp.numel()], "sum", self.group.group_name
            )
        out.copy_(self.buffer[: inp.numel()].view(out.shape))
157
        return out