xpu.py 13.6 KB
Newer Older
1
# SPDX-License-Identifier: Apache-2.0
2
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3

4
import contextlib
5
import os
6
from typing import TYPE_CHECKING
7

8
9
import torch

10
11
12
13
14
# import custom ops, trigger op registration
import vllm_xpu_kernels._C  # noqa
import vllm_xpu_kernels._moe_C  # noqa
import vllm_xpu_kernels._xpu_C  # noqa

15
import vllm.envs as envs
16
from vllm.logger import init_logger
17
from vllm.utils.torch_utils import supports_xpu_graph
18
from vllm.v1.attention.backends.registry import AttentionBackendEnum
19

20
from .interface import DeviceCapability, Platform, PlatformEnum
21

22
if TYPE_CHECKING:
23
    from vllm.config import VllmConfig
24
    from vllm.config.kernel import IrOpPriorityConfig
25
    from vllm.v1.attention.selector import AttentionSelectorConfig
26
27
28
else:
    VllmConfig = None

29
logger = init_logger(__name__)
30
31
32
33


class XPUPlatform(Platform):
    _enum = PlatformEnum.XPU
34
    device_name: str = "xpu"
35
    device_type: str = "xpu"
36
    dispatch_key: str = "XPU"
37
38
39
    # Intel XPU's device key is "GPU" for Ray.
    # see https://github.com/ray-project/ray/blob/6a5eb5865eeb9ccf058a79b44f107e327e360673/python/ray/_private/accelerators/intel_gpu.py#L20 # noqa: E501
    ray_device_key: str = "GPU"
40
    dist_backend: str = "xccl"  # xccl only
41
    device_control_env_var: str = "ZE_AFFINITY_MASK"
42

43
    @classmethod
44
45
46
47
    def import_kernels(cls) -> None:
        # Do not import vllm._C
        with contextlib.suppress(ImportError):
            import vllm._moe_C  # noqa: F401
48

49
    @classmethod
    def get_attn_backend_cls(
        cls,
        selected_backend: "AttentionBackendEnum",
        attn_selector_config: "AttentionSelectorConfig",
        num_heads: int | None = None,
    ) -> str:
        """Pick the attention backend for XPU and return its import path.

        The KV-cache layout is first forced to ``"NHD"``. The backend is then
        chosen in this order:

        1. ``use_sparse`` set -> XPU MLA sparse backend.
        2. ``use_mla`` set -> Triton MLA backend.
        3. ``selected_backend`` is ``TRITON_ATTN`` -> Triton backend.
        4. ``dtype`` is ``torch.float32`` -> Triton backend (fallback, with a
           warning), even if Flash Attention was requested.
        5. ``selected_backend`` is ``FLASH_ATTN`` -> Flash Attention backend.
        6. Any other truthy ``selected_backend`` -> ``ValueError``.
        7. Nothing selected -> Flash Attention backend.

        Args:
            selected_backend: Backend explicitly requested by the caller; a
                falsy value means "no preference".
            attn_selector_config: Supplies ``dtype``, ``use_sparse`` and
                ``use_mla``.
            num_heads: Not used by this implementation.

        Returns:
            The path returned by ``get_path()`` of the chosen backend.

        Raises:
            ValueError: If ``selected_backend`` names a backend that is not
                handled on this platform.
        """
        from vllm.v1.attention.backends.utils import set_kv_cache_layout

        set_kv_cache_layout("NHD")
        logger.info(
            "Setting VLLM_KV_CACHE_LAYOUT to 'NHD' for XPU; "
            "only NHD layout is supported by XPU attention kernels."
        )

        dtype = attn_selector_config.dtype
        if attn_selector_config.use_sparse:
            logger.info_once("Using XPU MLA Sparse backend.")
            return AttentionBackendEnum.XPU_MLA_SPARSE.get_path()
        if attn_selector_config.use_mla:
            logger.info_once("Using Triton MLA backend on V1 engine.")
            return AttentionBackendEnum.TRITON_MLA.get_path()
        if selected_backend == AttentionBackendEnum.TRITON_ATTN:
            logger.info_once("Using Triton backend.")
            return AttentionBackendEnum.TRITON_ATTN.get_path()
        elif dtype == torch.float32:
            logger.warning_once(
                "Flash Attention on XPU does not support float32 dtype. "
                "Falling back to Triton Attention backend."
            )
            return AttentionBackendEnum.TRITON_ATTN.get_path()
        elif selected_backend == AttentionBackendEnum.FLASH_ATTN:
            logger.info_once("Using Flash Attention backend.")
            return AttentionBackendEnum.FLASH_ATTN.get_path()
        elif selected_backend:
            # use_mla is always False here (the MLA branch returned above),
            # so the rejected backend is appended to make the error
            # actionable. The original text is kept as a prefix.
            raise ValueError(
                f"Invalid attention backend for {cls.device_name}, "
                f"with use_mla: {attn_selector_config.use_mla} "
                f"(selected_backend: {selected_backend})"
            )

        logger.info("Using Flash Attention backend.")
        return AttentionBackendEnum.FLASH_ATTN.get_path()
91

92
93
94
95
    @classmethod
    def get_supported_vit_attn_backends(cls) -> list["AttentionBackendEnum"]:
        """Return the attention backends accepted for ViT attention on XPU.

        The list holds, in this order, ``FLASH_ATTN``, ``TRITON_ATTN`` and
        ``TORCH_SDPA``; a new list is built on every call.
        """
        supported = [
            AttentionBackendEnum.FLASH_ATTN,
            AttentionBackendEnum.TRITON_ATTN,
            AttentionBackendEnum.TORCH_SDPA,
        ]
        return supported

    @classmethod
    def get_vit_attn_backend(
        cls,
        head_size: int,
        dtype: torch.dtype,
        backend: "AttentionBackendEnum | None" = None,
    ) -> "AttentionBackendEnum":
        """Resolve the backend used for ViT attention.

        Args:
            head_size: Not consulted by this implementation.
            dtype: Not consulted by this implementation.
            backend: Explicitly requested backend, or ``None`` to use the
                default.

        Returns:
            ``backend`` when one is given and supported, otherwise
            ``AttentionBackendEnum.FLASH_ATTN``.

        Raises:
            AssertionError: If ``backend`` is given but is not one of
                ``get_supported_vit_attn_backends()``.
        """
        # No explicit request: fall back to Flash Attention.
        if backend is None:
            logger.info_once(
                f"Using backend {AttentionBackendEnum.FLASH_ATTN} for vit attention"
            )
            return AttentionBackendEnum.FLASH_ATTN

        assert backend in cls.get_supported_vit_attn_backends(), (
            f"Backend {backend} is not supported for vit attention. "
            f"Supported backends are: "
            f"{cls.get_supported_vit_attn_backends()}."
        )
        logger.info_once(f"Using backend {backend} for vit attention")
        return backend

121
122
123
124
125
126
127
    @classmethod
    def set_device(cls, device: torch.device) -> None:
        """Make ``device`` the current XPU device via ``torch.xpu.set_device``."""
        torch.xpu.set_device(device)

128
    @classmethod
129
    def get_device_capability(
130
131
        cls,
        device_id: int = 0,
132
    ) -> DeviceCapability | None:
133
134
135
        # capacity format differs from cuda's and will cause unexpected
        # failure, so use None directly
        return None
136

137
138
    @classmethod
    def get_device_name(cls, device_id: int = 0) -> str:
        """Return the name ``torch.xpu`` reports for device ``device_id``."""
        name = torch.xpu.get_device_name(device_id)
        return name
140

141
142
    @classmethod
    def get_punica_wrapper(cls) -> str:
143
144
145
146
147
        xpu_use_triton_kernel = os.getenv("XPU_USE_TRITON_KERNEL", "0") == "1"
        if not xpu_use_triton_kernel:
            return "vllm.lora.punica_wrapper.punica_xpu.PunicaWrapperXPU"
        else:
            return "vllm.lora.punica_wrapper.punica_gpu.PunicaWrapperGPU"
148

149
150
151
152
    @classmethod
    def get_device_total_memory(cls, device_id: int = 0) -> int:
        """Return ``total_memory`` from the properties of device ``device_id``."""
        return torch.xpu.get_device_properties(device_id).total_memory
153

154
155
    @classmethod
    def inference_mode(cls):
156
        return torch.no_grad()
157

158
159
160
161
    @classmethod
    def get_static_graph_wrapper_cls(cls) -> str:
        return "vllm.compilation.cuda_graph.CUDAGraphWrapper"

162
163
    @classmethod
    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
        """Adjust ``vllm_config`` in place for the XPU platform.

        The following updates are applied:

        * ``compilation_config.compile_sizes`` becomes ``[]`` when unset.
        * ``attention_config.backend`` defaults to ``FLASH_ATTN`` when unset.
        * ``compilation_config.cudagraph_mode`` is forced to ``NONE`` when
          ``supports_xpu_graph()`` is false, when
          ``VLLM_XPU_ENABLE_XPU_GRAPH`` is not enabled, or when
          ``world_size_across_dp > 1``. Otherwise, with the Flash Attention
          backend, any mode other than ``NONE``/``PIECEWISE`` is changed to
          ``PIECEWISE``.
        * ``parallel_config.worker_cls`` is set to the XPU worker if it is
          still ``"auto"``.
        * ``kv_transfer_config.enable_permute_local_kv`` is set to ``True``
          when a KV-transfer config is present.
        * The ``UCX_MEMTYPE_CACHE`` environment variable is set to ``"n"``.
        """
        parallel_config = vllm_config.parallel_config

        # lazy import to avoid circular import
        from vllm.config import CUDAGraphMode

        compilation_config = vllm_config.compilation_config
        if compilation_config.compile_sizes is None:
            compilation_config.compile_sizes = []

        attention_config = vllm_config.attention_config
        if attention_config.backend is None:
            attention_config.backend = AttentionBackendEnum.FLASH_ATTN

        # Decide whether XPU graphs must be turned off altogether. The checks
        # run in a fixed order and stop at the first one that applies.
        if not supports_xpu_graph():
            graphs_off_reason = (
                "XPU Graph is not supported in the current PyTorch version, "
                "disabling cudagraph_mode."
            )
        elif not envs.VLLM_XPU_ENABLE_XPU_GRAPH:
            graphs_off_reason = (
                "XPU Graph is disabled by environment variable, "
                "please set VLLM_XPU_ENABLE_XPU_GRAPH=1 to enable it."
            )
        elif parallel_config.world_size_across_dp > 1:
            graphs_off_reason = (
                "XPU Graph doesn't support capture communication ops, "
                "disabling cudagraph_mode."
            )
        else:
            graphs_off_reason = None

        if graphs_off_reason is not None:
            compilation_config.cudagraph_mode = CUDAGraphMode.NONE
            logger.warning(graphs_off_reason)
        elif (
            attention_config.backend == AttentionBackendEnum.FLASH_ATTN
            and compilation_config.cudagraph_mode
            not in {CUDAGraphMode.NONE, CUDAGraphMode.PIECEWISE}
        ):
            compilation_config.cudagraph_mode = CUDAGraphMode.PIECEWISE
            logger.warning(
                "FMHA sycl-tla kernels cannot be captured with XPU graphs, "
                "falling back to PIECEWISE graph mode on XPU platform."
            )

        # check and update parallel config
        # Only override worker_cls if it's still the default "auto"
        # This allows custom workers (like vllm-omni workers) to be used on XPU
        if parallel_config.worker_cls == "auto":
            parallel_config.worker_cls = "vllm.v1.worker.xpu_worker.XPUWorker"

        kv_transfer_config = vllm_config.kv_transfer_config
        if kv_transfer_config is not None:
            kv_transfer_config.enable_permute_local_kv = True

        # In some cases, the internal memory type cache can misdetect GPU
        # memory as host memory, also leading to invalid memory access.
        # This cache can be disabled by setting UCX_MEMTYPE_CACHE=n.
        # ref. https://openucx.readthedocs.io/en/master/faq.html
        os.environ["UCX_MEMTYPE_CACHE"] = "n"

221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
    @classmethod
    def update_block_size_for_backend(cls, vllm_config: "VllmConfig") -> None:
        """Round the cache block size up to a multiple of 64 for GDN layers.

        The base-class adjustment runs first. If any attention layer in
        ``vllm_config`` reports the backend name ``"GDN_ATTN"``,
        ``cache_config.block_size`` is rounded up to the next multiple of 64.
        When that changes the value:

        * ``cache_config.mamba_block_size`` is set to the new block size if
          ``mamba_cache_mode`` is ``"align"``;
        * ``cache_config.mamba_page_size_padded``, if set, is rescaled from
          the old block size to the new one (per-token size times new block
          size).

        Nothing is changed when no GDN layer is present or the block size is
        already a multiple of 64.

        Args:
            vllm_config: Configuration whose ``cache_config`` is updated in
                place.
        """
        super().update_block_size_for_backend(vllm_config)
        from vllm.config.vllm import get_layers_from_vllm_config
        from vllm.model_executor.layers.attention_layer_base import (
            AttentionLayerBase,
        )
        from vllm.utils.math_utils import cdiv

        cache_config = vllm_config.cache_config
        # special fix for GDN since kernel only supports block size divisible
        # by 64
        attn_layers = get_layers_from_vllm_config(
            vllm_config,
            AttentionLayerBase,  # type: ignore[type-abstract]
        )

        uses_gdn = any(
            layer.get_attn_backend().get_name() == "GDN_ATTN"
            for layer in attn_layers.values()
        )
        if not uses_gdn:
            return

        kernel_block_size = 64
        new_block_size = (
            cdiv(cache_config.block_size, kernel_block_size) * kernel_block_size
        )
        if new_block_size == cache_config.block_size:
            return

        if cache_config.mamba_cache_mode == "align":
            cache_config.mamba_block_size = new_block_size
        original_mamba_page_size_padded = cache_config.mamba_page_size_padded
        if cache_config.mamba_page_size_padded is not None:
            attn_page_size_1_token = (
                cache_config.mamba_page_size_padded // cache_config.block_size
            )
            cache_config.mamba_page_size_padded = (
                new_block_size * attn_page_size_1_token
            )
        cache_config.block_size = new_block_size
        # mamba_page_size_padded may be None (see the check above); "%d" would
        # then fail while formatting the record, so those two fields use "%s",
        # which renders integers identically.
        logger.info(
            "[XPU]Setting attention block size to %d tokens to ensure multiple of %d, "
            "set mamba_page_size_padded to %s bytes accordingly, before was %s bytes.",
            new_block_size,
            kernel_block_size,
            cache_config.mamba_page_size_padded,
            original_mamba_page_size_padded,
        )

272
273
274
275
    @classmethod
    def support_hybrid_kv_cache(cls) -> bool:
        return True

276
277
    @classmethod
    def support_static_graph_mode(cls) -> bool:
278
        return True
279

280
281
    @classmethod
    def is_pin_memory_available(cls):
282
        return True
283
284

    @classmethod
    def get_current_memory_usage(
        cls, device: torch.types.Device | None = None
    ) -> float:
        """Return ``torch.xpu.max_memory_allocated`` after a cache flush.

        The XPU cache is emptied and the peak-memory statistics of ``device``
        are reset before the value is read.
        """
        torch.xpu.empty_cache()
        torch.xpu.reset_peak_memory_stats(device)
        allocated = torch.xpu.max_memory_allocated(device)
        return allocated
291

292
293
    @classmethod
    def fp8_dtype(cls) -> torch.dtype:
294
        return torch.float8_e4m3fn
295

296
297
298
299
300
    @classmethod
    def is_data_center_gpu(cls) -> bool:
        device_name = cls.get_device_name().lower()
        return device_name.count("data center gpu") > 0

301
302
    @classmethod
    def get_device_communicator_cls(cls) -> str:
        """Return the import path of the XPU device communicator class.

        A warning is logged when ``supports_xccl()`` is false, but the same
        path is returned either way.
        """
        from vllm.utils.torch_utils import supports_xccl

        xccl_available = supports_xccl()
        if not xccl_available:
            logger.warning(
                "xccl is not enabled in this torch build, communication"
                " is not available."
            )
        return "vllm.distributed.device_communicators.xpu_communicator.XpuCommunicator"  # noqa
311

312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
    @classmethod
    def get_default_ir_op_priority(
        cls, vllm_config: "VllmConfig"
    ) -> "IrOpPriorityConfig":
        """Build the default IR-op priority configuration for XPU.

        When compiling with the ``"inductor"`` backend (compilation mode other
        than ``NONE``) only ``"native"`` is used; otherwise ``"xpu_kernels"``
        is preferred with ``"native"`` as the fallback.
        """
        from vllm.config.compilation import CompilationMode
        from vllm.config.kernel import IrOpPriorityConfig

        # Native used by default when compiling,
        # use fused kernels where available when no codegen
        compilation = vllm_config.compilation_config
        if (
            compilation.backend == "inductor"
            and compilation.mode != CompilationMode.NONE
        ):
            priority = ["native"]
        else:
            priority = ["xpu_kernels", "native"]

        return IrOpPriorityConfig.with_default(priority)

327
328
    @classmethod
    def device_count(cls) -> int:
329
        return torch.xpu.device_count()
330
331

    @classmethod
332
333
    def check_if_supports_dtype(cls, dtype: torch.dtype):
        if dtype == torch.bfloat16:  # noqa: SIM102
334
335
336
337
338
339
            device_name = cls.get_device_name().lower()
            # client gpu a770
            if device_name.count("a770") > 0:
                raise ValueError(
                    "Intel Arc A770 have bfloat16 accuracy known issue. "
                    "You can use float16 instead by explicitly setting the "
340
341
                    "`dtype` flag in CLI, for example: --dtype=half."
                )
342
343
344
345

    @classmethod
    def opaque_attention_op(cls) -> bool:
        return True
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369

    @classmethod
    def insert_blocks_to_device(
        cls,
        src_cache: torch.Tensor,
        dst_cache: torch.Tensor,
        src_block_indices: torch.Tensor,
        dst_block_indices: torch.Tensor,
    ) -> None:
        """Copy blocks from src_cache to dst_cache on XPU."""
        _src_cache = src_cache[:, src_block_indices]
        dst_cache[:, dst_block_indices] = _src_cache.to(dst_cache.device)

    @classmethod
    def swap_out_blocks_to_host(
        cls,
        src_cache: torch.Tensor,
        dst_cache: torch.Tensor,
        src_block_indices: torch.Tensor,
        dst_block_indices: torch.Tensor,
    ) -> None:
        """Copy blocks from XPU to host (CPU)."""
        _src_cache = src_cache[:, src_block_indices]
        dst_cache[:, dst_block_indices] = _src_cache.cpu()
370
371
372
373

    @classmethod
    def num_compute_units(cls, device_id: int = 0) -> int:
        """Return ``max_compute_units`` from the properties of ``device_id``."""
        properties = torch.xpu.get_device_properties(device_id)
        return properties.max_compute_units