Merge tag 'v0.7.2' into v0.7.2-dev

66b809cc · zhuwenwen · 37b63c24 · 0408efc6 · 66b809cc · 66b809cc
Commit 66b809cc authored Feb 08, 2025 by zhuwenwen
20 changed files
--- a/vllm/platforms/rocm.py
+++ b/vllm/platforms/rocm.py
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 from functools import lru_cache
 from typing import TYPE_CHECKING, Dict, List, Optional
@@ -77,6 +79,9 @@ class RocmPlatform(Platform):
    def get_attn_backend_cls(cls, selected_backend, head_size, dtype,
                             kv_cache_dtype, block_size, use_v1,
                             use_mla) -> str:
+        if use_mla:
+            logger.info("Using Triton MLA backend.")
+            return "vllm.attention.backends.triton_mla.TritonMLABackend"
        selected_backend = (_Backend.ROCM_FLASH if selected_backend
                            == _Backend.FLASH_ATTN else selected_backend)
        if selected_backend == _Backend.ROCM_FLASH:

--- a/vllm/platforms/tpu.py
+++ b/vllm/platforms/tpu.py
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import TYPE_CHECKING, Optional

 import torch

--- a/vllm/platforms/xpu.py
+++ b/vllm/platforms/xpu.py
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import TYPE_CHECKING, Optional

 import torch
@@ -66,9 +68,14 @@ class XPUPlatform(Platform):
        # check and update model config
        model_config = vllm_config.model_config
        if model_config.dtype == torch.bfloat16:
-            logger.warning(
-                "bfloat16 is not fully supported on XPU, casting to float16.")
-            model_config.dtype = torch.float16
+            bf16_supported = cls.device_support_bf16()
+            if not bf16_supported:
+                logger.warning(
+                    "bfloat16 is only supported on Intel Data Center GPU, "
+                    "Intel Arc GPU is not supported yet. Your device is %s,"
+                    "which is not supported. will fallback to float16",
+                    cls.get_device_name())
+                model_config.dtype = torch.float16
        if not model_config.enforce_eager:
            logger.warning(
                "CUDA graph is not supported on XPU, fallback to the eager "
@@ -116,3 +123,15 @@ class XPUPlatform(Platform):
                                 ) -> float:
        torch.xpu.reset_peak_memory_stats(device)
        return torch.xpu.max_memory_allocated(device)
+
+    @classmethod
+    def device_support_bf16(cls) -> bool:
+        """Report whether the current XPU device is treated as bf16-capable.
+
+        The decision is made purely from the lower-cased device name
+        returned by ``cls.get_device_name()``:
+
+        * a name containing ``"arc"`` yields ``False``;
+        * otherwise a name containing ``"data center gpu"`` yields ``True``;
+        * any other name logs a warning and yields ``False``.
+
+        The ``"arc"`` check runs first, so it wins if both substrings occur.
+        """
+        name = cls.get_device_name().lower()
+
+        # Guard clauses, evaluated in the same order as the name checks
+        # listed in the docstring.
+        if "arc" in name:
+            return False
+        if "data center gpu" in name:
+            return True
+
+        # Unrecognised device: be conservative and report no bf16 support.
+        logger.warning("Unknown device name %s, always use float16", name)
+        return False
--- a/vllm/plugins/__init__.py
+++ b/vllm/plugins/__init__.py
+# SPDX-License-Identifier: Apache-2.0
+
 import logging
 import os
 from typing import Callable, Dict

--- a/vllm/pooling_params.py
+++ b/vllm/pooling_params.py
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import Any, Optional

 import msgspec

--- a/vllm/profiler/__init__.py
+++ b/vllm/profiler/__init__.py
+# SPDX-License-Identifier: Apache-2.0
+
 from .layerwise_profile import layerwise_profile

 __all__ = [

--- a/vllm/profiler/layerwise_profile.py
+++ b/vllm/profiler/layerwise_profile.py
+# SPDX-License-Identifier: Apache-2.0
+
 import copy
 from collections import defaultdict
 from dataclasses import asdict, dataclass, field

--- a/vllm/profiler/utils.py
+++ b/vllm/profiler/utils.py
+# SPDX-License-Identifier: Apache-2.0
+
 import dataclasses
 from typing import Callable, Dict, List, Type, Union


--- a/vllm/prompt_adapter/layers.py
+++ b/vllm/prompt_adapter/layers.py
+# SPDX-License-Identifier: Apache-2.0
+
 from dataclasses import dataclass
 from typing import Optional


--- a/vllm/prompt_adapter/models.py
+++ b/vllm/prompt_adapter/models.py
+# SPDX-License-Identifier: Apache-2.0
+
 import logging
 import math
 from typing import Any, Callable, Dict, List, Optional, Type

--- a/vllm/prompt_adapter/request.py
+++ b/vllm/prompt_adapter/request.py
+# SPDX-License-Identifier: Apache-2.0
+
 import msgspec

 from vllm.adapter_commons.request import AdapterRequest

--- a/vllm/prompt_adapter/utils.py
+++ b/vllm/prompt_adapter/utils.py
+# SPDX-License-Identifier: Apache-2.0
+
 # code borrowed from: https://github.com/huggingface/peft/blob/v0.12.0/src/peft/utils/save_and_load.py#L420

 import os

--- a/vllm/prompt_adapter/worker_manager.py
+++ b/vllm/prompt_adapter/worker_manager.py
+# SPDX-License-Identifier: Apache-2.0
+
 import logging
 from typing import Any, Optional, Set, Type


--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
+# SPDX-License-Identifier: Apache-2.0
 """Sampling parameters for text generation."""
 import copy
 from dataclasses import dataclass

--- a/vllm/scalar_type.py
+++ b/vllm/scalar_type.py
+# SPDX-License-Identifier: Apache-2.0
+
 import functools
 import struct
 from dataclasses import dataclass

--- a/vllm/scripts.py
+++ b/vllm/scripts.py
+# SPDX-License-Identifier: Apache-2.0
+
 # The CLI entrypoint to vLLM.
 import argparse
 import os

--- a/vllm/sequence.py
+++ b/vllm/sequence.py
+# SPDX-License-Identifier: Apache-2.0
 """Sequence and its related classes."""
 import copy
 import enum

--- a/vllm/spec_decode/batch_expansion.py
+++ b/vllm/spec_decode/batch_expansion.py
+# SPDX-License-Identifier: Apache-2.0
+
 from array import array
 from itertools import chain, count
 from typing import Iterator, List, Optional, Tuple

--- a/vllm/spec_decode/draft_model_runner.py
+++ b/vllm/spec_decode/draft_model_runner.py
+# SPDX-License-Identifier: Apache-2.0
+
 from typing import List, Optional

 import torch

--- a/vllm/spec_decode/interfaces.py
+++ b/vllm/spec_decode/interfaces.py
+# SPDX-License-Identifier: Apache-2.0
+
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
 from typing import List, Optional, Set, Union