[Hardware][Intel-Gaudi] Add Intel Gaudi (HPU) inference backend (#6143)

Signed-off-by: yuwenzho <yuwen.zhou@intel.com> Signed-off-by: Chendi.Xue <chendi.xue@intel.com> Signed-off-by: Bob Zhu <bob.zhu@intel.com> Signed-off-by: zehao-intel <zehao.huang@intel.com> Signed-off-by: Konrad Zawora <kzawora@habana.ai> Co-authored-by: Kunshang Ji <kunshang.ji@intel.com> Co-authored-by: Sanju C Sudhakaran <scsudhakaran@habana.ai> Co-authored-by: Michal Adamczyk <madamczyk@habana.ai> Co-authored-by: Marceli Fylcek <mfylcek@habana.ai> Co-authored-by: Himangshu Lahkar <49579433+hlahkar@users.noreply.github.com> Co-authored-by: Vivek Goel <vgoel@habana.ai> Co-authored-by: yuwenzho <yuwen.zhou@intel.com> Co-authored-by: Dominika Olszewska <dolszewska@habana.ai> Co-authored-by: barak goldberg <149692267+bgoldberg-habana@users.noreply.github.com> Co-authored-by: Michal Szutenberg <37601244+szutenberg@users.noreply.github.com> Co-authored-by: Jan Kaniecki <jkaniecki@habana.ai> Co-authored-by: Agata Dobrzynie...

[Hardware][Intel-Gaudi] Add Intel Gaudi (HPU) inference backend (#6143)
Signed-off-by: yuwenzho <yuwen.zhou@intel.com> Signed-off-by: Chendi.Xue <chendi.xue@intel.com> Signed-off-by: Bob Zhu <bob.zhu@intel.com> Signed-off-by: zehao-intel <zehao.huang@intel.com> Signed-off-by: Konrad Zawora <kzawora@habana.ai> Co-authored-by: Kunshang Ji <kunshang.ji@intel.com> Co-authored-by: Sanju C Sudhakaran <scsudhakaran@habana.ai> Co-authored-by: Michal Adamczyk <madamczyk@habana.ai> Co-authored-by: Marceli Fylcek <mfylcek@habana.ai> Co-authored-by: Himangshu Lahkar <49579433+hlahkar@users.noreply.github.com> Co-authored-by: Vivek Goel <vgoel@habana.ai> Co-authored-by: yuwenzho <yuwen.zhou@intel.com> Co-authored-by: Dominika Olszewska <dolszewska@habana.ai> Co-authored-by: barak goldberg <149692267+bgoldberg-habana@users.noreply.github.com> Co-authored-by: Michal Szutenberg <37601244+szutenberg@users.noreply.github.com> Co-authored-by: Jan Kaniecki <jkaniecki@habana.ai> Co-authored-by: Agata Dobrzynie...
a02a50e6 · Konrad Zawora · GitHub · a5fda50a · a02a50e6 · a02a50e6
Unverified Commit a02a50e6 authored Nov 06, 2024 by Konrad Zawora Committed by GitHub Nov 06, 2024
20 changed files
--- a/Dockerfile.hpu
+++ b/Dockerfile.hpu
+FROM vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+
+COPY ./ /workspace/vllm
+
+WORKDIR /workspace/vllm
+
+RUN pip install -v -r requirements-hpu.txt
+
+ENV no_proxy=localhost,127.0.0.1
+ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
+
+RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install
+
+WORKDIR /workspace/
+
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
--- a/docs/source/getting_started/gaudi-installation.rst
+++ b/docs/source/getting_started/gaudi-installation.rst
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -43,7 +43,7 @@ vLLM is flexible and easy to use with:
 * Tensor parallelism and pipeline parallelism support for distributed inference
 * Streaming outputs
 * OpenAI-compatible API server
-* Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Trainium and Inferentia Accelerators.
+* Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs, Gaudi® accelerators and GPUs, PowerPC CPUs, TPU, and AWS Trainium and Inferentia Accelerators.
 * Prefix caching support
 * Multi-lora support

@@ -66,6 +66,7 @@ Documentation
   getting_started/amd-installation
   getting_started/openvino-installation
   getting_started/cpu-installation
+   getting_started/gaudi-installation
   getting_started/neuron-installation
   getting_started/tpu-installation
   getting_started/xpu-installation

--- a/requirements-hpu.txt
+++ b/requirements-hpu.txt
+# Common dependencies
+-r requirements-common.txt
+
+# Dependencies for HPU code
+ray
+triton
+pandas
+tabulate
+setuptools>=61
+setuptools-scm>=8
+vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@fd7f2e6
--- a/setup.py
+++ b/setup.py
@@ -253,6 +253,24 @@ class cmake_build_ext(build_ext):
            self.copy_file(file, dst_file)


+def _is_hpu() -> bool:
+    is_hpu_available = True
+    try:
+        subprocess.run(["hl-smi"], capture_output=True, check=True)
+    except (FileNotFoundError, PermissionError, subprocess.CalledProcessError):
+        if not os.path.exists('/dev/accel/accel0') and not os.path.exists(
+                '/dev/accel/accel_controlD0'):
+            # last resort...
+            try:
+                output = subprocess.check_output(
+                    'lsmod | grep habanalabs | wc -l', shell=True)
+                is_hpu_available = int(output) > 0
+            except (ValueError, FileNotFoundError, PermissionError,
+                    subprocess.CalledProcessError):
+                is_hpu_available = False
+    return is_hpu_available or VLLM_TARGET_DEVICE == "hpu"
+
+
 def _no_device() -> bool:
    return VLLM_TARGET_DEVICE == "empty"

@@ -260,7 +278,7 @@ def _no_device() -> bool:
 def _is_cuda() -> bool:
    has_cuda = torch.version.cuda is not None
    return (VLLM_TARGET_DEVICE == "cuda" and has_cuda
-            and not (_is_neuron() or _is_tpu()))
+            and not (_is_neuron() or _is_tpu() or _is_hpu()))


 def _is_hip() -> bool:
@@ -356,6 +374,23 @@ def get_path(*filepath) -> str:
    return os.path.join(ROOT_DIR, *filepath)


+def get_gaudi_sw_version():
+    """
+    Returns the driver version.
+    """
+    # Enable console printing for `hl-smi` check
+    output = subprocess.run("hl-smi",
+                            shell=True,
+                            text=True,
+                            stdout=subprocess.PIPE,
+                            stderr=subprocess.PIPE,
+                            env={"ENABLE_CONSOLE": "true"})
+    if output.returncode == 0 and output.stdout:
+        return output.stdout.split("\n")[2].replace(
+            " ", "").split(":")[1][:-1].split("-")[0]
+    return "0.0.0"  # when hl-smi is not available
+
+
 def get_vllm_version() -> str:
    version = get_version(
        write_to="vllm/_version.py",  # TODO: move this to pyproject.toml
@@ -385,6 +420,12 @@ def get_vllm_version() -> str:
        if neuron_version != MAIN_CUDA_VERSION:
            neuron_version_str = neuron_version.replace(".", "")[:3]
            version += f"{sep}neuron{neuron_version_str}"
+    elif _is_hpu():
+        # Get the Intel Gaudi Software Suite version
+        gaudi_sw_version = str(get_gaudi_sw_version())
+        if gaudi_sw_version != MAIN_CUDA_VERSION:
+            gaudi_sw_version = gaudi_sw_version.replace(".", "")[:3]
+            version += f"{sep}gaudi{gaudi_sw_version}"
    elif _is_openvino():
        version += f"{sep}openvino"
    elif _is_tpu():
@@ -443,6 +484,8 @@ def get_requirements() -> List[str]:
        requirements = _read_requirements("requirements-rocm.txt")
    elif _is_neuron():
        requirements = _read_requirements("requirements-neuron.txt")
+    elif _is_hpu():
+        requirements = _read_requirements("requirements-hpu.txt")
    elif _is_openvino():
        requirements = _read_requirements("requirements-openvino.txt")
    elif _is_tpu():
@@ -453,7 +496,7 @@ def get_requirements() -> List[str]:
        requirements = _read_requirements("requirements-xpu.txt")
    else:
        raise ValueError(
-            "Unsupported platform, please use CUDA, ROCm, Neuron, "
+            "Unsupported platform, please use CUDA, ROCm, Neuron, HPU, "
            "OpenVINO, or CPU.")
    return requirements


--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -12,7 +12,7 @@ from vllm.scalar_type import ScalarType

 logger = init_logger(__name__)

-if not current_platform.is_tpu():
+if not current_platform.is_tpu() and not current_platform.is_hpu():
    try:
        import vllm._C
    except ImportError as e:

--- a/vllm/attention/backends/hpu_attn.py
+++ b/vllm/attention/backends/hpu_attn.py
+###############################################################################
+# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company
+###############################################################################
+
+import os
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple, Type
+
+import torch
+import vllm_hpu_extension.ops as ops
+from vllm_hpu_extension.utils import Matmul, Softmax, VLLMKVCache
+
+from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
+                                              AttentionMetadata, AttentionType)
+from vllm.attention.backends.utils import CommonAttentionState
+from vllm.attention.ops.hpu_paged_attn import (HPUPagedAttention,
+                                               HPUPagedAttentionMetadata)
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+class HPUAttentionBackend(AttentionBackend):
+
+    @staticmethod
+    def get_impl_cls() -> Type["HPUAttentionImpl"]:
+        return HPUAttentionImpl
+
+    @staticmethod
+    def get_metadata_cls() -> Type["AttentionMetadata"]:
+        return HPUAttentionMetadata
+
+    @staticmethod
+    def get_state_cls() -> Type["CommonAttentionState"]:
+        return CommonAttentionState
+
+    @staticmethod
+    def get_kv_cache_shape(
+        num_blocks: int,
+        block_size: int,
+        num_kv_heads: int,
+        head_size: int,
+    ) -> Tuple[int, ...]:
+        return HPUPagedAttention.get_kv_cache_shape(num_blocks, block_size,
+                                                    num_kv_heads, head_size)
+
+    @staticmethod
+    def swap_blocks(
+        src_kv_cache: torch.Tensor,
+        dst_kv_cache: torch.Tensor,
+        src_to_dst: Dict[int, int],
+    ) -> None:
+        HPUPagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dst)
+
+    @staticmethod
+    def copy_blocks(
+        kv_caches: List[torch.Tensor],
+        src_to_dists: Dict[int, List[int]],
+    ) -> None:
+        HPUPagedAttention.copy_blocks(kv_caches, src_to_dists)
+
+
+@dataclass
+class HPUAttentionMetadata(HPUPagedAttentionMetadata, AttentionMetadata):
+    """Metadata for HPUAttentionbackend."""
+    # Currently, input sequences can only contain all prompts
+    # or all decoding. True if all sequences are prompts.
+    is_prompt: bool
+    attn_bias: Optional[torch.Tensor]
+    seq_lens_tensor: Optional[torch.Tensor]
+
+
+class HPUAttentionImpl(AttentionImpl, torch.nn.Module):
+    """
+    If the input tensors contain prompt tokens, the layout is as follows:
+    |<--------------- num_prefill_tokens ----------------->|
+    |<--prefill_0-->|<--prefill_1-->|...|<--prefill_N-1--->|
+
+    Otherwise, the layout is as follows:
+    |<----------------- num_decode_tokens ------------------>|
+    |<--decode_0-->|..........|<--decode_M-1-->|<--padding-->|
+
+    Generation tokens can contain padding when cuda-graph is used.
+    Currently, prompt tokens don't contain any padding.
+
+    The prompts might have different lengths, while the generation tokens
+    always have length 1.
+    """
+
+    def __init__(
+        self,
+        num_heads: int,
+        head_size: int,
+        scale: float,
+        num_kv_heads: int,
+        alibi_slopes: Optional[List[float]],
+        sliding_window: Optional[int],
+        kv_cache_dtype: str,
+        blocksparse_params: Optional[Dict[str, Any]] = None,
+        max_seq_len: int = 4096,
+    ) -> None:
+        super(AttentionImpl, self).__init__()
+        self.kv_cache_dtype = kv_cache_dtype
+        self.num_heads = num_heads
+        self.head_size = head_size
+        self.scale = float(scale)
+        self.matmul_qk = Matmul()
+        self.softmax = Softmax()
+        self.matmul_av = Matmul()
+        self.k_cache = VLLMKVCache()
+        self.v_cache = VLLMKVCache()
+        self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads
+        self.sliding_window = sliding_window
+        self.alibi_slopes = alibi_slopes
+        if alibi_slopes is not None:
+            alibi_slopes_tensor = torch.tensor(alibi_slopes,
+                                               dtype=torch.bfloat16)
+            self.alibi_slopes = alibi_slopes_tensor
+        assert self.num_heads % self.num_kv_heads == 0
+        self.num_queries_per_kv = self.num_heads // self.num_kv_heads
+
+        self.prefill_usefusedsdpa = os.getenv('VLLM_PROMPT_USE_FUSEDSDPA',
+                                              '0').lower() in ['1', 'true']
+        if self.prefill_usefusedsdpa:
+            assert alibi_slopes is None, \
+                'Prefill with FusedSDPA not supported with alibi slopes!'
+
+        suppored_head_sizes = HPUPagedAttention.get_supported_head_sizes()
+        if head_size not in suppored_head_sizes:
+            raise ValueError(
+                f"Head size {head_size} is not supported by PagedAttention. "
+                f"Supported head sizes are: {suppored_head_sizes}.")
+
+    def forward(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        kv_cache: torch.Tensor,
+        attn_metadata: HPUAttentionMetadata,
+        k_scale: float = 1.0,
+        v_scale: float = 1.0,
+        attn_type: AttentionType = AttentionType.DECODER,
+    ) -> torch.Tensor:
+        """Forward pass with xFormers and PagedAttention.
+
+        Args:
+            query: shape = [num_tokens, num_heads * head_size]
+            key: shape = [num_tokens, num_kv_heads * head_size]
+            value: shape = [num_tokens, num_kv_heads * head_size]
+            kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size]
+            attn_metadata: Metadata for attention.
+        Returns:
+            shape = [num_tokens, num_heads * head_size]
+        """
+        if attn_type != AttentionType.DECODER:
+            raise NotImplementedError("Encoder self-attention and "
+                                      "encoder/decoder cross-attention "
+                                      "are not implemented for "
+                                      "HPUAttentionImpl")
+        batch_size, seq_len, hidden_size = query.shape
+        _, seq_len_kv, _ = key.shape
+
+        query = query.view(-1, self.num_heads, self.head_size)
+        key = key.view(-1, self.num_kv_heads, self.head_size)
+        value = value.view(-1, self.num_kv_heads, self.head_size)
+        block_indices = attn_metadata.block_indices
+        block_offsets = attn_metadata.block_offsets
+        if attn_metadata.is_prompt:
+            key = key.unflatten(0, (block_indices.size(0), -1))
+            value = value.unflatten(0, (block_indices.size(0), -1))
+        if kv_cache is not None:
+            key_cache, value_cache = HPUPagedAttention.split_kv_cache(
+                kv_cache, self.num_kv_heads, self.head_size)
+
+            # Reshape the input keys and values and store them in the cache.
+            # If kv_cache is not provided, the new key and value tensors are
+            # not cached. This happens during the initial memory profiling run.
+            key_cache = self.k_cache(key, key_cache, block_indices,
+                                     block_offsets)
+            value_cache = self.v_cache(value, value_cache, block_indices,
+                                       block_offsets)
+
+        if attn_metadata.is_prompt:
+            # Prompt run.
+            if not self.prefill_usefusedsdpa:
+                # TODO: move this outside of model
+                assert attn_metadata.attn_bias is not None, \
+                        'attn_bias must be set before calling model.forward!'
+                attn_bias = attn_metadata.attn_bias
+                if self.alibi_slopes is not None:
+                    position_bias = _make_alibi_bias(self.alibi_slopes,
+                                                     self.num_kv_heads,
+                                                     attn_bias.dtype,
+                                                     attn_bias.shape[-1])
+                    attn_bias = attn_bias.tile((1, self.num_kv_heads, 1, 1))
+                    attn_bias.add_(position_bias)
+            else:
+                attn_bias = None
+
+            query_shape = (batch_size, seq_len, self.num_heads, self.head_size)
+            kv_shape = (batch_size, seq_len_kv, self.num_kv_heads,
+                        self.head_size)
+            out = ops.prompt_attention(
+                query.view(query_shape),
+                key.view(kv_shape),
+                value.view(kv_shape),
+                attn_bias=attn_bias,
+                p=0.0,
+                scale=self.scale,
+                matmul_qk_op=self.matmul_qk,
+                softmax_op=self.softmax,
+                matmul_av_op=self.matmul_av,
+            )
+            output = out.reshape(batch_size, seq_len, hidden_size)
+        else:
+            # Decoding run.
+            output = HPUPagedAttention.forward_decode(
+                query=query,
+                key_cache=key_cache,
+                value_cache=value_cache,
+                block_list=attn_metadata.block_list,
+                block_mapping=attn_metadata.block_mapping,
+                block_bias=attn_metadata.attn_bias,
+                block_scales=attn_metadata.block_scales,
+                scale=self.scale,
+                matmul_qk_op=self.matmul_qk,
+                matmul_av_op=self.matmul_av,
+                keys_fetch_func=self.k_cache.fetch_from_cache,
+                values_fetch_func=self.v_cache.fetch_from_cache)
+        # Reshape the output tensor.
+        return output.view(batch_size, seq_len, hidden_size)
+
+
+def _make_alibi_bias(
+    alibi_slopes: torch.Tensor,
+    num_kv_heads: int,
+    dtype: torch.dtype,
+    seq_len: int,
+) -> torch.Tensor:
+    bias = torch.arange(seq_len, dtype=dtype)
+    # NOTE(zhuohan): HF uses
+    #     `bias = bias[None, :].repeat(seq_len, 1)`
+    # here. We find that both biases give the same results, but
+    # the bias below more accurately follows the original ALiBi
+    # paper.
+    # Calculate a matrix where each element represents ith element- jth
+    # element.
+    bias = bias[None, :] - bias[:, None]
+
+    padded_len = (seq_len + 7) // 8 * 8
+    num_heads = alibi_slopes.shape[0]
+    bias = torch.empty(
+        1,  # batch size
+        num_heads,
+        seq_len,
+        padded_len,
+        device=alibi_slopes.device,
+        dtype=dtype,
+    )[:, :, :, :seq_len].copy_(bias)
+    bias.mul_(alibi_slopes[:, None, None])
+    if num_heads != num_kv_heads:
+        bias = bias.unflatten(1, (num_kv_heads, num_heads // num_kv_heads))
+    return bias
--- a/vllm/attention/ops/hpu_paged_attn.py
+++ b/vllm/attention/ops/hpu_paged_attn.py
+###############################################################################
+# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company
+###############################################################################
+
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Tuple
+
+import torch
+from vllm_hpu_extension import cache_ops, ops
+
+# Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`.
+_PARTITION_SIZE = 512
+
+
+@dataclass
+class HPUPagedAttentionMetadata:
+    """Metadata for PagedAttention."""
+    block_list: Optional[torch.Tensor]
+    block_mapping: Optional[torch.Tensor]
+    block_usage: Optional[torch.Tensor]
+    block_indices: Optional[torch.Tensor]
+    block_offsets: Optional[torch.Tensor]
+    block_scales: Optional[torch.Tensor]
+
+
+class HPUPagedAttention:
+
+    @staticmethod
+    def get_supported_head_sizes() -> List[int]:
+        return [64, 80, 96, 112, 128, 256]
+
+    @staticmethod
+    def get_kv_cache_shape(
+        num_blocks: int,
+        block_size: int,
+        num_kv_heads: int,
+        head_size: int,
+    ) -> Tuple[int, ...]:
+        return (num_blocks, block_size, num_kv_heads, head_size)
+
+    @staticmethod
+    def split_kv_cache(
+        kv_cache: torch.Tensor,
+        num_kv_heads: int,
+        head_size: int,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        key_cache = kv_cache[0]
+        value_cache = kv_cache[1]
+        return key_cache, value_cache
+
+    @staticmethod
+    def write_to_paged_cache(key: torch.Tensor, value: torch.Tensor,
+                             key_cache: torch.Tensor,
+                             value_cache: torch.Tensor,
+                             slot_mapping: torch.Tensor, kv_cache_dtype: str,
+                             is_prompt: bool) -> None:
+        cache_ops.reshape_and_cache(key, value, key_cache, value_cache,
+                                    slot_mapping, kv_cache_dtype, is_prompt)
+
+    @staticmethod
+    def forward_decode(**kwargs) -> torch.Tensor:
+        return ops.flat_pa(**kwargs)
+
+    @staticmethod
+    def forward_prefix(
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        key_cache: torch.Tensor,
+        value_cache: torch.Tensor,
+        block_tables: torch.Tensor,
+        subquery_start_loc: torch.Tensor,
+        seq_lens_tensor: torch.Tensor,
+        context_lens: torch.Tensor,
+        max_query_len: int,
+        alibi_slopes: Optional[torch.Tensor],
+        sliding_window: Optional[int],
+    ) -> torch.Tensor:
+        raise NotImplementedError(
+            "forward_prefix is not implemented for HPUPagedAttention")
+
+    @staticmethod
+    def swap_blocks(
+        src_kv_cache: torch.Tensor,
+        dst_kv_cache: torch.Tensor,
+        src_to_dst: Dict[int, int],
+    ) -> None:
+        src_key_cache = src_kv_cache[0]
+        dst_key_cache = dst_kv_cache[0]
+        cache_ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dst)
+
+        src_value_cache = src_kv_cache[1]
+        dst_value_cache = dst_kv_cache[1]
+        cache_ops.swap_blocks(src_value_cache, dst_value_cache, src_to_dst)
+
+    @staticmethod
+    def copy_blocks(
+        kv_caches: List[torch.Tensor],
+        src_to_dists: Dict[int, List[int]],
+    ) -> None:
+        key_caches = [kv_cache[0] for kv_cache in kv_caches]
+        value_caches = [kv_cache[1] for kv_cache in kv_caches]
+        cache_ops.copy_blocks(key_caches, value_caches, src_to_dists)
--- a/vllm/attention/selector.py
+++ b/vllm/attention/selector.py
@@ -23,6 +23,7 @@ class _Backend(enum.Enum):
    TORCH_SDPA = enum.auto()
    OPENVINO = enum.auto()
    FLASHINFER = enum.auto()
+    HPU_ATTN = enum.auto()
    PALLAS = enum.auto()
    IPEX = enum.auto()
    NO_ATTENTION = enum.auto()
@@ -145,6 +146,10 @@ def get_attn_backend(
        logger.info("Using Flashinfer backend.")
        from vllm.attention.backends.flashinfer import FlashInferBackend
        return FlashInferBackend
+    elif backend == _Backend.HPU_ATTN:
+        logger.info("Using HPUAttention backend.")
+        from vllm.attention.backends.hpu_attn import HPUAttentionBackend
+        return HPUAttentionBackend
    elif backend == _Backend.PALLAS:
        logger.info("Using Pallas backend.")
        from vllm.attention.backends.pallas import PallasAttentionBackend
@@ -220,6 +225,9 @@ def which_attn_to_use(
            logger.info("%s is not supported in AMD GPUs.", selected_backend)
        return _Backend.ROCM_FLASH

+    if current_platform.is_hpu():
+        return _Backend.HPU_ATTN
+
    if envs.VLLM_USE_V1:
        return _Backend.FLASH_ATTN_VLLM_V1


--- a/vllm/config.py
+++ b/vllm/config.py
@@ -466,9 +466,10 @@ class ModelConfig:

        # Reminder: Please update docs/source/serving/compatibility_matrix.rst
        # If the feature combo become valid
-        if device_config.device_type not in ("cuda", "tpu", "xpu"):
+        if device_config.device_type not in ("cuda", "tpu", "xpu", "hpu"):
            logger.warning(
-                "Async output processing is only supported for CUDA, TPU, XPU. "
+                "Async output processing is only supported for CUDA, TPU, XPU "
+                "and HPU."
                "Disabling it for other platforms.")
            self.use_async_output_proc = False
            return
@@ -860,7 +861,6 @@ class LoadConfig:
        ignore_patterns: The list of patterns to ignore when loading the model.
            Default to "original/**/*" to avoid repeated loading of llama's
            checkpoints.
-
    """

    load_format: Union[str, LoadFormat, "BaseModelLoader"] = LoadFormat.AUTO
@@ -964,6 +964,13 @@ class ParallelConfig:
                raise ValueError(
                    "TPU backend only supports Ray for distributed inference.")

+        if current_platform.is_hpu() and self.world_size > 1:
+            if self.distributed_executor_backend is None:
+                self.distributed_executor_backend = "ray"
+            if self.distributed_executor_backend != "ray":
+                raise ValueError(
+                    "HPU backend only supports Ray for distributed inference.")
+
        if self.distributed_executor_backend is None and self.world_size > 1:
            # We use multiprocessing by default if world_size fits on the
            # current node and we aren't in a ray placement group.
@@ -1166,6 +1173,8 @@ class DeviceConfig:
                self.device_type = "cuda"
            elif current_platform.is_neuron():
                self.device_type = "neuron"
+            elif current_platform.is_hpu():
+                self.device_type = "hpu"
            elif current_platform.is_openvino():
                self.device_type = "openvino"
            elif current_platform.is_tpu():
@@ -1745,6 +1754,13 @@ def _get_and_verify_dtype(
                    torch_dtype = torch.float16
            else:
                torch_dtype = config_dtype
+
+            if current_platform.is_hpu() and config_dtype == torch.float16:
+                logger.info(
+                    "For HPU, we cast models to bfloat16 instead of"
+                    "using float16 by default. Please specify `dtype` if you "
+                    "want to use float16.")
+                torch_dtype = torch.bfloat16
        else:
            if dtype not in _STR_DTYPE_TO_TORCH_DTYPE:
                raise ValueError(f"Unknown dtype: {dtype}")

--- a/vllm/core/block/cpu_gpu_block_allocator.py
+++ b/vllm/core/block/cpu_gpu_block_allocator.py
@@ -4,6 +4,7 @@ from vllm.core.block.interfaces import (Block, BlockAllocator, BlockId,
                                        DeviceAwareBlockAllocator)
 from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator
 from vllm.core.block.prefix_caching_block import PrefixCachingBlockAllocator
+from vllm.platforms import current_platform
 from vllm.utils import Device


@@ -52,7 +53,11 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
            - The block IDs are assigned contiguously, with GPU block IDs coming
                before CPU block IDs.
        """
-        block_ids = list(range(num_gpu_blocks + num_cpu_blocks))
+        # For HPU, block id 0 is used only for padding
+        reserved_blocks = 1 if current_platform.is_hpu() else 0
+        block_ids = list(
+            range(reserved_blocks, num_gpu_blocks + num_cpu_blocks))
+        num_gpu_blocks -= reserved_blocks
        gpu_block_ids = block_ids[:num_gpu_blocks]
        cpu_block_ids = block_ids[num_gpu_blocks:]


--- a/vllm/distributed/device_communicators/hpu_communicator.py
+++ b/vllm/distributed/device_communicators/hpu_communicator.py
+import torch
+import torch.distributed as dist
+from torch.distributed import ProcessGroup
+
+from vllm.platforms import current_platform
+
+if current_platform.is_hpu():
+    import habana_frameworks.torch as htorch  # noqa: F401
+
+
+class HpuCommunicator:
+
+    def __init__(self, group: ProcessGroup):
+        if not current_platform.is_hpu():
+            self.disabled = True
+            return
+        self.disabled = False
+        self.group = group
+        self.world_size = dist.get_world_size(self.group)
+
+    def all_reduce(self, x: torch.Tensor) -> torch.Tensor:
+        # FIXME(kzawora): this is a workaround for a bug in Habana PT bridge
+        # occurring when PT_HPU_ENABLE_LAZY_COLLECTIVES=true env var is used
+        # (which is required for tensor parallel HPUGraph inference)
+        htorch.core.mark_step()
+        dist.all_reduce(x, group=self.group)
+        return x
+
+    def all_gather(self, x: torch.Tensor, dim: int = -1) -> torch.Tensor:
+        world_size = self.world_size
+        if dim < 0:
+            # Convert negative dim to positive.
+            dim += x.dim()
+        input_size = x.size()
+        # Allocate output tensor.
+        output_tensor = torch.empty((world_size, ) + input_size,
+                                    dtype=x.dtype,
+                                    device=x.device)
+        # All-gather.
+        htorch.core.mark_step()
+        dist.all_gather_into_tensor(output_tensor, x, group=self.group)
+        # Reshape
+        output_tensor = output_tensor.movedim(0, dim)
+        output_tensor = output_tensor.reshape(input_size[:dim] +
+                                              (world_size *
+                                               input_size[dim], ) +
+                                              input_size[dim + 1:])
+        return output_tensor
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -177,6 +177,7 @@ class GroupCoordinator:
        use_pynccl: bool,
        use_custom_allreduce: bool,
        use_tpu_communicator: bool,
+        use_hpu_communicator: bool,
        use_message_queue_broadcaster: bool = False,
        group_name: Optional[str] = None,
    ):
@@ -213,6 +214,7 @@ class GroupCoordinator:
        self.use_pynccl = use_pynccl
        self.use_custom_allreduce = use_custom_allreduce
        self.use_tpu_communicator = use_tpu_communicator
+        self.use_hpu_communicator = use_hpu_communicator

        # lazy import to avoid documentation build error
        from vllm.distributed.device_communicators.custom_all_reduce import (
@@ -241,6 +243,12 @@ class GroupCoordinator:
        if use_tpu_communicator and self.world_size > 1:
            self.tpu_communicator = TpuCommunicator(group=self.cpu_group)

+        from vllm.distributed.device_communicators.hpu_communicator import (
+            HpuCommunicator)
+        self.hpu_communicator: Optional[HpuCommunicator]
+        if use_hpu_communicator and self.world_size > 1:
+            self.hpu_communicator = HpuCommunicator(group=self.device_group)
+
        from vllm.distributed.device_communicators.shm_broadcast import (
            MessageQueue)
        self.mq_broadcaster: Optional[MessageQueue] = None
@@ -362,6 +370,10 @@ class GroupCoordinator:
            # TPU handles Dynamo with its own logic.
            return self.tpu_communicator.all_reduce(input_)

+        if self.hpu_communicator is not None and \
+            not self.hpu_communicator.disabled:
+            return self.hpu_communicator.all_reduce(input_)
+
        if self.ca_comm is not None and \
            not self.ca_comm.disabled and \
                self.ca_comm.should_custom_ar(input_):
@@ -400,6 +412,11 @@ class GroupCoordinator:
        if tpu_comm is not None and not tpu_comm.disabled:
            return tpu_comm.all_gather(input_, dim)

+        # For HPUs, use HPU communicator.
+        hpu_comm = self.hpu_communicator
+        if hpu_comm is not None and not hpu_comm.disabled:
+            return hpu_comm.all_gather(input_, dim)
+
        if dim < 0:
            # Convert negative dim to positive.
            dim += input_.dim()
@@ -879,6 +896,7 @@ def init_world_group(ranks: List[int], local_rank: int,
        use_pynccl=False,
        use_custom_allreduce=False,
        use_tpu_communicator=False,
+        use_hpu_communicator=False,
        group_name="world",
    )

@@ -900,6 +918,7 @@ def init_model_parallel_group(
        use_pynccl=True,
        use_custom_allreduce=use_custom_allreduce,
        use_tpu_communicator=True,
+        use_hpu_communicator=True,
        use_message_queue_broadcaster=use_message_queue_broadcaster,
        group_name=group_name,
    )

--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -17,6 +17,7 @@ from vllm.config import (CacheConfig, ConfigFormat, DecodingConfig,
 from vllm.executor.executor_base import ExecutorBase
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+from vllm.platforms import current_platform
 from vllm.transformers_utils.config import (
    maybe_register_config_serialize_by_value)
 from vllm.transformers_utils.utils import check_gguf_file
@@ -37,6 +38,7 @@ DEVICE_OPTIONS = [
    "openvino",
    "tpu",
    "xpu",
+    "hpu",
 ]


@@ -110,7 +112,9 @@ class EngineArgs:
    pipeline_parallel_size: int = 1
    tensor_parallel_size: int = 1
    max_parallel_loading_workers: Optional[int] = None
-    block_size: int = 16
+    # NOTE(kzawora): default block size for Gaudi should be 128
+    # smaller sizes still work, but very inefficiently
+    block_size: int = 16 if not current_platform.is_hpu() else 128
    enable_prefix_caching: bool = False
    disable_sliding_window: bool = False
    use_v2_block_manager: bool = True
@@ -397,7 +401,7 @@ class EngineArgs:
        parser.add_argument('--block-size',
                            type=int,
                            default=EngineArgs.block_size,
-                            choices=[8, 16, 32],
+                            choices=[8, 16, 32, 64, 128],
                            help='Token block size for contiguous chunks of '
                            'tokens. This is ignored on neuron devices and '
                            'set to max-model-len')
@@ -1132,8 +1136,7 @@ class EngineArgs:
            multi_step_stream_outputs=self.multi_step_stream_outputs,
            send_delta_data=(envs.VLLM_USE_RAY_SPMD_WORKER
                             and parallel_config.use_ray),
-            policy=self.scheduling_policy,
-        )
+            policy=self.scheduling_policy)
        lora_config = LoRAConfig(
            max_lora_rank=self.max_lora_rank,
            max_loras=self.max_loras,

--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -627,6 +627,14 @@ class AsyncLLMEngine(EngineClient):
        elif engine_config.device_config.device_type == "cpu":
            from vllm.executor.cpu_executor import CPUExecutorAsync
            executor_class = CPUExecutorAsync
+        elif engine_config.device_config.device_type == "hpu":
+            if distributed_executor_backend == "ray":
+                initialize_ray_cluster(engine_config.parallel_config)
+                from vllm.executor.ray_hpu_executor import RayHPUExecutorAsync
+                executor_class = RayHPUExecutorAsync
+            else:
+                from vllm.executor.hpu_executor import HPUExecutorAsync
+                executor_class = HPUExecutorAsync
        elif engine_config.device_config.device_type == "openvino":
            assert distributed_executor_backend is None, (
                "Distributed execution is not supported with "

--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -528,6 +528,14 @@ class LLMEngine:
        elif engine_config.device_config.device_type == "cpu":
            from vllm.executor.cpu_executor import CPUExecutor
            executor_class = CPUExecutor
+        elif engine_config.device_config.device_type == "hpu":
+            if distributed_executor_backend == "ray":
+                initialize_ray_cluster(engine_config.parallel_config)
+                from vllm.executor.ray_hpu_executor import RayHPUExecutor
+                executor_class = RayHPUExecutor
+            else:
+                from vllm.executor.hpu_executor import HPUExecutor
+                executor_class = HPUExecutor
        elif engine_config.device_config.device_type == "openvino":
            from vllm.executor.openvino_executor import OpenVINOExecutor
            executor_class = OpenVINOExecutor

--- a/vllm/executor/hpu_executor.py
+++ b/vllm/executor/hpu_executor.py
+###############################################################################
+# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company
+###############################################################################
+
+import contextlib
+import os
+from typing import Any, Dict, List, Optional, Set, Tuple
+
+from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
+from vllm.logger import init_logger
+from vllm.lora.request import LoRARequest
+from vllm.model_executor.layers.sampler import SamplerOutput
+from vllm.prompt_adapter.request import PromptAdapterRequest
+from vllm.sequence import ExecuteModelRequest
+from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
+                        make_async)
+from vllm.worker.worker_base import WorkerWrapperBase
+
+logger = init_logger(__name__)
+
+
+class HPUExecutor(ExecutorBase):
+
+    uses_ray: bool = False
+
+    def _init_executor(self) -> None:
+        """Initialize the worker and load the model."""
+        self._init_worker()
+
+    def _get_worker_kwargs(
+            self,
+            local_rank: int = 0,
+            rank: int = 0,
+            distributed_init_method: Optional[str] = None) -> Dict[str, Any]:
+        """Return worker init args for a given rank."""
+        if distributed_init_method is None:
+            distributed_init_method = get_distributed_init_method(
+                get_ip(), get_open_port())
+        return dict(
+            vllm_config=self.vllm_config,
+            local_rank=local_rank,
+            rank=rank,
+            distributed_init_method=distributed_init_method,
+            is_driver_worker=rank == 0,
+        )
+
+    def _create_worker(self,
+                       local_rank: int = 0,
+                       rank: int = 0,
+                       distributed_init_method: Optional[str] = None):
+        wrapper = WorkerWrapperBase(
+            worker_module_name="vllm.worker.hpu_worker",
+            worker_class_name="HPUWorker",
+        )
+        wrapper.init_worker(**self._get_worker_kwargs(local_rank, rank,
+                                                      distributed_init_method))
+        return wrapper.worker
+
+    def _init_worker(self):
+        assert self.parallel_config.world_size == 1, (
+            "GPUExecutor only supports single GPU.")
+
+        self.driver_worker = self._create_worker()
+        self.driver_worker.init_device()
+        self.driver_worker.load_model()
+
+    def determine_num_available_blocks(self) -> Tuple[int, int]:
+        """Determine the number of available KV blocks by invoking the
+        underlying worker.
+        """
+        return self.driver_worker.determine_num_available_blocks()
+
+    def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None:
+        """Initialize the KV cache by invoking the underlying worker.
+        """
+        # NOTE: This is logged in the executor because there can be >1 worker
+        # with other executors. We could log in the engine level, but work
+        # remains to abstract away the device for non-GPU configurations.
+        logger.info("# HPU blocks: %d, # CPU blocks: %d", num_gpu_blocks,
+                    num_cpu_blocks)
+        from vllm_hpu_extension.profiler import HabanaMemoryProfiler
+        with HabanaMemoryProfiler() as cache_init_m:
+            self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)
+        msg = f"init_cache_engine took {cache_init_m.get_summary_string()}"
+        logger.info(msg)
+
+    def finish_measurements(self):
+        self.driver_worker.finish_measurements()
+
+    def execute_model(
+            self,
+            execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
+        # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION     - will log graph compilations per engine step, only when there was any - highly recommended to use alongside PT_HPU_METRICS_GC_DETAILS! # noqa:E501
+        # VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL - will log graph compilations per engine step, always, even if there were none # noqa:E501
+        # VLLM_HPU_LOG_STEP_CPU_FALLBACKS         - will log cpu fallbacks per engine step, only when there was any # noqa:E501
+        # VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL     - will log cpu fallbacks per engine step, always, even if there were none # noqa:E501
+        log_graph_compilation_all = os.environ.get(
+            'VLLM_HPU_LOG_STEP_GRAPH_COMPILATION_ALL', '0') != '0'
+        log_graph_compilation = os.environ.get(
+            'VLLM_HPU_LOG_STEP_GRAPH_COMPILATION',
+            '0') != '0' or log_graph_compilation_all
+        log_cpu_fallbacks_all = os.environ.get(
+            'VLLM_HPU_LOG_STEP_CPU_FALLBACKS_ALL', '0') != '0'
+        log_cpu_fallbacks = os.environ.get('VLLM_HPU_LOG_STEP_CPU_FALLBACKS',
+                                           '0') != '0' or log_cpu_fallbacks_all
+        if log_graph_compilation or log_cpu_fallbacks:
+            from habana_frameworks.torch.hpu.metrics import metric_localcontext
+            seq_group_metadata_list = execute_model_req.seq_group_metadata_list
+            is_prompt = any([
+                seq_group_metadata.is_prompt
+                for seq_group_metadata in seq_group_metadata_list
+            ])
+            max_context_len = max([
+                max([
+                    len(v.prompt_token_ids) + len(v.output_token_ids)
+                    for v in seq_group_metadata.seq_data.values()
+                ]) for seq_group_metadata in seq_group_metadata_list
+            ])  # whoa, that's some spicy stuff right here
+            max_num_blocks = (
+                (max_context_len - 1) // self.cache_config.block_size) + 1
+            input_stats = (f'is_prompt: {is_prompt}, '
+                           f'num_seqs: {len(seq_group_metadata_list)}, '
+                           f'max_context_len: {max_context_len}, '
+                           f'max_num_blocks {max_num_blocks}')
+            gc_ctx = metric_localcontext(
+                "graph_compilation"
+            ) if log_graph_compilation else contextlib.nullcontext()
+            cpu_fallback_ctx = metric_localcontext(
+                "cpu_fallback"
+            ) if log_cpu_fallbacks else contextlib.nullcontext()
+            with gc_ctx as gc_local_metric, \
+                cpu_fallback_ctx as cpu_fallback_local_metric:
+                output = self.driver_worker.execute_model(execute_model_req)
+            if (log_graph_compilation and gc_local_metric.stats()[0][1] > 0
+                ) or log_graph_compilation_all:
+                msg = ("VLLM_HPU_STEP_GRAPH_COMPILATION: "
+                       f"{gc_local_metric.stats()}, {input_stats}")
+                logger.warning(msg)
+            if (log_cpu_fallbacks and cpu_fallback_local_metric.stats()[0][1] >
+                    0) or log_cpu_fallbacks_all:
+                msg = ("VLLM_HPU_STEP_CPU_FALLBACK: "
+                       f"{cpu_fallback_local_metric.stats()}, {input_stats}")
+                logger.warning(msg)
+
+            return output
+
+        output = self.driver_worker.execute_model(execute_model_req)
+        return output
+
+    def add_lora(self, lora_request: LoRARequest) -> bool:
+        assert lora_request.lora_int_id > 0, "lora_id must be greater than 0."
+        return self.driver_worker.add_lora(lora_request)
+
+    def remove_lora(self, lora_id: int) -> bool:
+        assert lora_id > 0, "lora_id must be greater than 0."
+        return self.driver_worker.remove_lora(lora_id)
+
+    def pin_lora(self, lora_id: int) -> bool:
+        assert lora_id > 0, "lora_id must be greater than 0."
+        return self.driver_worker.pin_lora(lora_id)
+
+    def list_loras(self) -> Set[int]:
+        return self.driver_worker.list_loras()
+
+    def add_prompt_adapter(
+            self, prompt_adapter_request: PromptAdapterRequest) -> bool:
+        raise NotImplementedError(
+            "Prompt Adapter is not implemented for HPU backend.")
+
+    def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool:
+        raise NotImplementedError(
+            "Prompt Adapter is not implemented for HPU backend.")
+
+    def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool:
+        raise NotImplementedError(
+            "Prompt Adapter is not implemented for HPU backend.")
+
+    def list_prompt_adapters(self) -> Set[int]:
+        raise NotImplementedError(
+            "Prompt Adapter is not implemented for HPU backend.")
+
+    def check_health(self) -> None:
+        # GPUExecutor will always be healthy as long as
+        # it's running.
+        return
+
+    def start_profile(self) -> None:
+        self.driver_worker.start_profile()
+
+    def stop_profile(self) -> None:
+        self.driver_worker.stop_profile()
+
+    def shutdown(self) -> None:
+        self.driver_worker.shutdown_inc()
+
+
+class HPUExecutorAsync(HPUExecutor, ExecutorAsyncBase):
+
+    async def execute_model_async(
+        self,
+        execute_model_req: ExecuteModelRequest,
+    ) -> List[SamplerOutput]:
+        output = await make_async(self.driver_worker.execute_model
+                                  )(execute_model_req=execute_model_req, )
+        return output
--- a/vllm/executor/ray_hpu_executor.py
+++ b/vllm/executor/ray_hpu_executor.py
--- a/vllm/executor/ray_utils.py
+++ b/vllm/executor/ray_utils.py
@@ -249,7 +249,11 @@ def initialize_ray_cluster(
        # Placement group is already set.
        return

-    device_str = "GPU" if not current_platform.is_tpu() else "TPU"
+    device_str = "GPU"
+    if current_platform.is_tpu():
+        device_str = "TPU"
+    elif current_platform.is_hpu():
+        device_str = 'HPU'
    # Create placement group for worker processes
    current_placement_group = ray.util.get_current_placement_group()
    if current_placement_group:

--- a/vllm/model_executor/custom_op.py
+++ b/vllm/model_executor/custom_op.py
@@ -55,10 +55,9 @@ class CustomOp(nn.Module):
        # NOTE(woosuk): This is a placeholder for future extensions.
        return self.forward_native(*args, **kwargs)

-    def forward_gaudi(self, *args, **kwargs):
+    def forward_hpu(self, *args, **kwargs):
        # By default, we assume that Gaudi ops are compatible with the
        # PyTorch-native implementation.
-        # NOTE(woosuk): This is a placeholder for future extensions.
        return self.forward_native(*args, **kwargs)

    def dispatch_forward(self):
@@ -76,6 +75,8 @@ class CustomOp(nn.Module):
            return self.forward_hip
        elif current_platform.is_cpu():
            return self.forward_cpu
+        elif current_platform.is_hpu():
+            return self.forward_hpu
        elif current_platform.is_tpu():
            return self.forward_tpu
        elif current_platform.is_xpu():