[V0 deprecation] Remove V0 HPU backend (#21131)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>

[V0 deprecation] Remove V0 HPU backend (#21131)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
4de71463 · Woosuk Kwon · GitHub · ac9fb732 · ac9fb732 · ac9fb732
Unverified Commit 4de71463 authored Jul 17, 2025 by Woosuk Kwon Committed by GitHub Jul 17, 2025
20 changed files
--- a/docker/Dockerfile.hpu
+++ b/docker/Dockerfile.hpu
-FROM vault.habana.ai/gaudi-docker/1.20.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
-COPY ./ /workspace/vllm
-WORKDIR /workspace/vllm
-RUN pip install -v -r requirements/hpu.txt
-ENV no_proxy=localhost,127.0.0.1
-ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
-RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install
-# install development dependencies (for testing)
-RUN python3 -m pip install -e tests/vllm_test_utils
-WORKDIR /workspace/
-RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
-ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
--- a/requirements/hpu.txt
+++ b/requirements/hpu.txt
-# Common dependencies
-r common.txt
-# Dependencies for HPU code
-ray
-triton==3.1.0
-pandas
-numpy==1.26.4
-tabulate
-setuptools>=77.0.3,<80.0.0
-setuptools-scm>=8
-vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@f1f6624
--- a/setup.py
+++ b/setup.py
@@ -410,29 +410,6 @@ class repackage_wheel(build_ext):
                package_data[package_name].append(file_name)
-def _is_hpu() -> bool:
-    # if VLLM_TARGET_DEVICE env var was set explicitly, skip HPU autodetection
-    if os.getenv("VLLM_TARGET_DEVICE", None) == VLLM_TARGET_DEVICE:
-        return VLLM_TARGET_DEVICE == "hpu"
-    # if VLLM_TARGET_DEVICE was not set explicitly, check if hl-smi succeeds,
-    # and if it doesn't, check if habanalabs driver is loaded
-    is_hpu_available = False
-    try:
-        out = subprocess.run(["hl-smi"], capture_output=True, check=True)
-        is_hpu_available = out.returncode == 0
-    except (FileNotFoundError, PermissionError, subprocess.CalledProcessError):
-        if sys.platform.startswith("linux"):
-            try:
-                output = subprocess.check_output(
-                    'lsmod | grep habanalabs | wc -l', shell=True)
-                is_hpu_available = int(output) > 0
-            except (ValueError, FileNotFoundError, PermissionError,
-                    subprocess.CalledProcessError):
-                pass
-    return is_hpu_available
 def _no_device() -> bool:
    return VLLM_TARGET_DEVICE == "empty"
@@ -440,7 +417,7 @@ def _no_device() -> bool:
 def _is_cuda() -> bool:
    has_cuda = torch.version.cuda is not None
    return (VLLM_TARGET_DEVICE == "cuda" and has_cuda
-            and not (_is_neuron() or _is_tpu() or _is_hpu()))
+            and not (_is_neuron() or _is_tpu()))
 def _is_hip() -> bool:
@@ -573,12 +550,6 @@ def get_vllm_version() -> str:
        if neuron_version != MAIN_CUDA_VERSION:
            neuron_version_str = neuron_version.replace(".", "")[:3]
            version += f"{sep}neuron{neuron_version_str}"
-    elif _is_hpu():
-        # Get the Intel Gaudi Software Suite version
-        gaudi_sw_version = str(get_gaudi_sw_version())
-        if gaudi_sw_version != MAIN_CUDA_VERSION:
-            gaudi_sw_version = gaudi_sw_version.replace(".", "")[:3]
-            version += f"{sep}gaudi{gaudi_sw_version}"
    elif _is_tpu():
        version += f"{sep}tpu"
    elif _is_cpu():
@@ -625,8 +596,6 @@ def get_requirements() -> list[str]:
        requirements = _read_requirements("rocm.txt")
    elif _is_neuron():
        requirements = _read_requirements("neuron.txt")
-    elif _is_hpu():
-        requirements = _read_requirements("hpu.txt")
    elif _is_tpu():
        requirements = _read_requirements("tpu.txt")
    elif _is_cpu():
@@ -635,8 +604,7 @@ def get_requirements() -> list[str]:
        requirements = _read_requirements("xpu.txt")
    else:
        raise ValueError(
-            "Unsupported platform, please use CUDA, ROCm, Neuron, HPU, "
+            "Unsupported platform, please use CUDA, ROCm, Neuron, or CPU.")
-            "or CPU.")
    return requirements

--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -13,8 +13,7 @@ from vllm.scalar_type import ScalarType
 logger = init_logger(__name__)
-if not current_platform.is_tpu() and not current_platform.is_hpu()\
+if not current_platform.is_tpu() and not current_platform.is_xpu():
-        and not current_platform.is_xpu():
    try:
        import vllm._C
    except ImportError as e:

--- a/vllm/attention/backends/hpu_attn.py
+++ b/vllm/attention/backends/hpu_attn.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-###############################################################################
-# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company
-###############################################################################
-from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Tuple, Type
-import torch
-import vllm_hpu_extension.kernels as kernels
-import vllm_hpu_extension.ops as ops
-from vllm_hpu_extension.flags import enabled_flags
-from vllm_hpu_extension.utils import Matmul, Softmax, VLLMKVCache
-from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
-                                              AttentionLayer,
-                                              AttentionMetadata, AttentionType,
-                                              is_quantized_kv_cache)
-from vllm.attention.backends.utils import CommonAttentionState
-from vllm.attention.ops.hpu_paged_attn import (HPUPagedAttention,
-                                               HPUPagedAttentionMetadata)
-from vllm.logger import init_logger
-logger = init_logger(__name__)
-class HPUAttentionBackend(AttentionBackend):
-    @staticmethod
-    def get_name() -> str:
-        return "HPU_ATTN"
-    @staticmethod
-    def get_impl_cls() -> Type["HPUAttentionImpl"]:
-        return HPUAttentionImpl
-    @staticmethod
-    def get_metadata_cls() -> Type["AttentionMetadata"]:
-        return HPUAttentionMetadata
-    @staticmethod
-    def get_state_cls() -> Type["CommonAttentionState"]:
-        return CommonAttentionState
-    @staticmethod
-    def get_kv_cache_shape(
-        num_blocks: int,
-        block_size: int,
-        num_kv_heads: int,
-        head_size: int,
-    ) -> Tuple[int, ...]:
-        return HPUPagedAttention.get_kv_cache_shape(num_blocks, block_size,
-                                                    num_kv_heads, head_size)
-    @staticmethod
-    def swap_blocks(
-        src_kv_cache: torch.Tensor,
-        dst_kv_cache: torch.Tensor,
-        src_to_dsts: torch.Tensor,
-    ) -> None:
-        HPUPagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dsts)
-    @staticmethod
-    def copy_blocks(
-        kv_caches: List[torch.Tensor],
-        src_to_dsts: torch.Tensor,
-    ) -> None:
-        HPUPagedAttention.copy_blocks(kv_caches, src_to_dsts)
-@dataclass
-class HPUAttentionMetadata(HPUPagedAttentionMetadata, AttentionMetadata):
-    """Metadata for HPUAttentionbackend."""
-    # Currently, input sequences can only contain all prompts
-    # or all decoding. True if all sequences are prompts.
-    is_prompt: bool
-    attn_bias: Optional[torch.Tensor]
-    seq_lens_tensor: Optional[torch.Tensor]
-    context_lens_tensor: Optional[torch.Tensor]
-class HPUAttentionImpl(AttentionImpl, torch.nn.Module):
-    """
-    If the input tensors contain prompt tokens, the layout is as follows:
-    |<--------------- num_prefill_tokens ----------------->|
-    |<--prefill_0-->|<--prefill_1-->|...|<--prefill_N-1--->|
-    Otherwise, the layout is as follows:
-    |<----------------- num_decode_tokens ------------------>|
-    |<--decode_0-->|..........|<--decode_M-1-->|<--padding-->|
-    Generation tokens can contain padding when cuda-graph is used.
-    Currently, prompt tokens don't contain any padding.
-    The prompts might have different lengths, while the generation tokens
-    always have length 1.
-    """
-    def __init__(
-        self,
-        num_heads: int,
-        head_size: int,
-        scale: float,
-        num_kv_heads: int,
-        alibi_slopes: Optional[List[float]],
-        sliding_window: Optional[int],
-        kv_cache_dtype: str,
-        blocksparse_params: Optional[Dict[str, Any]] = None,
-        max_seq_len: int = 4096,
-        attn_type: str = AttentionType.DECODER,
-        kv_sharing_target_layer_name: Optional[str] = None,
-        use_irope: bool = False,
-    ) -> None:
-        super(AttentionImpl, self).__init__()
-        if kv_sharing_target_layer_name is not None:
-            raise NotImplementedError("KV sharing is not supported in V0 "
-                                      "HPU_ATTN backend.")
-        if use_irope:
-            logger.warning_once(
-                "Using irope in HPU is not supported yet, it will fall back "
-                "to global attention for long context.")
-        self.kv_cache_dtype = kv_cache_dtype
-        self.num_heads = num_heads
-        self.head_size = head_size
-        self.scale = float(scale)
-        self.matmul_qk = Matmul()
-        self.softmax = Softmax()
-        self.matmul_av = Matmul()
-        self.batch2block_matmul = Matmul()
-        self.block2batch_matmul = Matmul()
-        self.k_cache = VLLMKVCache()
-        self.v_cache = VLLMKVCache()
-        self.fused_scaled_dot_product_attention = kernels.fsdpa()
-        self.prefill_impl = 'naive'
-        if "flex_attention" in enabled_flags():
-            self.prefill_impl = 'flex'
-        if "fsdpa" in enabled_flags():
-            assert alibi_slopes is None, \
-                'Prefill with FusedSDPA not supported with alibi slopes!'
-            self.prefill_impl = 'fsdpa'
-        self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads
-        self.sliding_window = sliding_window
-        self.alibi_slopes = alibi_slopes
-        if alibi_slopes is not None:
-            alibi_slopes_tensor = torch.tensor(alibi_slopes,
-                                               dtype=torch.bfloat16)
-            self.alibi_slopes = alibi_slopes_tensor
-        self.num_queries_per_kv = self.num_heads // self.num_kv_heads
-        if self.prefill_impl == 'fsdpa':
-            assert alibi_slopes is None, \
-                'Prefill with FusedSDPA not supported with alibi slopes!'
-        supported_head_sizes = HPUPagedAttention.get_supported_head_sizes()
-        if head_size not in supported_head_sizes:
-            raise ValueError(
-                f"Head size {head_size} is not supported by PagedAttention. "
-                f"Supported head sizes are: {supported_head_sizes}.")
-        self.attn_type = attn_type
-        if self.attn_type != AttentionType.DECODER:
-            raise NotImplementedError("Encoder self-attention and "
-                                      "encoder/decoder cross-attention "
-                                      "are not implemented for "
-                                      "HPUAttentionImpl")
-        if is_quantized_kv_cache(self.kv_cache_dtype):
-            raise NotImplementedError(
-                "HPUAttention with FP8 KV cache not yet supported")
-    def forward(
-        self,
-        layer: AttentionLayer,
-        query: torch.Tensor,
-        key: torch.Tensor,
-        value: torch.Tensor,
-        kv_cache: torch.Tensor,
-        attn_metadata: HPUAttentionMetadata,
-        output: Optional[torch.Tensor] = None,
-        output_scale: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        """Forward pass with xFormers and PagedAttention.
-        Args:
-            query: shape = [num_tokens, num_heads * head_size]
-            key: shape = [num_tokens, num_kv_heads * head_size]
-            value: shape = [num_tokens, num_kv_heads * head_size]
-            kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size]
-            attn_metadata: Metadata for attention.
-        Returns:
-            shape = [num_tokens, num_heads * head_size]
-        """
-        if output_scale is not None:
-            raise NotImplementedError(
-                "fused output quantization is not yet supported"
-                " for HPUAttentionImpl")
-        batch_size, seq_len, hidden_size = query.shape
-        _, seq_len_kv, _ = key.shape
-        key = key.view(-1, self.num_kv_heads, self.head_size)
-        value = value.view(-1, self.num_kv_heads, self.head_size)
-        block_indices = attn_metadata.block_indices
-        block_offsets = attn_metadata.block_offsets
-        key_cache = None
-        value_cache = None
-        if attn_metadata.is_prompt and self.attn_type \
-           is not AttentionType.ENCODER_ONLY:
-            key = key.unflatten(0, (block_indices.size(0), -1))
-            value = value.unflatten(0, (block_indices.size(0), -1))
-        if kv_cache is not None and isinstance(kv_cache, tuple):
-            key_cache, value_cache = HPUPagedAttention.split_kv_cache(
-                kv_cache, self.num_kv_heads, self.head_size)
-            # Reshape the input keys and values and store them in the cache.
-            # If kv_cache is not provided, the new key and value tensors are
-            # not cached. This happens during the initial memory profiling run.
-            key_cache = self.k_cache(key, key_cache, block_indices,
-                                     block_offsets)
-            value_cache = self.v_cache(value, value_cache, block_indices,
-                                       block_offsets)
-        if attn_metadata.is_prompt:
-            # Prompt run.
-            query_shape = (batch_size, seq_len, self.num_heads, self.head_size)
-            kv_shape = (batch_size, seq_len_kv, self.num_kv_heads,
-                        self.head_size)
-            attn_bias = attn_metadata.attn_bias
-            if attn_bias is not None and self.alibi_slopes is not None:
-                position_bias = _make_alibi_bias(self.alibi_slopes,
-                                                 self.num_kv_heads,
-                                                 attn_bias.dtype,
-                                                 attn_bias.shape[-1])
-                attn_bias = attn_bias.tile((1, self.num_kv_heads, 1, 1))
-                attn_bias.add_(position_bias)
-            block_list = attn_metadata.block_list if attn_metadata \
-                and attn_metadata.block_list is not None else None
-            out = ops.prompt_attention(
-                impl=self.prefill_impl,
-                query=query.view(query_shape),
-                key=key.view(kv_shape),
-                value=value.view(kv_shape),
-                is_causal=True,
-                attn_bias=attn_bias,
-                valid_seq_lengths=attn_metadata.seq_lens_tensor,
-                **self.common_attention_args(block_list, key_cache,
-                                             value_cache))
-            output = out.reshape(batch_size, seq_len, hidden_size)
-        else:
-            # Decoding run.
-            output = HPUPagedAttention.forward_decode(
-                query=query,
-                block_mapping=attn_metadata.block_mapping,
-                block_bias=attn_metadata.attn_bias,
-                block_groups=attn_metadata.block_groups,
-                **self.common_attention_args(attn_metadata.block_list,
-                                             key_cache, value_cache))
-        # Reshape the output tensor.
-        return output.view(batch_size, seq_len, hidden_size)
-    def common_attention_args(self,
-                              block_list=None,
-                              key_cache=None,
-                              value_cache=None):
-        fsdpa_op = self.fused_scaled_dot_product_attention.apply \
-            if self.fused_scaled_dot_product_attention is not None else None
-        return {
-            'scale': self.scale,
-            'matmul_qk_op': self.matmul_qk,
-            'matmul_av_op': self.matmul_av,
-            'batch2block_matmul_op': self.batch2block_matmul,
-            'block2batch_matmul_op': self.block2batch_matmul,
-            'fsdpa_op': fsdpa_op,
-            'keys_fetch_func': self.k_cache.fetch_from_cache,
-            'values_fetch_func': self.v_cache.fetch_from_cache,
-            'softmax_op': self.softmax,
-            'block_list': block_list,
-            'key_cache': key_cache,
-            'value_cache': value_cache,
-        }
-def _make_alibi_bias(
-    alibi_slopes: torch.Tensor,
-    num_kv_heads: int,
-    dtype: torch.dtype,
-    seq_len: int,
-) -> torch.Tensor:
-    bias = torch.arange(seq_len, dtype=dtype)
-    # NOTE(zhuohan): HF uses
-    #     `bias = bias[None, :].repeat(seq_len, 1)`
-    # here. We find that both biases give the same results, but
-    # the bias below more accurately follows the original ALiBi
-    # paper.
-    # Calculate a matrix where each element represents ith element- jth
-    # element.
-    bias = bias[None, :] - bias[:, None]
-    padded_len = (seq_len + 7) // 8 * 8
-    num_heads = alibi_slopes.shape[0]
-    bias = torch.empty(
-        1,  # batch size
-        num_heads,
-        seq_len,
-        padded_len,
-        device=alibi_slopes.device,
-        dtype=dtype,
-    )[:, :, :, :seq_len].copy_(bias)
-    bias.mul_(alibi_slopes[:, None, None])
-    if num_heads != num_kv_heads:
-        bias = bias.unflatten(1, (num_kv_heads, num_heads // num_kv_heads))
-    return bias
--- a/vllm/attention/ops/hpu_paged_attn.py
+++ b/vllm/attention/ops/hpu_paged_attn.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-###############################################################################
-# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company
-###############################################################################
-from dataclasses import dataclass
-from typing import List, Optional, Tuple
-import torch
-from vllm_hpu_extension import cache_ops, ops
-# Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`.
-_PARTITION_SIZE = 512
-@dataclass
-class HPUPagedAttentionMetadata:
-    """Metadata for PagedAttention."""
-    block_list: Optional[torch.Tensor]
-    block_mapping: Optional[torch.Tensor]
-    block_usage: Optional[torch.Tensor]
-    block_indices: Optional[torch.Tensor]
-    block_offsets: Optional[torch.Tensor]
-    block_groups: Optional[torch.Tensor]
-class HPUPagedAttention:
-    @staticmethod
-    def get_supported_head_sizes() -> List[int]:
-        return [64, 80, 96, 112, 128, 256]
-    @staticmethod
-    def get_kv_cache_shape(
-        num_blocks: int,
-        block_size: int,
-        num_kv_heads: int,
-        head_size: int,
-    ) -> Tuple[int, ...]:
-        return (num_blocks, block_size, num_kv_heads, head_size)
-    @staticmethod
-    def split_kv_cache(
-        kv_cache: torch.Tensor,
-        num_kv_heads: int,
-        head_size: int,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        key_cache = kv_cache[0]
-        value_cache = kv_cache[1]
-        return key_cache, value_cache
-    @staticmethod
-    def write_to_paged_cache(key: torch.Tensor, value: torch.Tensor,
-                             key_cache: torch.Tensor,
-                             value_cache: torch.Tensor,
-                             slot_mapping: torch.Tensor, kv_cache_dtype: str,
-                             is_prompt: bool) -> None:
-        cache_ops.reshape_and_cache(key, value, key_cache, value_cache,
-                                    slot_mapping, kv_cache_dtype, is_prompt)
-    @staticmethod
-    def forward_decode(**kwargs) -> torch.Tensor:
-        return ops.flat_pa(**kwargs)
-    @staticmethod
-    def swap_blocks(
-        src_kv_cache: Tuple[torch.Tensor, torch.Tensor],
-        dst_kv_cache: Tuple[torch.Tensor, torch.Tensor],
-        src_to_dsts: torch.Tensor,
-    ) -> None:
-        src_key_cache = src_kv_cache[0]
-        dst_key_cache = dst_kv_cache[0]
-        cache_ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dsts)
-        src_value_cache = src_kv_cache[1]
-        dst_value_cache = dst_kv_cache[1]
-        cache_ops.swap_blocks(src_value_cache, dst_value_cache, src_to_dsts)
-    @staticmethod
-    def copy_blocks(
-        kv_caches: List[Tuple[torch.Tensor, torch.Tensor]],
-        src_to_dsts: torch.Tensor,
-    ) -> None:
-        key_caches = [kv_cache[0] for kv_cache in kv_caches]
-        value_caches = [kv_cache[1] for kv_cache in kv_caches]
-        cache_ops.copy_blocks(key_caches, value_caches, src_to_dsts)
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -2452,7 +2452,7 @@ class SchedulerConfig:
        return self.num_scheduler_steps > 1
-Device = Literal["auto", "cuda", "neuron", "cpu", "tpu", "xpu", "hpu"]
+Device = Literal["auto", "cuda", "neuron", "cpu", "tpu", "xpu"]
 @config

--- a/vllm/core/block/cpu_gpu_block_allocator.py
+++ b/vllm/core/block/cpu_gpu_block_allocator.py
@@ -7,7 +7,6 @@ from vllm.core.block.interfaces import (Block, BlockAllocator, BlockId,
                                        DeviceAwareBlockAllocator)
 from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator
 from vllm.core.block.prefix_caching_block import PrefixCachingBlockAllocator
-from vllm.platforms import current_platform
 from vllm.utils import Device
@@ -56,8 +55,7 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
            - The block IDs are assigned contiguously, with GPU block IDs coming
                before CPU block IDs.
        """
-        # For HPU, block id 0 is used only for padding
+        reserved_blocks = 0
-        reserved_blocks = 1 if current_platform.is_hpu() else 0
        block_ids = list(
            range(reserved_blocks, num_gpu_blocks + num_cpu_blocks))
        num_gpu_blocks -= reserved_blocks

--- a/vllm/distributed/device_communicators/hpu_communicator.py
+++ b/vllm/distributed/device_communicators/hpu_communicator.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import torch
-import torch.distributed as dist
-from vllm.platforms import current_platform
-from .base_device_communicator import DeviceCommunicatorBase
-if current_platform.is_hpu():
-    import habana_frameworks.torch as htorch  # noqa: F401
-class HpuCommunicator(DeviceCommunicatorBase):
-    def all_reduce(self, input_: torch.Tensor) -> torch.Tensor:
-        # FIXME(kzawora): this is a workaround for a bug in Habana PT bridge
-        # occurring when PT_HPU_ENABLE_LAZY_COLLECTIVES=true env var is used
-        # (which is required for tensor parallel HPUGraph inference)
-        htorch.core.mark_step()
-        dist.all_reduce(input_, group=self.device_group)
-        return input_
-    def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor:
-        world_size = self.world_size
-        if dim < 0:
-            # Convert negative dim to positive.
-            dim += input_.dim()
-        input_size = input_.size()
-        # Allocate output tensor.
-        output_tensor = torch.empty((world_size, ) + input_size,
-                                    dtype=input_.dtype,
-                                    device=input_.device)
-        # All-gather.
-        htorch.core.mark_step()
-        dist.all_gather_into_tensor(output_tensor,
-                                    input_,
-                                    group=self.device_group)
-        # Reshape
-        output_tensor = output_tensor.movedim(0, dim)
-        output_tensor = output_tensor.reshape(input_size[:dim] +
-                                              (world_size *
-                                               input_size[dim], ) +
-                                              input_size[dim + 1:])
-        return output_tensor
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1365,9 +1365,8 @@ class EngineArgs:
            supported = False
            if current_platform.is_rocm() or (
                    current_platform.is_cuda()
-                    and current_platform.is_device_capability(100)) or (
+                    and current_platform.is_device_capability(100)
-                        current_platform.device_name
+            ):
-                        == "hpu"):  # handle hpu also for OOT platform
                supported = True
            elif fp8_attention and will_use_fa:
                from vllm.attention.utils.fa_utils import (

--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -106,8 +106,6 @@ if TYPE_CHECKING:
    VLLM_RAY_PER_WORKER_GPUS: float = 1.0
    VLLM_RAY_BUNDLE_INDICES: str = ""
    VLLM_CUDART_SO_PATH: Optional[str] = None
-    VLLM_USE_HPU_CONTIGUOUS_CACHE_FETCH: bool = True
-    VLLM_HPU_USE_DELAYED_SAMPLING: bool = False
    VLLM_DP_RANK: int = 0
    VLLM_DP_RANK_LOCAL: int = -1
    VLLM_DP_SIZE: int = 1
@@ -780,19 +778,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_CUDART_SO_PATH":
    lambda: os.getenv("VLLM_CUDART_SO_PATH", None),
-    # Contiguous cache fetching to avoid using costly gather operation on
-    # Gaudi3. This is only applicable to HPU contiguous cache. If set to true,
-    # contiguous cache fetch will be used.
-    "VLLM_USE_HPU_CONTIGUOUS_CACHE_FETCH":
-    lambda: os.environ.get("VLLM_CONTIGUOUS_PA", "true").lower() in
-    ("1", "true"),
-    # Use delayed sampling for HPU to reduce host cpu overhead
-    # between each step.
-    "VLLM_HPU_USE_DELAYED_SAMPLING":
-    lambda: os.environ.get("VLLM_DELAYED_SAMPLING", "false").lower() in
-    ("1", "true"),
    # Rank of the process in the data parallel setting
    "VLLM_DP_RANK":
    lambda: int(os.getenv("VLLM_DP_RANK", "0")),

--- a/vllm/lora/layers.py
+++ b/vllm/lora/layers.py
@@ -1164,10 +1164,6 @@ class LogitsProcessorWithLoRA(BaseLayerWithLoRA):
                                                      posinf=pos_inf,
                                                      neginf=neg_inf))
-        # HPU needs special handling to prune out dummy samples.
-        if current_platform.is_hpu():
-            lora_logits = lora_logits[:logits.shape[0], :]
        logits[:,
               self.base_layer.org_vocab_size:self.base_layer.org_vocab_size +
               lora_logits.shape[1]] = lora_logits

--- a/vllm/lora/punica_wrapper/punica_hpu.py
+++ b/vllm/lora/punica_wrapper/punica_hpu.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import TYPE_CHECKING, Optional, Union, final
-import torch
-from vllm_hpu_extension.ops import (dispatch_bgmv_embedding,
-                                    dispatch_bgmv_linear)
-from .punica_base import PunicaWrapperBase
-from .utils import convert_mapping
-if TYPE_CHECKING:
-    # avoid circuit import
-    from vllm.lora.layers import LoRAMapping
-    from vllm.lora.models import LongContextLoRAContext
-@final
-class PunicaWrapperHPU(PunicaWrapperBase):
-    def __init__(self, max_num_batched_tokens: int, max_batches: int,
-                 device: Union[torch.device, str], **kwargs):
-        # Increasing max_num_batched_tokens by 3x to handle increase in
-        # tensor size due to padding.
-        PunicaWrapperBase.__init__(self, 3 * max_num_batched_tokens,
-                                   max_batches, device)
-    def _update_base_metadata(
-        self,
-        mapping: "LoRAMapping",
-        lora_index_to_id: list[Optional[int]],
-        max_loras: int,
-        vocab_size: int,
-        extra_vocab_size: int,
-        long_lora_context: Optional["LongContextLoRAContext"] = None,
-    ):
-        (
-            base_indices,
-            sampler_indices,
-            sampler_indices_padded,
-            embeddings_indices,
-            long_lora_offsets_tensor,
-            indices_len,
-        ) = convert_mapping(mapping, lora_index_to_id, max_loras, vocab_size,
-                            extra_vocab_size, self.device, None)
-        # Updating each element in `long_lora_offsets` with `lora_offset` slows
-        # down perf in HPU due to a series of `strided_insert` ops during lazy
-        # graph accumulation. Hence HPU appends `lora_offset` to a list and
-        # converts it to a tensor only after it is ready.
-        if long_lora_context:
-            index_mapping_indices: list[int] = list(
-                mapping.index_mapping).copy()
-            long_lora_offsets: list[int] = []
-            for i in range(len(index_mapping_indices)):
-                lora_offset: int = long_lora_context.offsets_by_lora_id.get(
-                    index_mapping_indices[i], 0)
-                long_lora_offsets.append(lora_offset)
-            long_lora_offsets_tensor = torch.tensor(long_lora_offsets,
-                                                    device=self.device,
-                                                    dtype=torch.long)
-            indices_len[-1] = long_lora_offsets_tensor.shape[-1]
-        self._token_lora_indices[:base_indices.shape[0]].copy_(base_indices)
-        self._sampler_indices[:sampler_indices.shape[0]].copy_(sampler_indices)
-        self._sampler_indices_padded[:sampler_indices_padded.shape[0]].copy_(
-            sampler_indices_padded)
-        self._embeddings_indices[:embeddings_indices.
-                                 shape[0], :embeddings_indices.shape[1]].copy_(
-                                     embeddings_indices)
-        if long_lora_offsets_tensor is not None:
-            self._long_lora_indices[:long_lora_offsets_tensor.shape[0]].copy_(
-                long_lora_offsets_tensor)
-        else:
-            self._long_lora_indices.zero_()
-        self.indices_len[:] = indices_len
-    def add_lora_embedding(self,
-                           y: torch.Tensor,
-                           x: torch.Tensor,
-                           lora_b_stacked: torch.Tensor,
-                           add_inputs: bool = True,
-                           **kwargs) -> None:
-        dispatch_bgmv_embedding(y, x, lora_b_stacked, 0)
-    def add_lora_linear(self,
-                        y: torch.Tensor,
-                        x: torch.Tensor,
-                        lora_a_stacked: tuple[torch.Tensor, ...],
-                        lora_b_stacked: tuple[torch.Tensor, ...],
-                        lora_bias_stacked: Optional[tuple[torch.Tensor, ...]],
-                        scale: float,
-                        output_slices: tuple[int, ...],
-                        *,
-                        buffer: Optional[tuple[torch.Tensor, ...]] = None,
-                        **kwargs) -> None:
-        y_org = y
-        x = x.view(-1, x.shape[-1])
-        y = y.view(-1, y.shape[-1])
-        offset_left = 0
-        for slice_idx in range(len(output_slices)):
-            dispatch_bgmv_linear(
-                y[:, offset_left:offset_left + output_slices[slice_idx]], x,
-                lora_a_stacked[slice_idx], lora_b_stacked[slice_idx], 0, scale)
-            offset_left += output_slices[slice_idx]
-        y = y.view_as(y_org)
-    def add_lora_logits(self,
-                        y: torch.Tensor,
-                        x: torch.Tensor,
-                        lora_a_stacked: torch.Tensor,
-                        lora_b_stacked: torch.Tensor,
-                        scale,
-                        *,
-                        buffer: Optional[torch.Tensor] = None,
-                        **kwargs) -> None:
-        y_org = y
-        y = y.view(-1, y.shape[-1])
-        x = x.view(-1, x.shape[-1])
-        dispatch_bgmv_linear(y, x, lora_a_stacked, lora_b_stacked, 0, scale)
-        y = y.view_as(y_org)
-    def add_shrink(
-        self,
-        y: Union[tuple[torch.Tensor, ...], torch.Tensor],
-        x: torch.Tensor,
-        lora_a_stacked: tuple[torch.Tensor, ...],
-        scale: float,
-        **kwargs,
-    ) -> None:
-        raise NotImplementedError
-    def add_expand(
-        self,
-        y: torch.Tensor,
-        x: Union[tuple[torch.Tensor, ...], torch.Tensor],
-        lora_b_stacked: tuple[torch.Tensor, ...],
-        lora_bias_stacked: Optional[tuple[torch.Tensor, ...]],
-        output_slices: tuple[int, ...],
-        offset_start: int = 0,
-        add_inputs=True,
-        **kwargs,
-    ) -> None:
-        raise NotImplementedError
--- a/vllm/model_executor/custom_op.py
+++ b/vllm/model_executor/custom_op.py
@@ -73,11 +73,6 @@ class CustomOp(nn.Module):
        # NOTE(woosuk): This is a placeholder for future extensions.
        return self.forward_native(*args, **kwargs)
-    def forward_hpu(self, *args, **kwargs):
-        # By default, we assume that Gaudi ops are compatible with the
-        # PyTorch-native implementation.
-        return self.forward_native(*args, **kwargs)
    def forward_neuron(self, *args, **kwargs):
        # By default, we assume that Neuron ops are compatible with the
        # PyTorch-native implementation.
@@ -106,8 +101,6 @@ class CustomOp(nn.Module):
            return self.forward_hip
        elif current_platform.is_cpu():
            return self.forward_cpu
-        elif current_platform.is_hpu():
-            return self.forward_hpu
        elif current_platform.is_tpu():
            return self.forward_tpu
        elif current_platform.is_xpu():

--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -475,39 +475,6 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
            activation,
        )
-    def forward_hpu(
-        self,
-        layer: torch.nn.Module,
-        x: torch.Tensor,
-        use_grouped_topk: bool,
-        top_k: int,
-        router_logits: torch.Tensor,
-        renormalize: bool,
-        topk_group: Optional[int] = None,
-        num_expert_group: Optional[int] = None,
-        global_num_experts: int = -1,
-        expert_map: Optional[torch.Tensor] = None,
-        custom_routing_function: Optional[Callable] = None,
-        scoring_func: str = "softmax",
-        e_score_correction_bias: Optional[torch.Tensor] = None,
-        apply_router_weight_on_input: bool = False,
-        activation: str = "silu",
-    ) -> torch.Tensor:
-        assert not use_grouped_topk
-        assert num_expert_group is None
-        assert topk_group is None
-        assert custom_routing_function is None
-        assert layer is not None
-        assert apply_router_weight_on_input is False
-        if scoring_func != "softmax":
-            raise NotImplementedError(
-                "Only softmax scoring function is supported for HPU.")
-        if e_score_correction_bias is not None:
-            raise NotImplementedError(
-                "Expert score correction bias is not supported for HPU.")
-        return layer.hpu_fused_moe(x, layer.w13_weight, layer.w2_weight,
-                                   router_logits, top_k)
    def forward_tpu(
        self,
        layer: torch.nn.Module,
@@ -716,9 +683,6 @@ class FusedMoE(torch.nn.Module):
        if self.scoring_func != "softmax" and not self.use_grouped_topk:
            raise ValueError("Only softmax scoring function is supported for "
                             "non-grouped topk.")
-        if current_platform.is_hpu():
-            from vllm_hpu_extension.ops import DynamicFusedMOE
-            self.hpu_fused_moe = DynamicFusedMOE(self.global_num_experts)
        if vllm_config.model_config is not None:
            model_dtype = vllm_config.model_config.dtype

--- a/vllm/model_executor/layers/layernorm.py
+++ b/vllm/model_executor/layers/layernorm.py
@@ -170,26 +170,6 @@ class RMSNorm(CustomOp):
        else:
            return norm_func(x, self.weight.data, self.variance_epsilon)
-    def forward_hpu(
-        self,
-        x: torch.Tensor,
-        residual: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
-        from vllm_hpu_extension.kernels import rms_norm
-        HPUFusedRMSNorm = rms_norm()
-        if HPUFusedRMSNorm is None:
-            return self.forward_native(x, residual)
-        if residual is not None:
-            orig_shape = x.shape
-            residual += x.view(residual.shape)
-            # Note: HPUFusedRMSNorm requires 3D tensors as inputs
-            x = HPUFusedRMSNorm.apply(residual, self.weight,
-                                      self.variance_epsilon)
-            return x.view(orig_shape), residual
-        x = HPUFusedRMSNorm.apply(x, self.weight, self.variance_epsilon)
-        return x
    def forward_xpu(
        self,
        x: torch.Tensor,

--- a/vllm/model_executor/layers/rotary_embedding.py
+++ b/vllm/model_executor/layers/rotary_embedding.py
@@ -229,64 +229,6 @@ class RotaryEmbedding(CustomOp):
                                     self.cos_sin_cache, self.is_neox_style)
        return query, key
-    def forward_hpu(
-        self,
-        positions: torch.Tensor,
-        query: torch.Tensor,
-        key: Optional[torch.Tensor] = None,
-        offsets: Optional[torch.Tensor] = None,
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
-        from habana_frameworks.torch.hpex.kernels import (
-            RotaryPosEmbeddingMode, apply_rotary_pos_emb)
-        if offsets is not None:
-            offsets = offsets.view(positions.shape[0], -1)
-            positions = positions + offsets
-        positions = positions.flatten()
-        num_tokens = positions.shape[0]
-        cos_sin = self.cos_sin_cache.index_select(0, positions).view(
-            num_tokens, 1, -1)
-        cos, sin = cos_sin.chunk(2, dim=-1)
-        # HPU RoPE kernel requires hidden dimension for cos and sin to be equal
-        # to query hidden dimension, so the original tensors need to be
-        # expanded
-        # GPT-NeoX kernel requires position_ids = None, offset, mode = BLOCKWISE
-        # and expansion of cos/sin tensors via concatenation
-        # GPT-J kernel requires position_ids = None, offset = 0, mode = PAIRWISE
-        # and expansion of cos/sin tensors via repeat_interleave
-        rope_mode: RotaryPosEmbeddingMode
-        if self.is_neox_style:
-            rope_mode = RotaryPosEmbeddingMode.BLOCKWISE
-            cos = torch.cat((cos, cos), dim=-1)
-            sin = torch.cat((sin, sin), dim=-1)
-        else:
-            rope_mode = RotaryPosEmbeddingMode.PAIRWISE
-            sin = torch.repeat_interleave(sin,
-                                          2,
-                                          dim=-1,
-                                          output_size=cos_sin.shape[-1])
-            cos = torch.repeat_interleave(cos,
-                                          2,
-                                          dim=-1,
-                                          output_size=cos_sin.shape[-1])
-        query_shape = query.shape
-        query = query.view(num_tokens, -1, self.head_size)
-        query_rot = query[..., :self.rotary_dim]
-        query_pass = query[..., self.rotary_dim:]
-        query_rot = apply_rotary_pos_emb(query_rot, cos, sin, None, 0,
-                                         rope_mode)
-        query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape)
-        if key is not None:
-            key_shape = key.shape
-            key = key.view(num_tokens, -1, self.head_size)
-            key_rot = key[..., :self.rotary_dim]
-            key_pass = key[..., self.rotary_dim:]
-            key_rot = apply_rotary_pos_emb(key_rot, cos, sin, None, 0,
-                                           rope_mode)
-            key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
-        return query, key
    def forward_neuron(
        self,
        positions: torch.Tensor,

--- a/vllm/model_executor/layers/vocab_parallel_embedding.py
+++ b/vllm/model_executor/layers/vocab_parallel_embedding.py
@@ -388,18 +388,6 @@ class VocabParallelEmbedding(torch.nn.Module):
        # Copy the data. Select chunk corresponding to current shard.
        loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
-        if current_platform.is_hpu():
-            # FIXME(kzawora): Weight copy with slicing bugs out on Gaudi here,
-            # so we're using a workaround. Remove this when fixed in
-            # HPU PT bridge.
-            padded_weight = torch.cat([
-                loaded_weight,
-                torch.zeros(param.shape[0] - loaded_weight.shape[0],
-                            *loaded_weight.shape[1:])
-            ])
-            param.data.copy_(padded_weight)
-        else:
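-            param[:loaded_weight.shape[0]].data.copy_(loaded_weight)
-            param[loaded_weight.shape[0]:].data.fill_(0)
+        param[:loaded_weight.shape[0]].data.copy_(loaded_weight)
+        param[loaded_weight.shape[0]:].data.fill_(0)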

--- a/vllm/model_executor/model_loader/bitsandbytes_loader.py
+++ b/vllm/model_executor/model_loader/bitsandbytes_loader.py
@@ -199,10 +199,6 @@ class BitsAndBytesModelLoader(BaseModelLoader):
        if self.pre_quant:
            if self.load_8bit:
-                if current_platform.is_hpu():
-                    raise ValueError(
-                        "currently hpu supports 4bit quantization only")
                return self._quantized_8bit_generator(
                    hf_weights_files, use_safetensors,
                    quant_state_dict), quant_state_dict
@@ -306,10 +302,6 @@ class BitsAndBytesModelLoader(BaseModelLoader):
                        in temp_state_dict):
                quant_state = _parse_quant_state(mapped_weight_name,
                                                 temp_state_dict)
-                if current_platform.is_hpu():
-                    assert quant_state.quant_type == "nf4", (
-                        "currently hpu supports nf4 quant_type only")
                quant_state_dict[mapped_weight_name] = quant_state
                yield org_weight_name, weight_tensor
            else:
@@ -380,8 +372,7 @@ class BitsAndBytesModelLoader(BaseModelLoader):
                                                      ...]
                # bitsandbytes requires data in GPU
-                if (weight_sub_tensor.is_cuda
-                        or weight_sub_tensor.device.type == "hpu"):
+                if weight_sub_tensor.is_cuda:
                    loaded_weight = weight_sub_tensor
                else:
                    loaded_weight = weight_sub_tensor.to(

--- a/vllm/model_executor/model_loader/default_loader.py
+++ b/vllm/model_executor/model_loader/default_loader.py
@@ -218,16 +218,6 @@ class DefaultModelLoader(BaseModelLoader):
            weights_iterator = _xla_weights_iterator(weights_iterator)
-        elif current_platform.is_hpu():
-            import habana_frameworks.torch.core as htcore
-            def _hpu_weights_iterator(iterator: Generator):
-                for weights in iterator:
-                    yield weights
-                    htcore.mark_step()
-            weights_iterator = _hpu_weights_iterator(weights_iterator)
        if self.counter_before_loading_weights == 0.0:
            self.counter_before_loading_weights = time.perf_counter()
        # Apply the prefix.