[V0 deprecation] Remove V0 HPU backend (#21131)

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>

[V0 deprecation] Remove V0 HPU backend (#21131)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
4de71463 · Woosuk Kwon · GitHub · ac9fb732 · 4de71463 · ac9fb732
Unverified Commit 4de71463 authored Jul 17, 2025 by Woosuk Kwon Committed by GitHub Jul 17, 2025
7 changed files
--- a/vllm/platforms/__init__.py
+++ b/vllm/platforms/__init__.py
@@ -116,23 +116,6 @@ def rocm_platform_plugin() -> Optional[str]:
    return "vllm.platforms.rocm.RocmPlatform" if is_rocm else None
-def hpu_platform_plugin() -> Optional[str]:
-    is_hpu = False
-    logger.debug("Checking if HPU platform is available.")
-    try:
-        from importlib import util
-        is_hpu = util.find_spec('habana_frameworks') is not None
-        if is_hpu:
-            logger.debug("Confirmed HPU platform is available.")
-        else:
-            logger.debug("HPU platform is not available because "
-                         "habana_frameworks is not found.")
-    except Exception as e:
-        logger.debug("HPU platform is not available because: %s", str(e))
-    return "vllm.platforms.hpu.HpuPlatform" if is_hpu else None
 def xpu_platform_plugin() -> Optional[str]:
    is_xpu = False
    logger.debug("Checking if XPU platform is available.")
@@ -208,7 +191,6 @@ builtin_platform_plugins = {
    'tpu': tpu_platform_plugin,
    'cuda': cuda_platform_plugin,
    'rocm': rocm_platform_plugin,
-    'hpu': hpu_platform_plugin,
    'xpu': xpu_platform_plugin,
    'cpu': cpu_platform_plugin,
    'neuron': neuron_platform_plugin,

--- a/vllm/platforms/hpu.py
+++ b/vllm/platforms/hpu.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import os
-from typing import TYPE_CHECKING, Optional
-import torch
-from vllm import envs
-from vllm.logger import init_logger
-from vllm.utils import DEFAULT_MAX_NUM_BATCHED_TOKENS
-from .interface import Platform, PlatformEnum, _Backend
-if TYPE_CHECKING:
-    from vllm.config import VllmConfig
-else:
-    VllmConfig = None
-logger = init_logger(__name__)
-class HpuPlatform(Platform):
-    _enum = PlatformEnum.HPU
-    device_name: str = "hpu"
-    device_type: str = "hpu"
-    dispatch_key: str = "HPU"
-    ray_device_key: str = "HPU"
-    dist_backend: str = "hccl"
-    device_control_env_var: str = "HABANA_VISIBLE_MODULES"
-    @classmethod
-    def get_attn_backend_cls(cls, selected_backend: _Backend, head_size: int,
-                             dtype: torch.dtype, kv_cache_dtype: Optional[str],
-                             block_size: int, use_v1: bool,
-                             use_mla: bool) -> str:
-        logger.info("Using HPUAttention backend.")
-        return "vllm.attention.backends.hpu_attn.HPUAttentionBackend"
-    @classmethod
-    def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool:
-        return True
-    @classmethod
-    def inference_mode(cls):
-        return torch.no_grad()
-    @classmethod
-    def set_device(cls, device: torch.device) -> None:
-        """
-        Set the device for the current platform.
-        """
-        torch.hpu.set_device(device)
-    @classmethod
-    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
-        scheduler_config = vllm_config.scheduler_config
-        parallel_config = vllm_config.parallel_config
-        if scheduler_config.is_multi_step:
-            parallel_config.worker_cls = \
-                "vllm.worker.multi_step_hpu_worker.MultiStepHPUWorker"
-        if vllm_config.speculative_config is not None:
-            raise NotImplementedError(
-                "Speculative decoding is not implemented for HPU")
-        if parallel_config.worker_cls == "auto":
-            parallel_config.worker_cls = "vllm.worker.hpu_worker.HPUWorker"
-        # NOTE(kzawora): default block size for Gaudi should be 128
-        # smaller sizes still work, but very inefficiently
-        cache_config = vllm_config.cache_config
-        if cache_config and cache_config.block_size is None:
-            cache_config.block_size = 128
-        if (parallel_config.distributed_executor_backend == 'mp'
-                and envs.VLLM_WORKER_MULTIPROC_METHOD == 'fork'):
-            if os.environ.get("VLLM_WORKER_MULTIPROC_METHOD",
-                              None) is not None:
-                logger.warning("On HPU, VLLM_WORKER_MULTIPROC_METHOD=fork "
-                               "might cause application hangs on exit. Using "
-                               "VLLM_WORKER_MULTIPROC_METHOD=fork anyway, "
-                               "as it was explicitly requested.")
-            else:
-                logger.warning(
-                    "On HPU, VLLM_WORKER_MULTIPROC_METHOD=fork "
-                    "might cause application hangs on exit. Setting "
-                    "VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. "
-                    "To override that behavior, please set "
-                    "VLLM_WORKER_MULTIPROC_METHOD=fork explicitly.")
-                os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
-        if vllm_config.model_config and vllm_config.model_config.use_mla:
-            logger.info(
-                "MLA is enabled on a non-GPU platform; forcing chunked "
-                "prefill and prefix caching to be disabled.")
-            vllm_config.scheduler_config.enable_chunked_prefill = False
-            vllm_config.scheduler_config.chunked_prefill_enabled = False
-            vllm_config.scheduler_config.max_num_batched_tokens = max(
-                vllm_config.scheduler_config.max_model_len,
-                DEFAULT_MAX_NUM_BATCHED_TOKENS)
-    @classmethod
-    def is_pin_memory_available(cls):
-        logger.warning("Pin memory is not supported on HPU.")
-        return False
-    @classmethod
-    def get_punica_wrapper(cls) -> str:
-        return "vllm.lora.punica_wrapper.punica_hpu.PunicaWrapperHPU"
-    @classmethod
-    def get_device_communicator_cls(cls) -> str:
-        return "vllm.distributed.device_communicators.hpu_communicator.HpuCommunicator"  # noqa
--- a/vllm/platforms/interface.py
+++ b/vllm/platforms/interface.py
@@ -54,7 +54,6 @@ class _Backend(enum.Enum):
    FLASHMLA_VLLM_V1 = enum.auto()
    FLASHMLA = enum.auto()  # Supported by V1
    CUTLASS_MLA_VLLM_V1 = enum.auto()
-    HPU_ATTN = enum.auto()
    PALLAS = enum.auto()
    PALLAS_VLLM_V1 = enum.auto()
    IPEX = enum.auto()
@@ -69,7 +68,6 @@ class PlatformEnum(enum.Enum):
    CUDA = enum.auto()
    ROCM = enum.auto()
    TPU = enum.auto()
-    HPU = enum.auto()
    XPU = enum.auto()
    CPU = enum.auto()
    NEURON = enum.auto()
@@ -154,9 +152,6 @@ class Platform:
    def is_tpu(self) -> bool:
        return self._enum == PlatformEnum.TPU
-    def is_hpu(self) -> bool:
-        return self._enum == PlatformEnum.HPU
    def is_xpu(self) -> bool:
        return self._enum == PlatformEnum.XPU

--- a/vllm/plugins/__init__.py
+++ b/vllm/plugins/__init__.py
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import logging
-import os
 from typing import Any, Callable
 import torch
@@ -75,18 +74,6 @@ def load_general_plugins():
    if current_platform.is_xpu():
        # see https://github.com/pytorch/pytorch/blob/43c5f59/torch/_dynamo/config.py#L158
        torch._dynamo.config.disable = True
-    elif current_platform.is_hpu():
-        # NOTE(kzawora): PT HPU lazy backend (PT_HPU_LAZY_MODE = 1)
-        # does not support torch.compile
-        # Eager backend (PT_HPU_LAZY_MODE = 0) must be selected for
-        # torch.compile support
-        is_lazy = os.environ.get('PT_HPU_LAZY_MODE', '1') == '1'
-        if is_lazy:
-            torch._dynamo.config.disable = True
-            # NOTE(kzawora) multi-HPU inference with HPUGraphs (lazy-only)
-            # requires enabling lazy collectives
-            # see https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_HPU_Graphs.html # noqa: E501
-            os.environ['PT_HPU_ENABLE_LAZY_COLLECTIVES'] = 'true'
    plugins = load_plugins_by_group(group=DEFAULT_PLUGINS_GROUP)
    # general plugins, we only need to execute the loaded functions

--- a/vllm/worker/hpu_model_runner.py
+++ b/vllm/worker/hpu_model_runner.py
--- a/vllm/worker/hpu_worker.py
+++ b/vllm/worker/hpu_worker.py
--- a/vllm/worker/multi_step_hpu_worker.py
+++ b/vllm/worker/multi_step_hpu_worker.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-###############################################################################
-# Copyright (C) 2025 Habana Labs, Ltd. an Intel Company
-###############################################################################
-import dataclasses
-from typing import Dict, Optional, Tuple
-import torch
-from vllm.distributed import broadcast_tensor_dict
-from vllm.sequence import ExecuteModelRequest
-from vllm.worker.hpu_model_runner import ModelInputForHPU
-from vllm.worker.hpu_worker import HPUWorker
-from vllm.worker.worker_base import WorkerInput
-class MultiStepHPUWorker(HPUWorker):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.cached_model_input: Optional[ModelInputForHPU] = None
-    def _get_driver_input_and_broadcast(
-        self, execute_model_req: ExecuteModelRequest
-    ) -> Tuple[ModelInputForHPU, WorkerInput, Dict[str, torch.Tensor]]:
-        """
-        Get the driver input and broadcast it to other workers.
-        """
-        assert self.is_driver_worker
-        assert execute_model_req.virtual_engine == 0
-        is_first_multi_step = execute_model_req.is_first_multi_step
-        is_last_step = execute_model_req.is_last_step
-        if is_first_multi_step:
-            # on first step we prepare the worker input and model input normally
-            worker_input: WorkerInput = self.prepare_worker_input(
-                execute_model_req=execute_model_req)
-            worker_input = dataclasses.replace(
-                worker_input,
-                num_steps=execute_model_req.num_lookahead_slots + 1)
-            model_input: ModelInputForHPU = (
-                self.model_runner.prepare_model_input(
-                    execute_model_req.seq_group_metadata_list,
-                    execute_model_req.virtual_engine,
-                    execute_model_req.finished_requests_ids))
-            if execute_model_req.async_callback:
-                model_input = dataclasses.replace(
-                    model_input,
-                    async_callback=execute_model_req.async_callback)
-        else:
-            # on subsequent steps we reuse the worker input and model input
-            assert self.cached_model_input is not None
-            model_input = self.cached_model_input
-            worker_input = WorkerInput()
-        model_input = dataclasses.replace(
-            model_input,
-            is_first_multi_step=is_first_multi_step,
-            is_last_step=is_last_step)
-        if self.do_metadata_broadcast:
-            if is_first_multi_step:
-                broadcast_data = worker_input.as_broadcastable_tensor_dict()
-                broadcast_data.update(
-                    model_input.as_broadcastable_tensor_dict())
-                broadcast_tensor_dict(broadcast_data, src=0)
-            else:
-                broadcast_data = {
-                    "is_first_multi_step": is_first_multi_step,
-                    "is_last_step": is_last_step,
-                }
-                broadcast_tensor_dict(broadcast_data, src=0)
-        # Returning empty dict here to keep this compatible with
-        # `LocalOrDistributedWorkerBase._get_driver_input_and_broadcast`
-        return model_input, worker_input, {}
-    def prepare_input(
-        self,
-        execute_model_req: Optional[ExecuteModelRequest] = None,
-    ) -> Optional[Tuple[ModelInputForHPU, WorkerInput, Dict[str,
-                                                            torch.Tensor]]]:
-        if self.is_driver_worker:
-            if execute_model_req is None:
-                if self.do_metadata_broadcast:
-                    # This signals that there's no more requests to process for
-                    # now. All workers are running infinite loop with
-                    # broadcast_tensor_dict, and it stops the loop when the
-                    # driver broadcasts an empty input. Send an empty input to
-                    # notify all other workers to stop their execution loop.
-                    broadcast_tensor_dict({}, src=0)
-                return None
-            model_input, worker_input, _ = self._get_driver_input_and_broadcast(
-                execute_model_req)
-            if model_input.is_first_multi_step:
-                self.cached_model_input = model_input
-            return model_input, worker_input, {}
-        else:
-            broadcast_data = broadcast_tensor_dict(src=0)
-            if not broadcast_data:
-                return None
-            if len(broadcast_data) == 2:
-                assert self.cached_model_input is not None
-                self.cached_model_input = dataclasses.replace(
-                    self.cached_model_input,
-                    is_first_multi_step=broadcast_data["is_first_multi_step"],
-                    is_last_step=broadcast_data["is_last_step"])
-                empty_worker_input = WorkerInput()
-                return self.cached_model_input, empty_worker_input, {}
-            worker_input = WorkerInput.from_broadcasted_tensor_dict(
-                broadcast_data)
-            model_input = (
-                self.model_runner.
-                make_model_input_from_broadcasted_tensor_dict(broadcast_data))
-            self.cached_model_input = model_input
-            return model_input, worker_input, {}