Unverified Commit 4de71463 authored by Woosuk Kwon's avatar Woosuk Kwon Committed by GitHub
Browse files

[V0 deprecation] Remove V0 HPU backend (#21131)


Signed-off-by: default avatarWoosuk Kwon <woosuk.kwon@berkeley.edu>
parent ac9fb732
FROM vault.habana.ai/gaudi-docker/1.20.1/ubuntu22.04/habanalabs/pytorch-installer-2.6.0:latest
COPY ./ /workspace/vllm
WORKDIR /workspace/vllm
RUN pip install -v -r requirements/hpu.txt
ENV no_proxy=localhost,127.0.0.1
ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install
# install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils
WORKDIR /workspace/
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
# Common dependencies
-r common.txt
# Dependencies for HPU code
ray
triton==3.1.0
pandas
numpy==1.26.4
tabulate
setuptools>=77.0.3,<80.0.0
setuptools-scm>=8
vllm-hpu-extension @ git+https://github.com/HabanaAI/vllm-hpu-extension.git@f1f6624
...@@ -410,29 +410,6 @@ class repackage_wheel(build_ext): ...@@ -410,29 +410,6 @@ class repackage_wheel(build_ext):
package_data[package_name].append(file_name) package_data[package_name].append(file_name)
def _is_hpu() -> bool:
# if VLLM_TARGET_DEVICE env var was set explicitly, skip HPU autodetection
if os.getenv("VLLM_TARGET_DEVICE", None) == VLLM_TARGET_DEVICE:
return VLLM_TARGET_DEVICE == "hpu"
# if VLLM_TARGET_DEVICE was not set explicitly, check if hl-smi succeeds,
# and if it doesn't, check if habanalabs driver is loaded
is_hpu_available = False
try:
out = subprocess.run(["hl-smi"], capture_output=True, check=True)
is_hpu_available = out.returncode == 0
except (FileNotFoundError, PermissionError, subprocess.CalledProcessError):
if sys.platform.startswith("linux"):
try:
output = subprocess.check_output(
'lsmod | grep habanalabs | wc -l', shell=True)
is_hpu_available = int(output) > 0
except (ValueError, FileNotFoundError, PermissionError,
subprocess.CalledProcessError):
pass
return is_hpu_available
def _no_device() -> bool: def _no_device() -> bool:
return VLLM_TARGET_DEVICE == "empty" return VLLM_TARGET_DEVICE == "empty"
...@@ -440,7 +417,7 @@ def _no_device() -> bool: ...@@ -440,7 +417,7 @@ def _no_device() -> bool:
def _is_cuda() -> bool: def _is_cuda() -> bool:
has_cuda = torch.version.cuda is not None has_cuda = torch.version.cuda is not None
return (VLLM_TARGET_DEVICE == "cuda" and has_cuda return (VLLM_TARGET_DEVICE == "cuda" and has_cuda
and not (_is_neuron() or _is_tpu() or _is_hpu())) and not (_is_neuron() or _is_tpu()))
def _is_hip() -> bool: def _is_hip() -> bool:
...@@ -573,12 +550,6 @@ def get_vllm_version() -> str: ...@@ -573,12 +550,6 @@ def get_vllm_version() -> str:
if neuron_version != MAIN_CUDA_VERSION: if neuron_version != MAIN_CUDA_VERSION:
neuron_version_str = neuron_version.replace(".", "")[:3] neuron_version_str = neuron_version.replace(".", "")[:3]
version += f"{sep}neuron{neuron_version_str}" version += f"{sep}neuron{neuron_version_str}"
elif _is_hpu():
# Get the Intel Gaudi Software Suite version
gaudi_sw_version = str(get_gaudi_sw_version())
if gaudi_sw_version != MAIN_CUDA_VERSION:
gaudi_sw_version = gaudi_sw_version.replace(".", "")[:3]
version += f"{sep}gaudi{gaudi_sw_version}"
elif _is_tpu(): elif _is_tpu():
version += f"{sep}tpu" version += f"{sep}tpu"
elif _is_cpu(): elif _is_cpu():
...@@ -625,8 +596,6 @@ def get_requirements() -> list[str]: ...@@ -625,8 +596,6 @@ def get_requirements() -> list[str]:
requirements = _read_requirements("rocm.txt") requirements = _read_requirements("rocm.txt")
elif _is_neuron(): elif _is_neuron():
requirements = _read_requirements("neuron.txt") requirements = _read_requirements("neuron.txt")
elif _is_hpu():
requirements = _read_requirements("hpu.txt")
elif _is_tpu(): elif _is_tpu():
requirements = _read_requirements("tpu.txt") requirements = _read_requirements("tpu.txt")
elif _is_cpu(): elif _is_cpu():
...@@ -635,8 +604,7 @@ def get_requirements() -> list[str]: ...@@ -635,8 +604,7 @@ def get_requirements() -> list[str]:
requirements = _read_requirements("xpu.txt") requirements = _read_requirements("xpu.txt")
else: else:
raise ValueError( raise ValueError(
"Unsupported platform, please use CUDA, ROCm, Neuron, HPU, " "Unsupported platform, please use CUDA, ROCm, Neuron, or CPU.")
"or CPU.")
return requirements return requirements
......
...@@ -13,8 +13,7 @@ from vllm.scalar_type import ScalarType ...@@ -13,8 +13,7 @@ from vllm.scalar_type import ScalarType
logger = init_logger(__name__) logger = init_logger(__name__)
if not current_platform.is_tpu() and not current_platform.is_hpu()\ if not current_platform.is_tpu() and not current_platform.is_xpu():
and not current_platform.is_xpu():
try: try:
import vllm._C import vllm._C
except ImportError as e: except ImportError as e:
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
###############################################################################
# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company
###############################################################################
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple, Type
import torch
import vllm_hpu_extension.kernels as kernels
import vllm_hpu_extension.ops as ops
from vllm_hpu_extension.flags import enabled_flags
from vllm_hpu_extension.utils import Matmul, Softmax, VLLMKVCache
from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
AttentionLayer,
AttentionMetadata, AttentionType,
is_quantized_kv_cache)
from vllm.attention.backends.utils import CommonAttentionState
from vllm.attention.ops.hpu_paged_attn import (HPUPagedAttention,
HPUPagedAttentionMetadata)
from vllm.logger import init_logger
logger = init_logger(__name__)
class HPUAttentionBackend(AttentionBackend):
@staticmethod
def get_name() -> str:
return "HPU_ATTN"
@staticmethod
def get_impl_cls() -> Type["HPUAttentionImpl"]:
return HPUAttentionImpl
@staticmethod
def get_metadata_cls() -> Type["AttentionMetadata"]:
return HPUAttentionMetadata
@staticmethod
def get_state_cls() -> Type["CommonAttentionState"]:
return CommonAttentionState
@staticmethod
def get_kv_cache_shape(
num_blocks: int,
block_size: int,
num_kv_heads: int,
head_size: int,
) -> Tuple[int, ...]:
return HPUPagedAttention.get_kv_cache_shape(num_blocks, block_size,
num_kv_heads, head_size)
@staticmethod
def swap_blocks(
src_kv_cache: torch.Tensor,
dst_kv_cache: torch.Tensor,
src_to_dsts: torch.Tensor,
) -> None:
HPUPagedAttention.swap_blocks(src_kv_cache, dst_kv_cache, src_to_dsts)
@staticmethod
def copy_blocks(
kv_caches: List[torch.Tensor],
src_to_dsts: torch.Tensor,
) -> None:
HPUPagedAttention.copy_blocks(kv_caches, src_to_dsts)
@dataclass
class HPUAttentionMetadata(HPUPagedAttentionMetadata, AttentionMetadata):
"""Metadata for HPUAttentionbackend."""
# Currently, input sequences can only contain all prompts
# or all decoding. True if all sequences are prompts.
is_prompt: bool
attn_bias: Optional[torch.Tensor]
seq_lens_tensor: Optional[torch.Tensor]
context_lens_tensor: Optional[torch.Tensor]
class HPUAttentionImpl(AttentionImpl, torch.nn.Module):
"""
If the input tensors contain prompt tokens, the layout is as follows:
|<--------------- num_prefill_tokens ----------------->|
|<--prefill_0-->|<--prefill_1-->|...|<--prefill_N-1--->|
Otherwise, the layout is as follows:
|<----------------- num_decode_tokens ------------------>|
|<--decode_0-->|..........|<--decode_M-1-->|<--padding-->|
Generation tokens can contain padding when cuda-graph is used.
Currently, prompt tokens don't contain any padding.
The prompts might have different lengths, while the generation tokens
always have length 1.
"""
def __init__(
self,
num_heads: int,
head_size: int,
scale: float,
num_kv_heads: int,
alibi_slopes: Optional[List[float]],
sliding_window: Optional[int],
kv_cache_dtype: str,
blocksparse_params: Optional[Dict[str, Any]] = None,
max_seq_len: int = 4096,
attn_type: str = AttentionType.DECODER,
kv_sharing_target_layer_name: Optional[str] = None,
use_irope: bool = False,
) -> None:
super(AttentionImpl, self).__init__()
if kv_sharing_target_layer_name is not None:
raise NotImplementedError("KV sharing is not supported in V0 "
"HPU_ATTN backend.")
if use_irope:
logger.warning_once(
"Using irope in HPU is not supported yet, it will fall back "
"to global attention for long context.")
self.kv_cache_dtype = kv_cache_dtype
self.num_heads = num_heads
self.head_size = head_size
self.scale = float(scale)
self.matmul_qk = Matmul()
self.softmax = Softmax()
self.matmul_av = Matmul()
self.batch2block_matmul = Matmul()
self.block2batch_matmul = Matmul()
self.k_cache = VLLMKVCache()
self.v_cache = VLLMKVCache()
self.fused_scaled_dot_product_attention = kernels.fsdpa()
self.prefill_impl = 'naive'
if "flex_attention" in enabled_flags():
self.prefill_impl = 'flex'
if "fsdpa" in enabled_flags():
assert alibi_slopes is None, \
'Prefill with FusedSDPA not supported with alibi slopes!'
self.prefill_impl = 'fsdpa'
self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads
self.sliding_window = sliding_window
self.alibi_slopes = alibi_slopes
if alibi_slopes is not None:
alibi_slopes_tensor = torch.tensor(alibi_slopes,
dtype=torch.bfloat16)
self.alibi_slopes = alibi_slopes_tensor
self.num_queries_per_kv = self.num_heads // self.num_kv_heads
if self.prefill_impl == 'fsdpa':
assert alibi_slopes is None, \
'Prefill with FusedSDPA not supported with alibi slopes!'
supported_head_sizes = HPUPagedAttention.get_supported_head_sizes()
if head_size not in supported_head_sizes:
raise ValueError(
f"Head size {head_size} is not supported by PagedAttention. "
f"Supported head sizes are: {supported_head_sizes}.")
self.attn_type = attn_type
if self.attn_type != AttentionType.DECODER:
raise NotImplementedError("Encoder self-attention and "
"encoder/decoder cross-attention "
"are not implemented for "
"HPUAttentionImpl")
if is_quantized_kv_cache(self.kv_cache_dtype):
raise NotImplementedError(
"HPUAttention with FP8 KV cache not yet supported")
def forward(
self,
layer: AttentionLayer,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
kv_cache: torch.Tensor,
attn_metadata: HPUAttentionMetadata,
output: Optional[torch.Tensor] = None,
output_scale: Optional[torch.Tensor] = None,
) -> torch.Tensor:
"""Forward pass with xFormers and PagedAttention.
Args:
query: shape = [num_tokens, num_heads * head_size]
key: shape = [num_tokens, num_kv_heads * head_size]
value: shape = [num_tokens, num_kv_heads * head_size]
kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size]
attn_metadata: Metadata for attention.
Returns:
shape = [num_tokens, num_heads * head_size]
"""
if output_scale is not None:
raise NotImplementedError(
"fused output quantization is not yet supported"
" for HPUAttentionImpl")
batch_size, seq_len, hidden_size = query.shape
_, seq_len_kv, _ = key.shape
key = key.view(-1, self.num_kv_heads, self.head_size)
value = value.view(-1, self.num_kv_heads, self.head_size)
block_indices = attn_metadata.block_indices
block_offsets = attn_metadata.block_offsets
key_cache = None
value_cache = None
if attn_metadata.is_prompt and self.attn_type \
is not AttentionType.ENCODER_ONLY:
key = key.unflatten(0, (block_indices.size(0), -1))
value = value.unflatten(0, (block_indices.size(0), -1))
if kv_cache is not None and isinstance(kv_cache, tuple):
key_cache, value_cache = HPUPagedAttention.split_kv_cache(
kv_cache, self.num_kv_heads, self.head_size)
# Reshape the input keys and values and store them in the cache.
# If kv_cache is not provided, the new key and value tensors are
# not cached. This happens during the initial memory profiling run.
key_cache = self.k_cache(key, key_cache, block_indices,
block_offsets)
value_cache = self.v_cache(value, value_cache, block_indices,
block_offsets)
if attn_metadata.is_prompt:
# Prompt run.
query_shape = (batch_size, seq_len, self.num_heads, self.head_size)
kv_shape = (batch_size, seq_len_kv, self.num_kv_heads,
self.head_size)
attn_bias = attn_metadata.attn_bias
if attn_bias is not None and self.alibi_slopes is not None:
position_bias = _make_alibi_bias(self.alibi_slopes,
self.num_kv_heads,
attn_bias.dtype,
attn_bias.shape[-1])
attn_bias = attn_bias.tile((1, self.num_kv_heads, 1, 1))
attn_bias.add_(position_bias)
block_list = attn_metadata.block_list if attn_metadata \
and attn_metadata.block_list is not None else None
out = ops.prompt_attention(
impl=self.prefill_impl,
query=query.view(query_shape),
key=key.view(kv_shape),
value=value.view(kv_shape),
is_causal=True,
attn_bias=attn_bias,
valid_seq_lengths=attn_metadata.seq_lens_tensor,
**self.common_attention_args(block_list, key_cache,
value_cache))
output = out.reshape(batch_size, seq_len, hidden_size)
else:
# Decoding run.
output = HPUPagedAttention.forward_decode(
query=query,
block_mapping=attn_metadata.block_mapping,
block_bias=attn_metadata.attn_bias,
block_groups=attn_metadata.block_groups,
**self.common_attention_args(attn_metadata.block_list,
key_cache, value_cache))
# Reshape the output tensor.
return output.view(batch_size, seq_len, hidden_size)
def common_attention_args(self,
block_list=None,
key_cache=None,
value_cache=None):
fsdpa_op = self.fused_scaled_dot_product_attention.apply \
if self.fused_scaled_dot_product_attention is not None else None
return {
'scale': self.scale,
'matmul_qk_op': self.matmul_qk,
'matmul_av_op': self.matmul_av,
'batch2block_matmul_op': self.batch2block_matmul,
'block2batch_matmul_op': self.block2batch_matmul,
'fsdpa_op': fsdpa_op,
'keys_fetch_func': self.k_cache.fetch_from_cache,
'values_fetch_func': self.v_cache.fetch_from_cache,
'softmax_op': self.softmax,
'block_list': block_list,
'key_cache': key_cache,
'value_cache': value_cache,
}
def _make_alibi_bias(
alibi_slopes: torch.Tensor,
num_kv_heads: int,
dtype: torch.dtype,
seq_len: int,
) -> torch.Tensor:
bias = torch.arange(seq_len, dtype=dtype)
# NOTE(zhuohan): HF uses
# `bias = bias[None, :].repeat(seq_len, 1)`
# here. We find that both biases give the same results, but
# the bias below more accurately follows the original ALiBi
# paper.
# Calculate a matrix where each element represents ith element- jth
# element.
bias = bias[None, :] - bias[:, None]
padded_len = (seq_len + 7) // 8 * 8
num_heads = alibi_slopes.shape[0]
bias = torch.empty(
1, # batch size
num_heads,
seq_len,
padded_len,
device=alibi_slopes.device,
dtype=dtype,
)[:, :, :, :seq_len].copy_(bias)
bias.mul_(alibi_slopes[:, None, None])
if num_heads != num_kv_heads:
bias = bias.unflatten(1, (num_kv_heads, num_heads // num_kv_heads))
return bias
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
###############################################################################
# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company
###############################################################################
from dataclasses import dataclass
from typing import List, Optional, Tuple
import torch
from vllm_hpu_extension import cache_ops, ops
# Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`.
_PARTITION_SIZE = 512
@dataclass
class HPUPagedAttentionMetadata:
"""Metadata for PagedAttention."""
block_list: Optional[torch.Tensor]
block_mapping: Optional[torch.Tensor]
block_usage: Optional[torch.Tensor]
block_indices: Optional[torch.Tensor]
block_offsets: Optional[torch.Tensor]
block_groups: Optional[torch.Tensor]
class HPUPagedAttention:
@staticmethod
def get_supported_head_sizes() -> List[int]:
return [64, 80, 96, 112, 128, 256]
@staticmethod
def get_kv_cache_shape(
num_blocks: int,
block_size: int,
num_kv_heads: int,
head_size: int,
) -> Tuple[int, ...]:
return (num_blocks, block_size, num_kv_heads, head_size)
@staticmethod
def split_kv_cache(
kv_cache: torch.Tensor,
num_kv_heads: int,
head_size: int,
) -> Tuple[torch.Tensor, torch.Tensor]:
key_cache = kv_cache[0]
value_cache = kv_cache[1]
return key_cache, value_cache
@staticmethod
def write_to_paged_cache(key: torch.Tensor, value: torch.Tensor,
key_cache: torch.Tensor,
value_cache: torch.Tensor,
slot_mapping: torch.Tensor, kv_cache_dtype: str,
is_prompt: bool) -> None:
cache_ops.reshape_and_cache(key, value, key_cache, value_cache,
slot_mapping, kv_cache_dtype, is_prompt)
@staticmethod
def forward_decode(**kwargs) -> torch.Tensor:
return ops.flat_pa(**kwargs)
@staticmethod
def swap_blocks(
src_kv_cache: Tuple[torch.Tensor, torch.Tensor],
dst_kv_cache: Tuple[torch.Tensor, torch.Tensor],
src_to_dsts: torch.Tensor,
) -> None:
src_key_cache = src_kv_cache[0]
dst_key_cache = dst_kv_cache[0]
cache_ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dsts)
src_value_cache = src_kv_cache[1]
dst_value_cache = dst_kv_cache[1]
cache_ops.swap_blocks(src_value_cache, dst_value_cache, src_to_dsts)
@staticmethod
def copy_blocks(
kv_caches: List[Tuple[torch.Tensor, torch.Tensor]],
src_to_dsts: torch.Tensor,
) -> None:
key_caches = [kv_cache[0] for kv_cache in kv_caches]
value_caches = [kv_cache[1] for kv_cache in kv_caches]
cache_ops.copy_blocks(key_caches, value_caches, src_to_dsts)
...@@ -2452,7 +2452,7 @@ class SchedulerConfig: ...@@ -2452,7 +2452,7 @@ class SchedulerConfig:
return self.num_scheduler_steps > 1 return self.num_scheduler_steps > 1
Device = Literal["auto", "cuda", "neuron", "cpu", "tpu", "xpu", "hpu"] Device = Literal["auto", "cuda", "neuron", "cpu", "tpu", "xpu"]
@config @config
......
...@@ -7,7 +7,6 @@ from vllm.core.block.interfaces import (Block, BlockAllocator, BlockId, ...@@ -7,7 +7,6 @@ from vllm.core.block.interfaces import (Block, BlockAllocator, BlockId,
DeviceAwareBlockAllocator) DeviceAwareBlockAllocator)
from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator
from vllm.core.block.prefix_caching_block import PrefixCachingBlockAllocator from vllm.core.block.prefix_caching_block import PrefixCachingBlockAllocator
from vllm.platforms import current_platform
from vllm.utils import Device from vllm.utils import Device
...@@ -56,8 +55,7 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator): ...@@ -56,8 +55,7 @@ class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
- The block IDs are assigned contiguously, with GPU block IDs coming - The block IDs are assigned contiguously, with GPU block IDs coming
before CPU block IDs. before CPU block IDs.
""" """
# For HPU, block id 0 is used only for padding reserved_blocks = 0
reserved_blocks = 1 if current_platform.is_hpu() else 0
block_ids = list( block_ids = list(
range(reserved_blocks, num_gpu_blocks + num_cpu_blocks)) range(reserved_blocks, num_gpu_blocks + num_cpu_blocks))
num_gpu_blocks -= reserved_blocks num_gpu_blocks -= reserved_blocks
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch
import torch.distributed as dist
from vllm.platforms import current_platform
from .base_device_communicator import DeviceCommunicatorBase
if current_platform.is_hpu():
import habana_frameworks.torch as htorch # noqa: F401
class HpuCommunicator(DeviceCommunicatorBase):
def all_reduce(self, input_: torch.Tensor) -> torch.Tensor:
# FIXME(kzawora): this is a workaround for a bug in Habana PT bridge
# occurring when PT_HPU_ENABLE_LAZY_COLLECTIVES=true env var is used
# (which is required for tensor parallel HPUGraph inference)
htorch.core.mark_step()
dist.all_reduce(input_, group=self.device_group)
return input_
def all_gather(self, input_: torch.Tensor, dim: int = -1) -> torch.Tensor:
world_size = self.world_size
if dim < 0:
# Convert negative dim to positive.
dim += input_.dim()
input_size = input_.size()
# Allocate output tensor.
output_tensor = torch.empty((world_size, ) + input_size,
dtype=input_.dtype,
device=input_.device)
# All-gather.
htorch.core.mark_step()
dist.all_gather_into_tensor(output_tensor,
input_,
group=self.device_group)
# Reshape
output_tensor = output_tensor.movedim(0, dim)
output_tensor = output_tensor.reshape(input_size[:dim] +
(world_size *
input_size[dim], ) +
input_size[dim + 1:])
return output_tensor
...@@ -1365,9 +1365,8 @@ class EngineArgs: ...@@ -1365,9 +1365,8 @@ class EngineArgs:
supported = False supported = False
if current_platform.is_rocm() or ( if current_platform.is_rocm() or (
current_platform.is_cuda() current_platform.is_cuda()
and current_platform.is_device_capability(100)) or ( and current_platform.is_device_capability(100)
current_platform.device_name ): # handle hpu also for OOT platform
== "hpu"): # handle hpu also for OOT platform
supported = True supported = True
elif fp8_attention and will_use_fa: elif fp8_attention and will_use_fa:
from vllm.attention.utils.fa_utils import ( from vllm.attention.utils.fa_utils import (
......
...@@ -106,8 +106,6 @@ if TYPE_CHECKING: ...@@ -106,8 +106,6 @@ if TYPE_CHECKING:
VLLM_RAY_PER_WORKER_GPUS: float = 1.0 VLLM_RAY_PER_WORKER_GPUS: float = 1.0
VLLM_RAY_BUNDLE_INDICES: str = "" VLLM_RAY_BUNDLE_INDICES: str = ""
VLLM_CUDART_SO_PATH: Optional[str] = None VLLM_CUDART_SO_PATH: Optional[str] = None
VLLM_USE_HPU_CONTIGUOUS_CACHE_FETCH: bool = True
VLLM_HPU_USE_DELAYED_SAMPLING: bool = False
VLLM_DP_RANK: int = 0 VLLM_DP_RANK: int = 0
VLLM_DP_RANK_LOCAL: int = -1 VLLM_DP_RANK_LOCAL: int = -1
VLLM_DP_SIZE: int = 1 VLLM_DP_SIZE: int = 1
...@@ -780,19 +778,6 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -780,19 +778,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_CUDART_SO_PATH": "VLLM_CUDART_SO_PATH":
lambda: os.getenv("VLLM_CUDART_SO_PATH", None), lambda: os.getenv("VLLM_CUDART_SO_PATH", None),
# Contiguous cache fetching to avoid using costly gather operation on
# Gaudi3. This is only applicable to HPU contiguous cache. If set to true,
# contiguous cache fetch will be used.
"VLLM_USE_HPU_CONTIGUOUS_CACHE_FETCH":
lambda: os.environ.get("VLLM_CONTIGUOUS_PA", "true").lower() in
("1", "true"),
# Use delayed sampling for HPU to reduce host cpu overhead
# between each step.
"VLLM_HPU_USE_DELAYED_SAMPLING":
lambda: os.environ.get("VLLM_DELAYED_SAMPLING", "false").lower() in
("1", "true"),
# Rank of the process in the data parallel setting # Rank of the process in the data parallel setting
"VLLM_DP_RANK": "VLLM_DP_RANK":
lambda: int(os.getenv("VLLM_DP_RANK", "0")), lambda: int(os.getenv("VLLM_DP_RANK", "0")),
......
...@@ -1164,10 +1164,6 @@ class LogitsProcessorWithLoRA(BaseLayerWithLoRA): ...@@ -1164,10 +1164,6 @@ class LogitsProcessorWithLoRA(BaseLayerWithLoRA):
posinf=pos_inf, posinf=pos_inf,
neginf=neg_inf)) neginf=neg_inf))
# HPU needs special handling to prune out dummy samples.
if current_platform.is_hpu():
lora_logits = lora_logits[:logits.shape[0], :]
logits[:, logits[:,
self.base_layer.org_vocab_size:self.base_layer.org_vocab_size + self.base_layer.org_vocab_size:self.base_layer.org_vocab_size +
lora_logits.shape[1]] = lora_logits lora_logits.shape[1]] = lora_logits
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import TYPE_CHECKING, Optional, Union, final
import torch
from vllm_hpu_extension.ops import (dispatch_bgmv_embedding,
dispatch_bgmv_linear)
from .punica_base import PunicaWrapperBase
from .utils import convert_mapping
if TYPE_CHECKING:
# avoid circuit import
from vllm.lora.layers import LoRAMapping
from vllm.lora.models import LongContextLoRAContext
@final
class PunicaWrapperHPU(PunicaWrapperBase):
def __init__(self, max_num_batched_tokens: int, max_batches: int,
device: Union[torch.device, str], **kwargs):
# Increasing max_num_batched_tokens by 3x to handle increase in
# tensor size due to padding.
PunicaWrapperBase.__init__(self, 3 * max_num_batched_tokens,
max_batches, device)
def _update_base_metadata(
self,
mapping: "LoRAMapping",
lora_index_to_id: list[Optional[int]],
max_loras: int,
vocab_size: int,
extra_vocab_size: int,
long_lora_context: Optional["LongContextLoRAContext"] = None,
):
(
base_indices,
sampler_indices,
sampler_indices_padded,
embeddings_indices,
long_lora_offsets_tensor,
indices_len,
) = convert_mapping(mapping, lora_index_to_id, max_loras, vocab_size,
extra_vocab_size, self.device, None)
# Updating each element in `long_lora_offsets` with `lora_offset` slows
# down perf in HPU due to a series of `strided_insert` ops during lazy
# graph accumulation. Hence HPU appends `lora_offset` to a list and
# converts it to a tensor only after it is ready.
if long_lora_context:
index_mapping_indices: list[int] = list(
mapping.index_mapping).copy()
long_lora_offsets: list[int] = []
for i in range(len(index_mapping_indices)):
lora_offset: int = long_lora_context.offsets_by_lora_id.get(
index_mapping_indices[i], 0)
long_lora_offsets.append(lora_offset)
long_lora_offsets_tensor = torch.tensor(long_lora_offsets,
device=self.device,
dtype=torch.long)
indices_len[-1] = long_lora_offsets_tensor.shape[-1]
self._token_lora_indices[:base_indices.shape[0]].copy_(base_indices)
self._sampler_indices[:sampler_indices.shape[0]].copy_(sampler_indices)
self._sampler_indices_padded[:sampler_indices_padded.shape[0]].copy_(
sampler_indices_padded)
self._embeddings_indices[:embeddings_indices.
shape[0], :embeddings_indices.shape[1]].copy_(
embeddings_indices)
if long_lora_offsets_tensor is not None:
self._long_lora_indices[:long_lora_offsets_tensor.shape[0]].copy_(
long_lora_offsets_tensor)
else:
self._long_lora_indices.zero_()
self.indices_len[:] = indices_len
def add_lora_embedding(self,
y: torch.Tensor,
x: torch.Tensor,
lora_b_stacked: torch.Tensor,
add_inputs: bool = True,
**kwargs) -> None:
dispatch_bgmv_embedding(y, x, lora_b_stacked, 0)
def add_lora_linear(self,
y: torch.Tensor,
x: torch.Tensor,
lora_a_stacked: tuple[torch.Tensor, ...],
lora_b_stacked: tuple[torch.Tensor, ...],
lora_bias_stacked: Optional[tuple[torch.Tensor, ...]],
scale: float,
output_slices: tuple[int, ...],
*,
buffer: Optional[tuple[torch.Tensor, ...]] = None,
**kwargs) -> None:
y_org = y
x = x.view(-1, x.shape[-1])
y = y.view(-1, y.shape[-1])
offset_left = 0
for slice_idx in range(len(output_slices)):
dispatch_bgmv_linear(
y[:, offset_left:offset_left + output_slices[slice_idx]], x,
lora_a_stacked[slice_idx], lora_b_stacked[slice_idx], 0, scale)
offset_left += output_slices[slice_idx]
y = y.view_as(y_org)
def add_lora_logits(self,
y: torch.Tensor,
x: torch.Tensor,
lora_a_stacked: torch.Tensor,
lora_b_stacked: torch.Tensor,
scale,
*,
buffer: Optional[torch.Tensor] = None,
**kwargs) -> None:
y_org = y
y = y.view(-1, y.shape[-1])
x = x.view(-1, x.shape[-1])
dispatch_bgmv_linear(y, x, lora_a_stacked, lora_b_stacked, 0, scale)
y = y.view_as(y_org)
def add_shrink(
self,
y: Union[tuple[torch.Tensor, ...], torch.Tensor],
x: torch.Tensor,
lora_a_stacked: tuple[torch.Tensor, ...],
scale: float,
**kwargs,
) -> None:
raise NotImplementedError
def add_expand(
self,
y: torch.Tensor,
x: Union[tuple[torch.Tensor, ...], torch.Tensor],
lora_b_stacked: tuple[torch.Tensor, ...],
lora_bias_stacked: Optional[tuple[torch.Tensor, ...]],
output_slices: tuple[int, ...],
offset_start: int = 0,
add_inputs=True,
**kwargs,
) -> None:
raise NotImplementedError
...@@ -73,11 +73,6 @@ class CustomOp(nn.Module): ...@@ -73,11 +73,6 @@ class CustomOp(nn.Module):
# NOTE(woosuk): This is a placeholder for future extensions. # NOTE(woosuk): This is a placeholder for future extensions.
return self.forward_native(*args, **kwargs) return self.forward_native(*args, **kwargs)
def forward_hpu(self, *args, **kwargs):
# By default, we assume that Gaudi ops are compatible with the
# PyTorch-native implementation.
return self.forward_native(*args, **kwargs)
def forward_neuron(self, *args, **kwargs): def forward_neuron(self, *args, **kwargs):
# By default, we assume that Neuron ops are compatible with the # By default, we assume that Neuron ops are compatible with the
# PyTorch-native implementation. # PyTorch-native implementation.
...@@ -106,8 +101,6 @@ class CustomOp(nn.Module): ...@@ -106,8 +101,6 @@ class CustomOp(nn.Module):
return self.forward_hip return self.forward_hip
elif current_platform.is_cpu(): elif current_platform.is_cpu():
return self.forward_cpu return self.forward_cpu
elif current_platform.is_hpu():
return self.forward_hpu
elif current_platform.is_tpu(): elif current_platform.is_tpu():
return self.forward_tpu return self.forward_tpu
elif current_platform.is_xpu(): elif current_platform.is_xpu():
......
...@@ -475,39 +475,6 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp): ...@@ -475,39 +475,6 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
activation, activation,
) )
def forward_hpu(
self,
layer: torch.nn.Module,
x: torch.Tensor,
use_grouped_topk: bool,
top_k: int,
router_logits: torch.Tensor,
renormalize: bool,
topk_group: Optional[int] = None,
num_expert_group: Optional[int] = None,
global_num_experts: int = -1,
expert_map: Optional[torch.Tensor] = None,
custom_routing_function: Optional[Callable] = None,
scoring_func: str = "softmax",
e_score_correction_bias: Optional[torch.Tensor] = None,
apply_router_weight_on_input: bool = False,
activation: str = "silu",
) -> torch.Tensor:
assert not use_grouped_topk
assert num_expert_group is None
assert topk_group is None
assert custom_routing_function is None
assert layer is not None
assert apply_router_weight_on_input is False
if scoring_func != "softmax":
raise NotImplementedError(
"Only softmax scoring function is supported for HPU.")
if e_score_correction_bias is not None:
raise NotImplementedError(
"Expert score correction bias is not supported for HPU.")
return layer.hpu_fused_moe(x, layer.w13_weight, layer.w2_weight,
router_logits, top_k)
def forward_tpu( def forward_tpu(
self, self,
layer: torch.nn.Module, layer: torch.nn.Module,
...@@ -716,9 +683,6 @@ class FusedMoE(torch.nn.Module): ...@@ -716,9 +683,6 @@ class FusedMoE(torch.nn.Module):
if self.scoring_func != "softmax" and not self.use_grouped_topk: if self.scoring_func != "softmax" and not self.use_grouped_topk:
raise ValueError("Only softmax scoring function is supported for " raise ValueError("Only softmax scoring function is supported for "
"non-grouped topk.") "non-grouped topk.")
if current_platform.is_hpu():
from vllm_hpu_extension.ops import DynamicFusedMOE
self.hpu_fused_moe = DynamicFusedMOE(self.global_num_experts)
if vllm_config.model_config is not None: if vllm_config.model_config is not None:
model_dtype = vllm_config.model_config.dtype model_dtype = vllm_config.model_config.dtype
......
...@@ -170,26 +170,6 @@ class RMSNorm(CustomOp): ...@@ -170,26 +170,6 @@ class RMSNorm(CustomOp):
else: else:
return norm_func(x, self.weight.data, self.variance_epsilon) return norm_func(x, self.weight.data, self.variance_epsilon)
def forward_hpu(
self,
x: torch.Tensor,
residual: Optional[torch.Tensor] = None,
) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
from vllm_hpu_extension.kernels import rms_norm
HPUFusedRMSNorm = rms_norm()
if HPUFusedRMSNorm is None:
return self.forward_native(x, residual)
if residual is not None:
orig_shape = x.shape
residual += x.view(residual.shape)
# Note: HPUFusedRMSNorm requires 3D tensors as inputs
x = HPUFusedRMSNorm.apply(residual, self.weight,
self.variance_epsilon)
return x.view(orig_shape), residual
x = HPUFusedRMSNorm.apply(x, self.weight, self.variance_epsilon)
return x
def forward_xpu( def forward_xpu(
self, self,
x: torch.Tensor, x: torch.Tensor,
......
...@@ -229,64 +229,6 @@ class RotaryEmbedding(CustomOp): ...@@ -229,64 +229,6 @@ class RotaryEmbedding(CustomOp):
self.cos_sin_cache, self.is_neox_style) self.cos_sin_cache, self.is_neox_style)
return query, key return query, key
def forward_hpu(
self,
positions: torch.Tensor,
query: torch.Tensor,
key: Optional[torch.Tensor] = None,
offsets: Optional[torch.Tensor] = None,
) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
from habana_frameworks.torch.hpex.kernels import (
RotaryPosEmbeddingMode, apply_rotary_pos_emb)
if offsets is not None:
offsets = offsets.view(positions.shape[0], -1)
positions = positions + offsets
positions = positions.flatten()
num_tokens = positions.shape[0]
cos_sin = self.cos_sin_cache.index_select(0, positions).view(
num_tokens, 1, -1)
cos, sin = cos_sin.chunk(2, dim=-1)
# HPU RoPE kernel requires hidden dimension for cos and sin to be equal
# to query hidden dimension, so the original tensors need to be
# expanded
# GPT-NeoX kernel requires position_ids = None, offset, mode = BLOCKWISE
# and expansion of cos/sin tensors via concatenation
# GPT-J kernel requires position_ids = None, offset = 0, mode = PAIRWISE
# and expansion of cos/sin tensors via repeat_interleave
rope_mode: RotaryPosEmbeddingMode
if self.is_neox_style:
rope_mode = RotaryPosEmbeddingMode.BLOCKWISE
cos = torch.cat((cos, cos), dim=-1)
sin = torch.cat((sin, sin), dim=-1)
else:
rope_mode = RotaryPosEmbeddingMode.PAIRWISE
sin = torch.repeat_interleave(sin,
2,
dim=-1,
output_size=cos_sin.shape[-1])
cos = torch.repeat_interleave(cos,
2,
dim=-1,
output_size=cos_sin.shape[-1])
query_shape = query.shape
query = query.view(num_tokens, -1, self.head_size)
query_rot = query[..., :self.rotary_dim]
query_pass = query[..., self.rotary_dim:]
query_rot = apply_rotary_pos_emb(query_rot, cos, sin, None, 0,
rope_mode)
query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape)
if key is not None:
key_shape = key.shape
key = key.view(num_tokens, -1, self.head_size)
key_rot = key[..., :self.rotary_dim]
key_pass = key[..., self.rotary_dim:]
key_rot = apply_rotary_pos_emb(key_rot, cos, sin, None, 0,
rope_mode)
key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape)
return query, key
def forward_neuron( def forward_neuron(
self, self,
positions: torch.Tensor, positions: torch.Tensor,
......
...@@ -388,18 +388,6 @@ class VocabParallelEmbedding(torch.nn.Module): ...@@ -388,18 +388,6 @@ class VocabParallelEmbedding(torch.nn.Module):
# Copy the data. Select chunk corresponding to current shard. # Copy the data. Select chunk corresponding to current shard.
loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
if current_platform.is_hpu():
# FIXME(kzawora): Weight copy with slicing bugs out on Gaudi here,
# so we're using a workaround. Remove this when fixed in
# HPU PT bridge.
padded_weight = torch.cat([
loaded_weight,
torch.zeros(param.shape[0] - loaded_weight.shape[0],
*loaded_weight.shape[1:])
])
param.data.copy_(padded_weight)
else:
param[:loaded_weight.shape[0]].data.copy_(loaded_weight) param[:loaded_weight.shape[0]].data.copy_(loaded_weight)
param[loaded_weight.shape[0]:].data.fill_(0) param[loaded_weight.shape[0]:].data.fill_(0)
......
...@@ -199,10 +199,6 @@ class BitsAndBytesModelLoader(BaseModelLoader): ...@@ -199,10 +199,6 @@ class BitsAndBytesModelLoader(BaseModelLoader):
if self.pre_quant: if self.pre_quant:
if self.load_8bit: if self.load_8bit:
if current_platform.is_hpu():
raise ValueError(
"currently hpu supports 4bit quantization only")
return self._quantized_8bit_generator( return self._quantized_8bit_generator(
hf_weights_files, use_safetensors, hf_weights_files, use_safetensors,
quant_state_dict), quant_state_dict quant_state_dict), quant_state_dict
...@@ -306,10 +302,6 @@ class BitsAndBytesModelLoader(BaseModelLoader): ...@@ -306,10 +302,6 @@ class BitsAndBytesModelLoader(BaseModelLoader):
in temp_state_dict): in temp_state_dict):
quant_state = _parse_quant_state(mapped_weight_name, quant_state = _parse_quant_state(mapped_weight_name,
temp_state_dict) temp_state_dict)
if current_platform.is_hpu():
assert quant_state.quant_type == "nf4", (
"currently hpu supports nf4 quant_type only")
quant_state_dict[mapped_weight_name] = quant_state quant_state_dict[mapped_weight_name] = quant_state
yield org_weight_name, weight_tensor yield org_weight_name, weight_tensor
else: else:
...@@ -380,8 +372,7 @@ class BitsAndBytesModelLoader(BaseModelLoader): ...@@ -380,8 +372,7 @@ class BitsAndBytesModelLoader(BaseModelLoader):
...] ...]
# bitsandbytes requires data in GPU # bitsandbytes requires data in GPU
if (weight_sub_tensor.is_cuda if weight_sub_tensor.is_cuda:
or weight_sub_tensor.device.type == "hpu"):
loaded_weight = weight_sub_tensor loaded_weight = weight_sub_tensor
else: else:
loaded_weight = weight_sub_tensor.to( loaded_weight = weight_sub_tensor.to(
......
...@@ -218,16 +218,6 @@ class DefaultModelLoader(BaseModelLoader): ...@@ -218,16 +218,6 @@ class DefaultModelLoader(BaseModelLoader):
weights_iterator = _xla_weights_iterator(weights_iterator) weights_iterator = _xla_weights_iterator(weights_iterator)
elif current_platform.is_hpu():
import habana_frameworks.torch.core as htcore
def _hpu_weights_iterator(iterator: Generator):
for weights in iterator:
yield weights
htcore.mark_step()
weights_iterator = _hpu_weights_iterator(weights_iterator)
if self.counter_before_loading_weights == 0.0: if self.counter_before_loading_weights == 0.0:
self.counter_before_loading_weights = time.perf_counter() self.counter_before_loading_weights = time.perf_counter()
# Apply the prefix. # Apply the prefix.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment