Unverified Commit 10138c92 authored by wangxiyuan, committed by GitHub
Browse files

[V0 deprecation] Deprecate use_v1 parameter (#28112)


Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
parent a9d18b51
...@@ -27,7 +27,6 @@ class DummyPlatform(Platform): ...@@ -27,7 +27,6 @@ class DummyPlatform(Platform):
dtype, dtype,
kv_cache_dtype, kv_cache_dtype,
block_size, block_size,
use_v1,
use_mla, use_mla,
has_sink, has_sink,
use_sparse, use_sparse,
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import inspect
import os import os
from collections.abc import Generator from collections.abc import Generator
from contextlib import contextmanager from contextlib import contextmanager
...@@ -141,17 +142,35 @@ def _cached_get_attn_backend( ...@@ -141,17 +142,35 @@ def _cached_get_attn_backend(
# get device-specific attn_backend # get device-specific attn_backend
from vllm.platforms import current_platform from vllm.platforms import current_platform
attention_cls = current_platform.get_attn_backend_cls( sig = inspect.signature(current_platform.get_attn_backend_cls)
selected_backend, if "use_v1" in sig.parameters:
head_size, logger.warning_once(
dtype, "use_v1 parameter for get_attn_backend_cls is deprecated and will "
kv_cache_dtype, "be removed in v0.13.0 or v1.0.0, whichever is soonest. Please "
block_size, "remove it from your plugin code."
True, )
use_mla, attention_cls = current_platform.get_attn_backend_cls(
has_sink, selected_backend,
use_sparse, head_size,
) dtype,
kv_cache_dtype,
block_size,
True, # use_v1
use_mla,
has_sink,
use_sparse,
)
else:
attention_cls = current_platform.get_attn_backend_cls(
selected_backend,
head_size,
dtype,
kv_cache_dtype,
block_size,
use_mla,
has_sink,
use_sparse,
)
if not attention_cls: if not attention_cls:
raise ValueError( raise ValueError(
f"Invalid attention backend for {current_platform.device_name}" f"Invalid attention backend for {current_platform.device_name}"
......
...@@ -131,7 +131,6 @@ class CpuPlatform(Platform): ...@@ -131,7 +131,6 @@ class CpuPlatform(Platform):
dtype: torch.dtype, dtype: torch.dtype,
kv_cache_dtype: str | None, kv_cache_dtype: str | None,
block_size: int, block_size: int,
use_v1: bool,
use_mla: bool, use_mla: bool,
has_sink: bool, has_sink: bool,
use_sparse: bool, use_sparse: bool,
...@@ -144,8 +143,6 @@ class CpuPlatform(Platform): ...@@ -144,8 +143,6 @@ class CpuPlatform(Platform):
raise NotImplementedError("MLA is not supported on CPU.") raise NotImplementedError("MLA is not supported on CPU.")
if use_sparse: if use_sparse:
raise NotImplementedError("Sparse Attention is not supported on CPU.") raise NotImplementedError("Sparse Attention is not supported on CPU.")
if not use_v1:
raise ValueError("CPU backend only supports V1.")
return AttentionBackendEnum.CPU_ATTN.get_path() return AttentionBackendEnum.CPU_ATTN.get_path()
@classmethod @classmethod
......
...@@ -336,17 +336,10 @@ class CudaPlatformBase(Platform): ...@@ -336,17 +336,10 @@ class CudaPlatformBase(Platform):
dtype: torch.dtype, dtype: torch.dtype,
kv_cache_dtype: "CacheDType | None", kv_cache_dtype: "CacheDType | None",
block_size: int | None, block_size: int | None,
use_v1: bool,
use_mla: bool, use_mla: bool,
has_sink: bool, has_sink: bool,
use_sparse: bool, use_sparse: bool,
) -> str: ) -> str:
if not use_v1:
raise RuntimeError(
"V0 attention backends have been removed. Set VLLM_USE_V1=1 "
"to select a supported backend."
)
device_capability = cls.get_device_capability() device_capability = cls.get_device_capability()
assert device_capability is not None assert device_capability is not None
......
...@@ -215,7 +215,6 @@ class Platform: ...@@ -215,7 +215,6 @@ class Platform:
dtype: torch.dtype, dtype: torch.dtype,
kv_cache_dtype: "CacheDType | None", kv_cache_dtype: "CacheDType | None",
block_size: int, block_size: int,
use_v1: bool,
use_mla: bool, use_mla: bool,
has_sink: bool, has_sink: bool,
use_sparse: bool, use_sparse: bool,
......
...@@ -213,7 +213,6 @@ class RocmPlatform(Platform): ...@@ -213,7 +213,6 @@ class RocmPlatform(Platform):
dtype, dtype,
kv_cache_dtype, kv_cache_dtype,
block_size, block_size,
use_v1,
use_mla, use_mla,
has_sink, has_sink,
use_sparse, use_sparse,
...@@ -224,12 +223,6 @@ class RocmPlatform(Platform): ...@@ -224,12 +223,6 @@ class RocmPlatform(Platform):
if use_sparse: if use_sparse:
raise NotImplementedError("Sparse Attention is not supported on ROCm.") raise NotImplementedError("Sparse Attention is not supported on ROCm.")
if not use_v1:
raise RuntimeError(
"V0 attention backends have been removed. Set VLLM_USE_V1=1 "
"to select a supported backend."
)
if use_mla: if use_mla:
if selected_backend is None: if selected_backend is None:
selected_backend = ( selected_backend = (
......
...@@ -58,7 +58,6 @@ class TpuPlatform(Platform): ...@@ -58,7 +58,6 @@ class TpuPlatform(Platform):
dtype: torch.dtype, dtype: torch.dtype,
kv_cache_dtype: str | None, kv_cache_dtype: str | None,
block_size: int, block_size: int,
use_v1: bool,
use_mla: bool, use_mla: bool,
has_sink, has_sink,
use_sparse, use_sparse,
...@@ -70,8 +69,6 @@ class TpuPlatform(Platform): ...@@ -70,8 +69,6 @@ class TpuPlatform(Platform):
if selected_backend != AttentionBackendEnum.PALLAS: if selected_backend != AttentionBackendEnum.PALLAS:
logger.info("Cannot use %s backend on TPU.", selected_backend) logger.info("Cannot use %s backend on TPU.", selected_backend)
if not use_v1:
raise ValueError("TPU backend only supports V1.")
logger.info("Using Pallas V1 backend.") logger.info("Using Pallas V1 backend.")
return AttentionBackendEnum.PALLAS.get_path() return AttentionBackendEnum.PALLAS.get_path()
......
...@@ -48,7 +48,6 @@ class XPUPlatform(Platform): ...@@ -48,7 +48,6 @@ class XPUPlatform(Platform):
dtype: torch.dtype, dtype: torch.dtype,
kv_cache_dtype: str | None, kv_cache_dtype: str | None,
block_size: int, block_size: int,
use_v1: bool,
use_mla: bool, use_mla: bool,
has_sink: bool, has_sink: bool,
use_sparse, use_sparse,
...@@ -76,7 +75,7 @@ class XPUPlatform(Platform): ...@@ -76,7 +75,7 @@ class XPUPlatform(Platform):
elif selected_backend: elif selected_backend:
raise ValueError( raise ValueError(
f"Invalid attention backend for {cls.device_name}, " f"Invalid attention backend for {cls.device_name}, "
f"with use_v1: {use_v1} use_mla: {use_mla}" f"with use_mla: {use_mla}"
) )
logger.info("Using Flash Attention backend.") logger.info("Using Flash Attention backend.")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment