Commit 10138c92 (unverified), authored by wangxiyuan, committed by GitHub
Browse files

[V0 deprecation] Deprecate use_v1 parameter (#28112)


Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
parent a9d18b51
......@@ -27,7 +27,6 @@ class DummyPlatform(Platform):
dtype,
kv_cache_dtype,
block_size,
use_v1,
use_mla,
has_sink,
use_sparse,
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import inspect
import os
from collections.abc import Generator
from contextlib import contextmanager
......@@ -141,13 +142,31 @@ def _cached_get_attn_backend(
# get device-specific attn_backend
from vllm.platforms import current_platform
sig = inspect.signature(current_platform.get_attn_backend_cls)
if "use_v1" in sig.parameters:
logger.warning_once(
"use_v1 parameter for get_attn_backend_cls is deprecated and will "
"be removed in v0.13.0 or v1.0.0, whichever is soonest. Please "
"remove it from your plugin code."
)
attention_cls = current_platform.get_attn_backend_cls(
selected_backend,
head_size,
dtype,
kv_cache_dtype,
block_size,
True, # use_v1
use_mla,
has_sink,
use_sparse,
)
else:
attention_cls = current_platform.get_attn_backend_cls(
selected_backend,
head_size,
dtype,
kv_cache_dtype,
block_size,
True,
use_mla,
has_sink,
use_sparse,
......
......@@ -131,7 +131,6 @@ class CpuPlatform(Platform):
dtype: torch.dtype,
kv_cache_dtype: str | None,
block_size: int,
use_v1: bool,
use_mla: bool,
has_sink: bool,
use_sparse: bool,
......@@ -144,8 +143,6 @@ class CpuPlatform(Platform):
raise NotImplementedError("MLA is not supported on CPU.")
if use_sparse:
raise NotImplementedError("Sparse Attention is not supported on CPU.")
if not use_v1:
raise ValueError("CPU backend only supports V1.")
return AttentionBackendEnum.CPU_ATTN.get_path()
@classmethod
......
......@@ -336,17 +336,10 @@ class CudaPlatformBase(Platform):
dtype: torch.dtype,
kv_cache_dtype: "CacheDType | None",
block_size: int | None,
use_v1: bool,
use_mla: bool,
has_sink: bool,
use_sparse: bool,
) -> str:
if not use_v1:
raise RuntimeError(
"V0 attention backends have been removed. Set VLLM_USE_V1=1 "
"to select a supported backend."
)
device_capability = cls.get_device_capability()
assert device_capability is not None
......
......@@ -215,7 +215,6 @@ class Platform:
dtype: torch.dtype,
kv_cache_dtype: "CacheDType | None",
block_size: int,
use_v1: bool,
use_mla: bool,
has_sink: bool,
use_sparse: bool,
......
......@@ -213,7 +213,6 @@ class RocmPlatform(Platform):
dtype,
kv_cache_dtype,
block_size,
use_v1,
use_mla,
has_sink,
use_sparse,
......@@ -224,12 +223,6 @@ class RocmPlatform(Platform):
if use_sparse:
raise NotImplementedError("Sparse Attention is not supported on ROCm.")
if not use_v1:
raise RuntimeError(
"V0 attention backends have been removed. Set VLLM_USE_V1=1 "
"to select a supported backend."
)
if use_mla:
if selected_backend is None:
selected_backend = (
......
......@@ -58,7 +58,6 @@ class TpuPlatform(Platform):
dtype: torch.dtype,
kv_cache_dtype: str | None,
block_size: int,
use_v1: bool,
use_mla: bool,
has_sink,
use_sparse,
......@@ -70,8 +69,6 @@ class TpuPlatform(Platform):
if selected_backend != AttentionBackendEnum.PALLAS:
logger.info("Cannot use %s backend on TPU.", selected_backend)
if not use_v1:
raise ValueError("TPU backend only supports V1.")
logger.info("Using Pallas V1 backend.")
return AttentionBackendEnum.PALLAS.get_path()
......
......@@ -48,7 +48,6 @@ class XPUPlatform(Platform):
dtype: torch.dtype,
kv_cache_dtype: str | None,
block_size: int,
use_v1: bool,
use_mla: bool,
has_sink: bool,
use_sparse,
......@@ -76,7 +75,7 @@ class XPUPlatform(Platform):
elif selected_backend:
raise ValueError(
f"Invalid attention backend for {cls.device_name}, "
f"with use_v1: {use_v1} use_mla: {use_mla}"
f"with use_mla: {use_mla}"
)
logger.info("Using Flash Attention backend.")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment