Unverified commit 3468f17e, authored by Matthew Bonanni, committed by GitHub
Browse files

[V0 deprecation] Remove _VLLM_V1 suffixes from attention backend names (#25489)


Signed-off-by: Matthew Bonanni <mbonanni@redhat.com>
Signed-off-by: Matthew Bonanni <mbonanni001@gmail.com>
parent 71b25b0d
......@@ -35,7 +35,7 @@ docker run \
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
VLLM_ATTENTION_BACKEND=TRITON_ATTN_VLLM_V1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
cd tests
pytest -v -s v1/core
pytest -v -s v1/engine
......
......@@ -103,7 +103,7 @@ backend_configs = {
# Triton Attention
"TritonAttn":
BackendConfig(name="TritonAttn",
env_vars={"VLLM_ATTENTION_BACKEND": "TRITON_ATTN_VLLM_V1"},
env_vars={"VLLM_ATTENTION_BACKEND": "TRITON_ATTN"},
comp_config={
"cudagraph_mode": "FULL",
}),
......
......@@ -338,7 +338,7 @@ else:
@pytest.mark.parametrize("model_name, model_class", MODELS)
@pytest.mark.parametrize("backend",
[_Backend.FLASHINFER] if current_platform.is_cuda()
else [_Backend.TRITON_ATTN_VLLM_V1])
else [_Backend.TRITON_ATTN])
@pytest.mark.parametrize(
"split_attention",
[False, True] if current_platform.is_rocm() else [False])
......
......@@ -68,7 +68,7 @@ def default_server_args(with_tool_parser: bool):
def gptoss_server(monkeypatch_module: pytest.MonkeyPatch,
default_server_args: list[str]):
with monkeypatch_module.context() as m:
m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN_VLLM_V1")
m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN")
with RemoteOpenAIServer(GPT_OSS_MODEL_NAME,
default_server_args) as remote_server:
yield remote_server
......
......@@ -31,7 +31,7 @@ DEVICE_MLA_BACKENDS = {
}
DEVICE_REGULAR_ATTN_BACKENDS = {
"cuda": ["XFORMERS", "FLASHINFER"],
"cuda": ["XFORMERS", "FLASHINFER", "FLASH_ATTN"],
"hip": ["ROCM_FLASH"],
"cpu": ["TORCH_SDPA"],
}
......@@ -86,7 +86,7 @@ def test_env(
with patch("vllm.attention.selector.current_platform",
CpuPlatform()):
backend = get_attn_backend(16, torch.float16, None, block_size)
assert backend.get_name() == "TORCH_SDPA_VLLM_V1"
assert backend.get_name() == "TORCH_SDPA"
elif device == "hip":
with patch("vllm.attention.selector.current_platform",
......@@ -125,7 +125,7 @@ def test_env(
None,
block_size,
use_mla=use_mla)
expected = f"{name}_VLLM_V1"
expected = name
assert backend.get_name() == expected
else:
backend = get_attn_backend(16,
......@@ -133,7 +133,7 @@ def test_env(
None,
block_size,
use_mla=use_mla)
expected = "TRITON_ATTN_VLLM_V1"
expected = "TRITON_ATTN"
assert backend.get_name() == expected
elif device == "cuda":
......@@ -160,7 +160,7 @@ def test_env(
None,
block_size,
use_mla=use_mla)
expected = "CUTLASS_MLA_VLLM_V1"
expected = "CUTLASS_MLA"
assert backend.get_name() == expected
elif name == "FLASHINFER_MLA":
if block_size not in [32, 64]:
......@@ -193,7 +193,7 @@ def test_env(
None,
block_size,
use_mla=use_mla)
expected = f"{name}_VLLM_V1"
expected = name
assert backend.get_name() == expected
elif name == "FLASH_ATTN_MLA":
backend = get_attn_backend(16,
......@@ -210,7 +210,7 @@ def test_env(
None,
block_size,
use_mla=use_mla)
expected = "TRITON_MLA_VLLM_V1"
expected = "TRITON_MLA"
assert backend.get_name() == expected
elif name == "FLASHINFER":
backend = get_attn_backend(16,
......@@ -218,25 +218,24 @@ def test_env(
None,
block_size,
use_mla=use_mla)
expected = "FLASHINFER_VLLM_V1"
expected = "FLASHINFER"
assert backend.get_name() == expected
else:
elif name == "XFORMERS":
backend = get_attn_backend(32,
torch.float16,
None,
block_size,
use_mla=use_mla)
expected = "FLASH_ATTN_VLLM_V1"
expected = "XFORMERS"
assert backend.get_name() == expected
backend = get_attn_backend(16,
elif name == "FLASH_ATTN":
backend = get_attn_backend(32,
torch.float16,
None,
block_size,
use_mla=use_mla)
assert backend.get_name() == "FLEX_ATTENTION", (
"Should fallback to FlexAttention if head size is "
"not supported by FlashAttention")
expected = "FLASH_ATTN"
assert backend.get_name() == expected
@pytest.mark.parametrize("device", ["cpu", "cuda"])
......@@ -252,7 +251,7 @@ def test_fp32_fallback(
with patch("vllm.attention.selector.current_platform",
CpuPlatform()):
backend = get_attn_backend(16, torch.float32, None, 16)
assert backend.get_name() == "TORCH_SDPA_VLLM_V1"
assert backend.get_name() == "TORCH_SDPA"
elif device == "cuda":
with patch("vllm.attention.selector.current_platform",
......@@ -266,6 +265,9 @@ def test_flash_attn(monkeypatch: pytest.MonkeyPatch):
# TODO: When testing for v1, pipe in `use_v1` as an argument to
# get_attn_backend
pytest.skip("Skipping as current backend selector does not " \
"handle fallbacks when a backend is set via env var.")
with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, STR_FLASH_ATTN_VAL)
......
......@@ -28,7 +28,7 @@ def test_selector(monkeypatch: pytest.MonkeyPatch):
# Test standard ROCm attention
backend = get_attn_backend(16, torch.float16, torch.float16, 16, False)
assert (backend.get_name() == "ROCM_FLASH"
or backend.get_name() == "TRITON_ATTN_VLLM_V1")
or backend.get_name() == "TRITON_ATTN")
# MLA test for deepseek related
......@@ -40,8 +40,7 @@ def test_selector(monkeypatch: pytest.MonkeyPatch):
16,
False,
use_mla=True)
assert (backend.get_name() == "TRITON_MLA"
or backend.get_name() == "TRITON_MLA_VLLM_V1")
assert backend.get_name() == "TRITON_MLA"
# If attention backend is None
# If use_mla is true
......@@ -53,8 +52,7 @@ def test_selector(monkeypatch: pytest.MonkeyPatch):
16,
False,
use_mla=True)
assert (backend.get_name() == "TRITON_MLA"
or backend.get_name() == "TRITON_MLA_VLLM_V1")
assert backend.get_name() == "TRITON_MLA"
# change the attention backend to AITER MLA
m.setenv(STR_BACKEND_ENV_VAR, "ROCM_AITER_MLA")
......@@ -64,8 +62,7 @@ def test_selector(monkeypatch: pytest.MonkeyPatch):
1,
False,
use_mla=True)
assert (backend.get_name() == "ROCM_AITER_MLA"
or backend.get_name() == "ROCM_AITER_MLA_VLLM_V1")
assert backend.get_name() == "ROCM_AITER_MLA"
# If attention backend is None
# If use_mla is true
......@@ -79,5 +76,4 @@ def test_selector(monkeypatch: pytest.MonkeyPatch):
1,
False,
use_mla=True)
assert (backend.get_name() == "ROCM_AITER_MLA"
or backend.get_name() == "ROCM_AITER_MLA_VLLM_V1")
assert backend.get_name() == "ROCM_AITER_MLA"
......@@ -524,14 +524,14 @@ def make_backend(backend_name: str) -> AttentionBackend:
* Backend instance
'''
if backend_name in (STR_XFORMERS_ATTN_VAL, "XFORMERS_VLLM_V1"):
if backend_name == STR_XFORMERS_ATTN_VAL:
from vllm.v1.attention.backends.xformers import (
XFormersAttentionBackend)
return XFormersAttentionBackend()
if backend_name in (STR_FLASH_ATTN_VAL, "FLASH_ATTN_VLLM_V1"):
if backend_name == STR_FLASH_ATTN_VAL:
from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend
return FlashAttentionBackend()
if backend_name == "TRITON_ATTN_VLLM_V1":
if backend_name == "TRITON_ATTN":
from vllm.v1.attention.backends.triton_attn import (
TritonAttentionBackend)
return TritonAttentionBackend()
......@@ -539,7 +539,7 @@ def make_backend(backend_name: str) -> AttentionBackend:
from vllm.v1.attention.backends.flex_attention import (
FlexAttentionBackend)
return FlexAttentionBackend()
if backend_name in ("TORCH_SDPA", "TORCH_SDPA_VLLM_V1"):
if backend_name == "TORCH_SDPA":
from vllm.v1.attention.backends.cpu_attn import TorchSDPABackend
return TorchSDPABackend()
if backend_name == "FLASHINFER":
......
......@@ -84,7 +84,7 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
# FIXME: A hack to bypass FA3 assertion because our CI's L4 GPU
# has cc==8.9 which hasn't supported FA3 yet. Remove this hack when
# L4 supports FA3.
m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN_VLLM_V1")
m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN")
if model_arch == "WhisperForConditionalGeneration":
m.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
LLM(
......
......@@ -1131,14 +1131,14 @@ def has_module_attribute(module_name, attribute_name):
def get_attn_backend_list_based_on_platform() -> list[str]:
if current_platform.is_cuda():
return ["FLASH_ATTN_VLLM_V1", "TRITON_ATTN_VLLM_V1", "TREE_ATTN"]
return ["FLASH_ATTN", "TRITON_ATTN", "TREE_ATTN"]
elif current_platform.is_rocm():
attn_backend_list = ["TRITON_ATTN_VLLM_V1"]
attn_backend_list = ["TRITON_ATTN"]
try:
import aiter # noqa: F401
attn_backend_list.append("FLASH_ATTN_VLLM_V1")
attn_backend_list.append("FLASH_ATTN")
except Exception:
print("Skip FLASH_ATTN_VLLM_V1 on ROCm as aiter is not installed")
print("Skip FLASH_ATTN on ROCm as aiter is not installed")
return attn_backend_list
else:
......
......@@ -21,16 +21,15 @@ from vllm.v1.attention.backends.utils import (CommonAttentionMetadata,
from vllm.v1.kv_cache_interface import FullAttentionSpec
BACKENDS_TO_TEST = [
_Backend.FLASH_ATTN_VLLM_V1, _Backend.FLASHINFER_VLLM_V1,
_Backend.FLEX_ATTENTION, _Backend.TRITON_ATTN_VLLM_V1, _Backend.TREE_ATTN,
"FLEX_ATTENTION_SLOW"
_Backend.FLASH_ATTN, _Backend.FLASHINFER, _Backend.FLEX_ATTENTION,
_Backend.TRITON_ATTN, _Backend.TREE_ATTN, "FLEX_ATTENTION_SLOW"
]
# Remove flashinfer from the list if it's not available
try:
import flashinfer # noqa: F401
except ImportError:
BACKENDS_TO_TEST.remove(_Backend.FLASHINFER_VLLM_V1)
BACKENDS_TO_TEST.remove(_Backend.FLASHINFER)
def _convert_dtype_to_torch(dtype):
......@@ -214,7 +213,7 @@ def run_attention_backend(
builder_cls, impl_cls = get_attention_backend(actual_backend)
# Mock flashinfer's get_per_layer_parameters if needed
if actual_backend == _Backend.FLASHINFER_VLLM_V1:
if actual_backend == _Backend.FLASHINFER:
import unittest.mock
from vllm.v1.attention.backends.utils import PerLayerParameters
......@@ -434,7 +433,7 @@ def _test_backend_correctness(
# [num_blocks, 2, block_size, num_kv_heads, head_size]
# Select the appropriate KV cache format for each backend
kv_cache_for_backend = kv_cache
if backend_name == _Backend.FLASHINFER_VLLM_V1:
if backend_name == _Backend.FLASHINFER:
kv_cache_for_backend = kv_cache.transpose(0, 1)
# For FlashInfer default to HND layout and
......@@ -518,8 +517,8 @@ def test_causal_backend_correctness(batch_spec_name: str, model: str):
SLIDING_WINDOW_BACKENDS_TO_TEST = [
_Backend.FLASH_ATTN_VLLM_V1, _Backend.FLEX_ATTENTION,
_Backend.TRITON_ATTN_VLLM_V1, "FLEX_ATTENTION_SLOW"
_Backend.FLASH_ATTN, _Backend.FLEX_ATTENTION, _Backend.TRITON_ATTN,
"FLEX_ATTENTION_SLOW"
]
......
......@@ -15,8 +15,8 @@ from vllm.v1.attention.backends.utils import CommonAttentionMetadata
from vllm.v1.kv_cache_interface import FullAttentionSpec
BACKENDS_TO_TEST = [
_Backend.CUTLASS_MLA, _Backend.FLASHMLA_VLLM_V1, _Backend.FLASH_ATTN_MLA,
_Backend.TRITON_MLA_VLLM_V1
_Backend.CUTLASS_MLA, _Backend.FLASHMLA, _Backend.FLASH_ATTN_MLA,
_Backend.TRITON_MLA
]
# Remove CUTLASS_MLA from the list if not using sm100
......
......@@ -120,30 +120,30 @@ def get_attention_backend(backend_name: _Backend):
Tuple of (backend_builder_class, backend_impl_class)
"""
backend_map = {
_Backend.FLASH_ATTN_VLLM_V1:
_Backend.FLASH_ATTN:
("vllm.v1.attention.backends.flash_attn.FlashAttentionBackend"
if current_platform.is_cuda() else
"vllm.v1.attention.backends.rocm_aiter_fa.AiterFlashAttentionBackend"
),
_Backend.FLASHINFER_VLLM_V1:
_Backend.FLASHINFER:
"vllm.v1.attention.backends.flashinfer.FlashInferBackend",
_Backend.FLEX_ATTENTION:
"vllm.v1.attention.backends.flex_attention.FlexAttentionBackend",
_Backend.TRITON_ATTN_VLLM_V1:
_Backend.TRITON_ATTN:
"vllm.v1.attention.backends.triton_attn.TritonAttentionBackend",
_Backend.TREE_ATTN:
"vllm.v1.attention.backends.tree_attn.TreeAttentionBackend",
_Backend.XFORMERS_VLLM_V1:
_Backend.XFORMERS:
"vllm.v1.attention.backends.xformers.XFormersAttentionBackend",
_Backend.CUTLASS_MLA:
"vllm.v1.attention.backends.mla.cutlass_mla.CutlassMLABackend",
_Backend.FLASHMLA_VLLM_V1:
_Backend.FLASHMLA:
"vllm.v1.attention.backends.mla.flashmla.FlashMLABackend",
_Backend.FLASH_ATTN_MLA:
"vllm.v1.attention.backends.mla.flashattn_mla.FlashAttnMLABackend",
_Backend.FLASHINFER_MLA:
"vllm.v1.attention.backends.mla.flashinfer_mla.FlashInferMLABackend",
_Backend.TRITON_MLA_VLLM_V1:
_Backend.TRITON_MLA:
"vllm.v1.attention.backends.mla.triton_mla.TritonMLABackend",
}
......
......@@ -89,7 +89,7 @@ backend_configs = {
# Triton Attention
"TritonAttn":
BackendConfig(name="TritonAttn",
env_vars={"VLLM_ATTENTION_BACKEND": "TRITON_ATTN_VLLM_V1"},
env_vars={"VLLM_ATTENTION_BACKEND": "TRITON_ATTN"},
comp_config={
"cudagraph_mode": "FULL_AND_PIECEWISE",
}),
......
......@@ -9,11 +9,14 @@ from ...utils import create_new_process_for_each_test
@create_new_process_for_each_test()
@pytest.mark.parametrize("attn_backend",
["FLASH_ATTN_VLLM_V1", "FLASHINFER_VLLM_V1"])
@pytest.mark.parametrize("attn_backend", ["FLASH_ATTN", "FLASHINFER"])
def test_cascade_attention(example_system_message, monkeypatch, attn_backend):
prompt = "\n<User>: Implement fibonacci sequence in Python.\n<Claude>:"
if attn_backend == "FLASHINFER":
pytest.skip("This test is failing with FlashInfer backend and "
"needs investigation. See issue #25679.")
with monkeypatch.context() as m:
m.setenv("VLLM_USE_V1", "1")
m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
......
......@@ -176,12 +176,11 @@ def test_eagle_correctness(
m.setenv("VLLM_MLA_DISABLE", "1")
m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
if (attn_backend == "TRITON_ATTN_VLLM_V1"
and not current_platform.is_rocm()):
pytest.skip("TRITON_ATTN_VLLM_V1 does not support "
if (attn_backend == "TRITON_ATTN" and not current_platform.is_rocm()):
pytest.skip("TRITON_ATTN does not support "
"multi-token eagle spec decode on current platform")
if attn_backend == "FLASH_ATTN_VLLM_V1" and current_platform.is_rocm():
if attn_backend == "FLASH_ATTN" and current_platform.is_rocm():
m.setenv("VLLM_ROCM_USE_AITER", "1")
method, model_name, spec_model_name, tp_size = model_setup
......
......@@ -314,12 +314,11 @@ def test_load_model(mock_get_model, mock_get_layers, mock_get_pp_group, method,
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
if (attn_backend == "TRITON_ATTN_VLLM_V1"
and not current_platform.is_rocm()):
pytest.skip("TRITON_ATTN_VLLM_V1 does not support "
if (attn_backend == "TRITON_ATTN" and not current_platform.is_rocm()):
pytest.skip("TRITON_ATTN does not support "
"multi-token eagle spec decode on current platform")
if attn_backend == "FLASH_ATTN_VLLM_V1" and current_platform.is_rocm():
if attn_backend == "FLASH_ATTN" and current_platform.is_rocm():
monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
# Setup draft model mock
......@@ -400,16 +399,15 @@ def test_propose(method, attn_backend, num_speculative_tokens, monkeypatch):
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
if (attn_backend == "TRITON_ATTN_VLLM_V1"
and not current_platform.is_rocm()):
pytest.skip("TRITON_ATTN_VLLM_V1 does not support "
if (attn_backend == "TRITON_ATTN" and not current_platform.is_rocm()):
pytest.skip("TRITON_ATTN does not support "
"multi-token eagle spec decode on current platform")
if (attn_backend == "TREE_ATTN"):
pytest.skip("TREE_ATTN is tested separately in test_propose_tree"
"because it requires special input mocking.")
if attn_backend == "FLASH_ATTN_VLLM_V1" and current_platform.is_rocm():
if attn_backend == "FLASH_ATTN" and current_platform.is_rocm():
monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
# Use GPU device
......@@ -510,12 +508,12 @@ def test_propose(method, attn_backend, num_speculative_tokens, monkeypatch):
device=device)
sampling_metadata = mock.MagicMock()
if attn_backend == "FLASH_ATTN_VLLM_V1":
if attn_backend == "FLASH_ATTN":
attn_metadata_builder_cls, _ = get_attention_backend(
_Backend.FLASH_ATTN_VLLM_V1)
elif attn_backend == "TRITON_ATTN_VLLM_V1":
_Backend.FLASH_ATTN)
elif attn_backend == "TRITON_ATTN":
attn_metadata_builder_cls, _ = get_attention_backend(
_Backend.TRITON_ATTN_VLLM_V1)
_Backend.TRITON_ATTN)
elif attn_backend == "TREE_ATTN":
attn_metadata_builder_cls, _ = get_attention_backend(
_Backend.TREE_ATTN)
......
......@@ -41,12 +41,11 @@ def test_eagle_max_len(monkeypatch: pytest.MonkeyPatch,
m.setenv("VLLM_USE_V1", "1")
m.setenv("VLLM_ATTENTION_BACKEND", attn_backend)
if (attn_backend == "TRITON_ATTN_VLLM_V1"
and not current_platform.is_rocm()):
pytest.skip("TRITON_ATTN_VLLM_V1 does not support "
if (attn_backend == "TRITON_ATTN" and not current_platform.is_rocm()):
pytest.skip("TRITON_ATTN does not support "
"multi-token eagle spec decode on current platform")
if attn_backend == "FLASH_ATTN_VLLM_V1" and current_platform.is_rocm():
if attn_backend == "FLASH_ATTN" and current_platform.is_rocm():
m.setenv("VLLM_ROCM_USE_AITER", "1")
llm = LLM(
......
......@@ -278,7 +278,7 @@ def test_tree_attn_correctness() -> None:
block_table=block_table,
slot_mapping=branch_slot_mapping,
seqlen_k=sequence_position + q_len,
backend=_Backend.FLASH_ATTN_VLLM_V1,
backend=_Backend.FLASH_ATTN,
).view(batch_size, -1, num_heads, dim_per_head)
# Compare the outputs.
......
......@@ -54,26 +54,3 @@ def test_v1_llm_by_default(monkeypatch):
print(llm.generate("Hello my name is"))
assert hasattr(llm.llm_engine, "engine_core")
m.delenv("VLLM_USE_V1")
def test_v1_attn_backend(monkeypatch):
with monkeypatch.context() as m:
if os.getenv("VLLM_USE_V1", None):
m.delenv("VLLM_USE_V1")
m.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS")
# Fall back to V0.
_ = AsyncEngineArgs(model=MODEL).create_engine_config()
assert not envs.VLLM_USE_V1
m.delenv("VLLM_USE_V1")
# Reject if V1.
m.setenv("VLLM_USE_V1", "1")
with pytest.raises(NotImplementedError):
AsyncEngineArgs(model=MODEL).create_engine_config()
m.delenv("VLLM_USE_V1")
m.setenv("VLLM_ATTENTION_BACKEND", "FLASHMLA")
_ = AsyncEngineArgs(model=MODEL).create_engine_config()
assert envs.VLLM_USE_V1
m.delenv("VLLM_USE_V1")
......@@ -364,7 +364,7 @@ class Attention(nn.Module, AttentionLayerBase):
self.impl.process_weights_after_loading(act_dtype)
# FlashInfer requires attention sinks to be float32
if (self.backend == _Backend.FLASHINFER_VLLM_V1
if (self.backend == _Backend.FLASHINFER
and hasattr(self.impl, 'sinks')):
from vllm.v1.attention.backends.flashinfer import FlashInferImpl
assert isinstance(self.impl, FlashInferImpl)
......@@ -420,21 +420,17 @@ class MultiHeadAttention(nn.Module):
self.attn_backend = backend if backend in {
_Backend.TORCH_SDPA,
_Backend.TORCH_SDPA_VLLM_V1,
_Backend.XFORMERS,
_Backend.PALLAS_VLLM_V1,
_Backend.PALLAS,
_Backend.ROCM_AITER_FA,
_Backend.FLASH_ATTN,
_Backend.FLASH_ATTN_VLLM_V1,
} else _Backend.TORCH_SDPA
if (self.attn_backend == _Backend.XFORMERS
and not check_xformers_availability()):
self.attn_backend = _Backend.TORCH_SDPA
if self.attn_backend in {
_Backend.FLASH_ATTN, _Backend.FLASH_ATTN_VLLM_V1
}:
if self.attn_backend == _Backend.FLASH_ATTN:
if use_upstream_fa:
from flash_attn import flash_attn_varlen_func
self._flash_attn_varlen_func = flash_attn_varlen_func
......@@ -468,11 +464,7 @@ class MultiHeadAttention(nn.Module):
key = torch.repeat_interleave(key, num_repeat, dim=2)
value = torch.repeat_interleave(value, num_repeat, dim=2)
if self.attn_backend in {
_Backend.FLASH_ATTN,
_Backend.FLASH_ATTN_VLLM_V1,
}:
if self.attn_backend == _Backend.FLASH_ATTN:
cu_seqlens_q = torch.arange(0, (bsz + 1) * q_len,
step=q_len,
dtype=torch.int32,
......@@ -499,8 +491,7 @@ class MultiHeadAttention(nn.Module):
key,
value,
scale=self.scale)
elif (self.attn_backend == _Backend.TORCH_SDPA
or self.attn_backend == _Backend.TORCH_SDPA_VLLM_V1):
elif self.attn_backend == _Backend.TORCH_SDPA:
query, key, value = (x.transpose(1, 2)
for x in (query, key, value))
out = F.scaled_dot_product_attention(query,
......@@ -508,7 +499,7 @@ class MultiHeadAttention(nn.Module):
value,
scale=self.scale)
out = out.transpose(1, 2)
elif self.attn_backend == _Backend.PALLAS_VLLM_V1:
elif self.attn_backend == _Backend.PALLAS:
query, key, value = (x.transpose(1, 2)
for x in (query, key, value))
from torch_xla.experimental.custom_kernel import flash_attention
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment