Unverified Commit 7b43db21 authored by Andreas Karatzas, committed by GitHub
Browse files

[ROCm][CI][Bugfix] Multi-Modal Model Support Fixes and Attention Backend Improvements (#30270)


Signed-off-by: Andreas Karatzas <akaratza@amd.com>
parent 6a09612b
...@@ -964,7 +964,7 @@ steps: ...@@ -964,7 +964,7 @@ steps:
- pytest -v -s models/multimodal/processing - pytest -v -s models/multimodal/processing
- label: Multi-Modal Models Test (Standard) # 60min - label: Multi-Modal Models Test (Standard) # 60min
timeout_in_minutes: 80 timeout_in_minutes: 100
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
agent_pool: mi325_1 agent_pool: mi325_1
# grade: Blocking # grade: Blocking
...@@ -973,13 +973,15 @@ steps: ...@@ -973,13 +973,15 @@ steps:
- vllm/ - vllm/
- tests/models/multimodal - tests/models/multimodal
commands: commands:
- export MIOPEN_DEBUG_CONV_DIRECT=0
- export MIOPEN_DEBUG_CONV_GEMM=0
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pip freeze | grep -E 'torch' - pip freeze | grep -E 'torch'
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
- label: Multi-Modal Accuracy Eval (Small Models) # 150min - 180min - label: Multi-Modal Accuracy Eval (Small Models) # 5min
timeout_in_minutes: 180 timeout_in_minutes: 10
mirror_hardwares: [amdexperimental, amdproduction] mirror_hardwares: [amdexperimental, amdproduction]
agent_pool: mi325_1 agent_pool: mi325_1
# grade: Blocking # grade: Blocking
...@@ -989,7 +991,9 @@ steps: ...@@ -989,7 +991,9 @@ steps:
- vllm/inputs/ - vllm/inputs/
- vllm/v1/core/ - vllm/v1/core/
commands: commands:
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1 - export MIOPEN_DEBUG_CONV_DIRECT=0
- export MIOPEN_DEBUG_CONV_GEMM=0
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt
- label: Multi-Modal Models Test (Extended) 1 # 60min - label: Multi-Modal Models Test (Extended) 1 # 60min
timeout_in_minutes: 120 timeout_in_minutes: 120
...@@ -1001,10 +1005,13 @@ steps: ...@@ -1001,10 +1005,13 @@ steps:
- vllm/ - vllm/
- tests/models/multimodal - tests/models/multimodal
commands: commands:
- export MIOPEN_DEBUG_CONV_DIRECT=0
- export MIOPEN_DEBUG_CONV_GEMM=0
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
- label: Multi-Modal Models Test (Extended) 2 - label: Multi-Modal Models Test (Extended) 2 #60min
timeout_in_minutes: 120
mirror_hardwares: [amdexperimental] mirror_hardwares: [amdexperimental]
agent_pool: mi325_1 agent_pool: mi325_1
# grade: Blocking # grade: Blocking
...@@ -1013,6 +1020,8 @@ steps: ...@@ -1013,6 +1020,8 @@ steps:
- vllm/ - vllm/
- tests/models/multimodal - tests/models/multimodal
commands: commands:
- export MIOPEN_DEBUG_CONV_DIRECT=0
- export MIOPEN_DEBUG_CONV_GEMM=0
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model' - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
...@@ -1026,6 +1035,8 @@ steps: ...@@ -1026,6 +1035,8 @@ steps:
- vllm/ - vllm/
- tests/models/multimodal - tests/models/multimodal
commands: commands:
- export MIOPEN_DEBUG_CONV_DIRECT=0
- export MIOPEN_DEBUG_CONV_GEMM=0
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model' - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Pytest configuration for vLLM tests.""" """Pytest configuration for vLLM multimodal tests."""
import warnings import warnings
...@@ -9,16 +9,13 @@ import torch ...@@ -9,16 +9,13 @@ import torch
from vllm.platforms import current_platform from vllm.platforms import current_platform
def pytest_configure(config): def pytest_collection_modifyitems(config, items):
"""Disable Flash/MemEfficient SDP on ROCm to avoid HF """Configure ROCm-specific settings based on collected tests."""
Transformers accuracy issues.
"""
if not current_platform.is_rocm(): if not current_platform.is_rocm():
return return
skip_patterns = ["test_granite_speech.py"] skip_patterns = ["test_granite_speech.py"]
if any(pattern in str(arg) for arg in config.args for pattern in skip_patterns): if any(pattern in str(arg) for arg in config.args for pattern in skip_patterns):
# Skip disabling SDP for Granite Speech tests on ROCm
return return
# Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers # Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
......
...@@ -173,6 +173,13 @@ VLM_TEST_SETTINGS = { ...@@ -173,6 +173,13 @@ VLM_TEST_SETTINGS = {
auto_cls=AutoModelForImageTextToText, auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output, vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
patch_hf_runner=model_utils.qwen3_vl_patch_hf_runner, patch_hf_runner=model_utils.qwen3_vl_patch_hf_runner,
vllm_runner_kwargs={
"attention_config": {
"backend": "ROCM_AITER_FA",
},
}
if current_platform.is_rocm()
else None,
image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
marks=[ marks=[
pytest.mark.core_model, pytest.mark.core_model,
...@@ -253,8 +260,19 @@ VLM_TEST_SETTINGS = { ...@@ -253,8 +260,19 @@ VLM_TEST_SETTINGS = {
image_size_factors=[(0.25, 0.2, 0.15)], image_size_factors=[(0.25, 0.2, 0.15)],
vllm_runner_kwargs={ vllm_runner_kwargs={
"model_impl": "transformers", "model_impl": "transformers",
# TODO: [ROCm] Revert this once issue #30167 is resolved
**(
{
"mm_processor_kwargs": {
"min_pixels": 256 * 28 * 28,
"max_pixels": 1280 * 28 * 28,
}, },
marks=[large_gpu_mark(min_gb=32)], }
if current_platform.is_rocm()
else {}
),
},
marks=[large_gpu_mark(min_gb=80 if current_platform.is_rocm() else 32)],
), ),
#### Extended model tests #### Extended model tests
"aria": VLMTestInfo( "aria": VLMTestInfo(
...@@ -645,7 +663,17 @@ VLM_TEST_SETTINGS = { ...@@ -645,7 +663,17 @@ VLM_TEST_SETTINGS = {
hf_output_post_proc=model_utils.minimax_vl_01_hf_output, hf_output_post_proc=model_utils.minimax_vl_01_hf_output,
patch_hf_runner=model_utils.minimax_vl_01_patch_hf_runner, patch_hf_runner=model_utils.minimax_vl_01_patch_hf_runner,
auto_cls=AutoModelForImageTextToText, auto_cls=AutoModelForImageTextToText,
marks=[large_gpu_mark(min_gb=80)], marks=[
large_gpu_mark(min_gb=80),
# TODO: [ROCm] Fix pickle issue with ROCm spawn and tp>1
pytest.mark.skipif(
current_platform.is_rocm(),
reason=(
"ROCm: Model too large for single GPU; "
"multi-GPU blocked by HF _LazyConfigMapping pickle issue with spawn"
),
),
],
), ),
"molmo": VLMTestInfo( "molmo": VLMTestInfo(
models=["allenai/Molmo-7B-D-0924"], models=["allenai/Molmo-7B-D-0924"],
......
...@@ -39,7 +39,7 @@ models = [MODEL_NAME] ...@@ -39,7 +39,7 @@ models = [MODEL_NAME]
def granite_speech_attention_config(): def granite_speech_attention_config():
"""Return attention config for Granite Speech tests on ROCm.""" """Return attention config for Granite Speech tests on ROCm."""
if current_platform.is_rocm(): if current_platform.is_rocm():
return {"backend": "TRITON_ATTN"} return {"backend": "ROCM_AITER_FA"}
return None return None
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Pytest configuration for vLLM pooling tests."""
import pytest
from vllm.platforms import current_platform
@pytest.fixture
def siglip_attention_config():
    """Provide the attention configuration used by the SigLIP pooling tests.

    Returns:
        ``{"backend": "FLEX_ATTENTION"}`` when the current platform is ROCm,
        otherwise ``None``.
    """
    # Only ROCm gets an explicit backend override; every other platform
    # receives no attention config from this fixture.
    on_rocm = current_platform.is_rocm()
    return {"backend": "FLEX_ATTENTION"} if on_rocm else None
...@@ -22,6 +22,7 @@ from typing import TYPE_CHECKING ...@@ -22,6 +22,7 @@ from typing import TYPE_CHECKING
import torch import torch
from vllm.config.utils import getattr_iter from vllm.config.utils import getattr_iter
from vllm.logger import init_logger
from vllm.model_executor.models.interfaces import SupportsMRoPE, SupportsMultiModal from vllm.model_executor.models.interfaces import SupportsMRoPE, SupportsMultiModal
from vllm.model_executor.models.utils import WeightsMapper from vllm.model_executor.models.utils import WeightsMapper
from vllm.multimodal import MultiModalKwargsItems from vllm.multimodal import MultiModalKwargsItems
...@@ -36,6 +37,7 @@ from vllm.multimodal.inputs import ( ...@@ -36,6 +37,7 @@ from vllm.multimodal.inputs import (
from vllm.multimodal.parse import ImageProcessorItems, MultiModalDataItems from vllm.multimodal.parse import ImageProcessorItems, MultiModalDataItems
from vllm.multimodal.processing import BaseMultiModalProcessor, BaseProcessingInfo from vllm.multimodal.processing import BaseMultiModalProcessor, BaseProcessingInfo
from vllm.multimodal.profiling import BaseDummyInputsBuilder from vllm.multimodal.profiling import BaseDummyInputsBuilder
from vllm.platforms import current_platform
from vllm.sequence import IntermediateTensors from vllm.sequence import IntermediateTensors
if TYPE_CHECKING: if TYPE_CHECKING:
...@@ -52,6 +54,8 @@ DYNAMIC_ARG_DIMS = { ...@@ -52,6 +54,8 @@ DYNAMIC_ARG_DIMS = {
"inputs_embeds": 0, "inputs_embeds": 0,
} }
logger = init_logger(__name__)
class MultiModalProcessingInfo(BaseProcessingInfo): class MultiModalProcessingInfo(BaseProcessingInfo):
def get_supported_mm_limits(self): def get_supported_mm_limits(self):
...@@ -345,8 +349,29 @@ class MultiModalMixin(SupportsMultiModal, SupportsMRoPE): ...@@ -345,8 +349,29 @@ class MultiModalMixin(SupportsMultiModal, SupportsMRoPE):
num_image_patches = kwargs.pop("num_image_patches") num_image_patches = kwargs.pop("num_image_patches")
kwargs.pop("token_type_ids", None) # used only in `forward` kwargs.pop("token_type_ids", None) # used only in `forward`
if pixel_values is not None: if pixel_values is not None:
vision_embeddings = self.model.get_image_features(pixel_values, **kwargs) # ROCm: Force math SDP backend for vision encoder to avoid accuracy issues
# with flash_sdp and mem_efficient_sdp
if current_platform.is_rocm():
# TODO: [ROCm] Fix accuracy issues with flash backend
logger.debug(
"ROCm platform detected. Forcing math SDP backend "
"for vision encoder. Currently ROCm platform has "
"accuracy issues with `flash_sdp` and "
"`mem_efficient_sdp` backends. See issue: "
"https://github.com/vllm-project/vllm/issues/30167"
)
with torch.nn.attention.sdpa_kernel(
backends=[torch.nn.attention.SDPBackend.MATH]
):
vision_embeddings = self.model.get_image_features(
pixel_values, **kwargs
)
else:
vision_embeddings = self.model.get_image_features(
pixel_values, **kwargs
)
if isinstance(vision_embeddings, torch.Tensor): if isinstance(vision_embeddings, torch.Tensor):
if vision_embeddings.ndim == 2: if vision_embeddings.ndim == 2:
...@@ -364,6 +389,11 @@ class MultiModalMixin(SupportsMultiModal, SupportsMRoPE): ...@@ -364,6 +389,11 @@ class MultiModalMixin(SupportsMultiModal, SupportsMRoPE):
] ]
return vision_embeddings return vision_embeddings
else:
logger.debug(
"No pixel values or image embeddings provided for multimodal embedding."
)
return None
def get_mrope_input_positions( def get_mrope_input_positions(
self, self,
......
...@@ -8,6 +8,7 @@ from typing import TYPE_CHECKING, Optional ...@@ -8,6 +8,7 @@ from typing import TYPE_CHECKING, Optional
import torch import torch
import vllm.envs as envs import vllm.envs as envs
from vllm.attention.backends.abstract import AttentionType
from vllm.attention.backends.registry import AttentionBackendEnum from vllm.attention.backends.registry import AttentionBackendEnum
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.utils.torch_utils import cuda_device_count_stateless from vllm.utils.torch_utils import cuda_device_count_stateless
...@@ -204,7 +205,7 @@ class RocmPlatform(Platform): ...@@ -204,7 +205,7 @@ class RocmPlatform(Platform):
assert block_size == 1, ( assert block_size == 1, (
"Sparse MLA backend on ROCm only supports block size 1 for now." "Sparse MLA backend on ROCm only supports block size 1 for now."
) )
logger.info_once("Using Sparse MLA backend on V1 engine.") logger.info_once("Using Sparse MLA backend.")
return AttentionBackendEnum.ROCM_AITER_MLA_SPARSE.get_path() return AttentionBackendEnum.ROCM_AITER_MLA_SPARSE.get_path()
if attn_selector_config.use_mla: if attn_selector_config.use_mla:
...@@ -239,16 +240,16 @@ class RocmPlatform(Platform): ...@@ -239,16 +240,16 @@ class RocmPlatform(Platform):
return AttentionBackendEnum.FLEX_ATTENTION.get_path() return AttentionBackendEnum.FLEX_ATTENTION.get_path()
if selected_backend == AttentionBackendEnum.TRITON_ATTN: if selected_backend == AttentionBackendEnum.TRITON_ATTN:
logger.info("Using Triton Attention backend on V1 engine.") logger.info("Using Triton Attention backend.")
return AttentionBackendEnum.TRITON_ATTN.get_path() return AttentionBackendEnum.TRITON_ATTN.get_path()
if selected_backend == AttentionBackendEnum.ROCM_ATTN: if selected_backend == AttentionBackendEnum.ROCM_ATTN:
logger.info("Using Rocm Attention backend on V1 engine.") logger.info("Using Rocm Attention backend.")
return AttentionBackendEnum.ROCM_ATTN.get_path() return AttentionBackendEnum.ROCM_ATTN.get_path()
if selected_backend == AttentionBackendEnum.ROCM_AITER_FA: if selected_backend == AttentionBackendEnum.ROCM_AITER_FA:
if on_gfx9(): if on_gfx9():
logger.info("Using Aiter Flash Attention backend on V1 engine.") logger.info("Using Aiter Flash Attention backend.")
return AttentionBackendEnum.ROCM_AITER_FA.get_path() return AttentionBackendEnum.ROCM_AITER_FA.get_path()
else: else:
raise ValueError( raise ValueError(
...@@ -257,25 +258,25 @@ class RocmPlatform(Platform): ...@@ -257,25 +258,25 @@ class RocmPlatform(Platform):
) )
if selected_backend == AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN: if selected_backend == AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN:
logger.info("Using Aiter Unified Attention backend on V1 engine.") logger.info("Using Aiter Unified Attention backend.")
return AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN.get_path() return AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN.get_path()
# Handle automatic backend selection based on environment variables # Handle automatic backend selection based on environment variables
if selected_backend is None: if selected_backend is None:
# Priority 1: Check for AITER Unified Attention (must check before MHA) # Priority 1: Check for AITER Unified Attention (must check before MHA)
if envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION: if envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION:
logger.info("Using Aiter Unified Attention backend on V1 engine.") logger.info("Using Aiter Unified Attention backend.")
return AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN.get_path() return AttentionBackendEnum.ROCM_AITER_UNIFIED_ATTN.get_path()
# Priority 2: Check for AITER MHA (Flash Attention) # Priority 2: Check for AITER MHA (Flash Attention)
# Only use if explicitly enabled (not just VLLM_ROCM_USE_AITER=1) # Only use if explicitly enabled (not just VLLM_ROCM_USE_AITER=1)
if envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_MHA and on_gfx9(): if envs.VLLM_ROCM_USE_AITER and envs.VLLM_ROCM_USE_AITER_MHA and on_gfx9():
logger.info("Using Aiter Flash Attention backend on V1 engine.") logger.info("Using Aiter Flash Attention backend.")
return AttentionBackendEnum.ROCM_AITER_FA.get_path() return AttentionBackendEnum.ROCM_AITER_FA.get_path()
# Priority 3: Check for ROCM_ATTN (prefill-decode split) # Priority 3: Check for ROCM_ATTN (prefill-decode split)
if envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION: if envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION:
logger.info("Using Rocm Attention backend on V1 engine.") logger.info("Using Rocm Attention backend.")
return AttentionBackendEnum.ROCM_ATTN.get_path() return AttentionBackendEnum.ROCM_ATTN.get_path()
# Priority 4: Check for AITER enabled without specific flags # Priority 4: Check for AITER enabled without specific flags
...@@ -285,11 +286,19 @@ class RocmPlatform(Platform): ...@@ -285,11 +286,19 @@ class RocmPlatform(Platform):
and on_gfx9() and on_gfx9()
and envs.VLLM_ROCM_USE_AITER_MHA is not False and envs.VLLM_ROCM_USE_AITER_MHA is not False
): ):
logger.info("Using Aiter Flash Attention backend on V1 engine.") logger.info("Using Aiter Flash Attention backend.")
return AttentionBackendEnum.ROCM_AITER_FA.get_path() return AttentionBackendEnum.ROCM_AITER_FA.get_path()
# Priority 5: If model is Encoder-only self-attention type
if (
attn_selector_config.attn_type is not None
and attn_selector_config.attn_type == AttentionType.ENCODER_ONLY
):
logger.info("Using FlexAttention backend.")
return AttentionBackendEnum.FLEX_ATTENTION.get_path()
# Default: Triton Unified Attention # Default: Triton Unified Attention
logger.info("Using Triton Attention backend on V1 engine.") logger.info("Using Triton Attention backend.")
return AttentionBackendEnum.TRITON_ATTN.get_path() return AttentionBackendEnum.TRITON_ATTN.get_path()
raise RuntimeError( raise RuntimeError(
...@@ -324,14 +333,19 @@ class RocmPlatform(Platform): ...@@ -324,14 +333,19 @@ class RocmPlatform(Platform):
from vllm._aiter_ops import rocm_aiter_ops from vllm._aiter_ops import rocm_aiter_ops
if rocm_aiter_ops.is_mha_enabled(): if rocm_aiter_ops.is_enabled():
# Note: AITER FA is only supported for Qwen-VL models. logger.info_once("Using AITER Flash Attention backend for ViT model.")
# TODO: Add support for other VL models in their model class.
return AttentionBackendEnum.ROCM_AITER_FA return AttentionBackendEnum.ROCM_AITER_FA
if on_gfx9() and find_spec("flash_attn") is not None: if (
on_gfx9()
and find_spec("flash_attn") is not None
and (dtype == torch.float16 or dtype == torch.bfloat16)
):
logger.info_once("Using Flash Attention backend for ViT model.")
return AttentionBackendEnum.FLASH_ATTN return AttentionBackendEnum.FLASH_ATTN
logger.info_once("Using Torch SDPA backend for ViT model.")
return AttentionBackendEnum.TORCH_SDPA return AttentionBackendEnum.TORCH_SDPA
@classmethod @classmethod
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment