conftest.py 2.34 KB
Newer Older
1
2
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
"""Pytest configuration for vLLM multimodal tests."""
4

5
import os
6
7
import warnings

8
9
10
11
12
import torch

from vllm.platforms import current_platform


13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
def pytest_configure(config):
    """Apply ROCm-only environment setup ahead of test collection.

    Does nothing on non-ROCm platforms. On ROCm it forces
    ``VLLM_ROCM_USE_SKINNY_GEMM=0`` in the process environment and emits a
    ``UserWarning`` so the override is visible in the test output.

    Args:
        config: The pytest config object passed to the hook (unused here).
    """
    if current_platform.is_rocm():
        # The wvSplitKrc skinny GEMM kernel uses atomic reductions, which
        # yields non-deterministic results on ROCm; turn it off for tests.
        # See: https://github.com/vllm-project/vllm/pull/33493#issuecomment-3906083975
        os.environ["VLLM_ROCM_USE_SKINNY_GEMM"] = "0"

        notice = (
            "ROCm: Set VLLM_ROCM_USE_SKINNY_GEMM=0 to avoid non-deterministic "
            "results from skinny GEMM atomic reductions"
        )
        warnings.warn(notice, category=UserWarning, stacklevel=1)


30
31
def pytest_collection_modifyitems(config, items):
    """Configure ROCm-specific SDP backend settings after collection.

    Does nothing on non-ROCm platforms. On ROCm, the flash and
    memory-efficient scaled-dot-product (SDP) backends are disabled and the
    math backend is enabled, unless one of the command-line arguments
    mentions a file listed in ``skip_patterns``.

    Args:
        config: The pytest config object; ``config.args`` is scanned for
            the skip patterns.
        items: The collected test items (not inspected by this hook).
    """
    if not current_platform.is_rocm():
        return

    # Leave the SDP backends untouched when any invocation argument contains
    # one of these file names (substring match on the stringified argument).
    skip_patterns = ["test_granite_speech.py"]
    if any(pattern in str(arg) for arg in config.args for pattern in skip_patterns):
        return

    # Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers
    # accuracy issues: https://github.com/vllm-project/vllm/issues/30167
    # TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace
    torch.backends.cuda.enable_flash_sdp(False)
    torch.backends.cuda.enable_mem_efficient_sdp(False)
    torch.backends.cuda.enable_math_sdp(True)

    # Surface the override in the test output so it is not silent.
    warnings.warn(
        "ROCm: Disabled flash_sdp and mem_efficient_sdp, enabled math_sdp "
        "to avoid HuggingFace Transformers accuracy issues",
        UserWarning,
        stacklevel=1,
    )
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69


def patch_hf_vision_attn_for_rocm(model):
    """Switch an HF vision encoder's attention implementation to SDPA on ROCm.

    HF's flash_attention_2 has accuracy issues on ROCm that bypass the
    torch.backends.cuda settings. Forcing SDPA lets attention go through
    math_sdp as configured in pytest_collection_modifyitems.

    Does nothing on non-ROCm platforms, or when the (possibly wrapped) model
    has no ``vision_embedding`` attribute.

    Args:
        model: The HF model to patch in place. If it has a ``model``
            attribute, that inner object is patched instead.
    """
    if current_platform.is_rocm():
        # Unwrap one level when the model exposes an inner ``model``.
        target = getattr(model, "model", model)
        if not hasattr(target, "vision_embedding"):
            return

        encoder_layers = target.vision_embedding[0].encoder.layers
        for block in encoder_layers:
            if not hasattr(block, "self_attn"):
                continue
            block.self_attn.vision_config._attn_implementation = "sdpa"