[test] fix basic_correctness and benchmarks

dbd62f84 · zhuwenwen · 0e8619b8 · dbd62f84 · dbd62f84 · dbd62f84
Commit dbd62f84 authored May 29, 2025 by zhuwenwen
8 changed files
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -16,6 +16,8 @@ from ..models.utils import check_outputs_equal
 from ..utils import multi_gpu_test
 import os
 from ..utils import models_path_prefix
+from vllm.utils import gpuname
+import vllm.envs as envs

 MODELS = [
    os.path.join(models_path_prefix, "google/gemma-2-2b-it"),
@@ -35,7 +37,11 @@ def v1(run_with_both_engines):

 def test_vllm_gc_ed():
    """Verify vllm instance is GC'ed when it is deleted"""
+    if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND:
+        llm = LLM(os.path.join(models_path_prefix, "distilbert/distilgpt2"), block_size=64)
+    else:
        llm = LLM(os.path.join(models_path_prefix, "distilbert/distilgpt2"))
+        
    weak_llm = weakref.ref(llm)
    del llm
    # If there's any circular reference to vllm, this fails
@@ -79,6 +85,16 @@ def test_models(
        with hf_runner(model, dtype=dtype) as hf_model:
            hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

+        if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND:
+            with VllmRunner(model,
+                        max_model_len=8192,
+                        dtype=dtype,
+                        enforce_eager=enforce_eager,
+                        gpu_memory_utilization=0.7,
+                        block_size=64) as vllm_model:
+                vllm_outputs = vllm_model.generate_greedy(example_prompts,
+                                                        max_tokens)
+        else:
            with VllmRunner(model,
                            max_model_len=8192,
                            dtype=dtype,

--- a/tests/basic_correctness/test_chunked_prefill.py
+++ b/tests/basic_correctness/test_chunked_prefill.py
@@ -21,6 +21,8 @@ from ..models.utils import check_logprobs_close, check_outputs_equal
 from ..utils import multi_gpu_test
 import os
 from ..utils import models_path_prefix
+from vllm.utils import gpuname
+import vllm.envs as envs

 if TYPE_CHECKING:
    from .conftest import HfRunner, VllmRunner
@@ -50,7 +52,7 @@ def use_v0_only(monkeypatch: pytest.MonkeyPatch):
 # NOTE: Increasing this in this suite will fail CI because we currently cannot
 # reset distributed env properly. Use a value > 1 just when you test.
 @pytest.mark.parametrize("tensor_parallel_size", [1])
-@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
+@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"] if not current_platform.is_rocm() else ["FLASH_ATTN"]) 
 def test_models(
    hf_runner: HfRunner,
    vllm_runner: VllmRunner,
@@ -85,6 +87,7 @@ def test_models(
                tensor_parallel_size=tensor_parallel_size,
                enforce_eager=enforce_eager,
                max_num_seqs=max_num_seqs,
+                block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
        ) as vllm_model:
            vllm_outputs = vllm_model.generate_greedy(example_prompts,
                                                      max_tokens)
@@ -100,7 +103,7 @@ def test_models(
 @multi_gpu_test(num_gpus=2)
 @pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
+@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"] if not current_platform.is_rocm() else ["FLASH_ATTN"])
 def test_models_distributed(
    hf_runner: HfRunner,
    vllm_runner: VllmRunner,
@@ -142,6 +145,7 @@ def test_models_distributed(
                enable_chunked_prefill=enable_chunked_prefill,
                max_num_batched_tokens=max_num_batched_tokens,
                distributed_executor_backend=distributed_executor_backend,
+                block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
        ) as vllm_model:
            vllm_outputs = vllm_model.generate_greedy(
                example_prompts,
@@ -267,6 +271,7 @@ def test_with_prefix_caching(
                tensor_parallel_size=tensor_parallel_size,
                enforce_eager=enforce_eager,
                max_num_seqs=max_num_seqs,
+                block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
        ) as vllm_model:
            outputs[enable] = []
            for prompt in full_prompts:

--- a/tests/basic_correctness/test_cumem.py
+++ b/tests/basic_correctness/test_cumem.py
 # SPDX-License-Identifier: Apache-2.0

+import os
 import pytest
 import torch

@@ -7,8 +8,7 @@ from vllm import LLM, SamplingParams
 from vllm.device_allocator.cumem import CuMemAllocator
 from vllm.utils import GiB_bytes

-from ..utils import create_new_process_for_each_test
-
+from ..utils import create_new_process_for_each_test, models_path_prefix

 @create_new_process_for_each_test()
 def test_python_error():
@@ -119,9 +119,9 @@ def test_cumem_with_cudagraph():
    "model, use_v1",
    [
        # sleep mode with safetensors
-        ("meta-llama/Llama-3.2-1B", True),
+        (os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B"), True),
        # sleep mode with pytorch checkpoint
-        ("facebook/opt-125m", False),
+        (os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B"), False),
    ])
 def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
    with monkeypatch.context() as m:

--- a/tests/benchmarks/test_latency_cli.py
+++ b/tests/benchmarks/test_latency_cli.py
@@ -2,8 +2,10 @@
 import subprocess

 import pytest
+import os
+from ..utils import models_path_prefix

-MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
+MODEL_NAME = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct")


 @pytest.mark.benchmark

--- a/tests/benchmarks/test_serve_cli.py
+++ b/tests/benchmarks/test_serve_cli.py
@@ -2,10 +2,11 @@
 import subprocess

 import pytest
+import os

-from ..utils import RemoteOpenAIServer
+from ..utils import RemoteOpenAIServer, models_path_prefix

-MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
+MODEL_NAME = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct")


 @pytest.fixture(scope="module")

--- a/tests/benchmarks/test_throughput_cli.py
+++ b/tests/benchmarks/test_throughput_cli.py
 # SPDX-License-Identifier: Apache-2.0
 import subprocess

+import os
 import pytest

-MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
+from ..utils import  models_path_prefix
+
+MODEL_NAME = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct")


 @pytest.mark.benchmark

--- a/tests/test_regression.py
+++ b/tests/test_regression.py
@@ -13,8 +13,9 @@ import torch

 from vllm import LLM, SamplingParams

-from utils import models_path_prefix
+from .utils import models_path_prefix
 from vllm.utils import SUPPORT_TC, gpuname
+import vllm.envs as envs


 @pytest.mark.skip(reason="In V1, we reject tokens > max_seq_len")
@@ -37,15 +38,15 @@ def test_max_tokens_none():
    sampling_params = SamplingParams(temperature=0.01,
                                     top_p=0.1,
                                     max_tokens=None)
-    if not gpuname.startswith('BW'):
+    if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND:
        llm = LLM(model=os.path.join(models_path_prefix, "distilbert/distilgpt2"),
                max_num_batched_tokens=4096,
-                tensor_parallel_size=1)
+                tensor_parallel_size=1,
+                block_size=64)
    else:
        llm = LLM(model=os.path.join(models_path_prefix, "distilbert/distilgpt2"),
                max_num_batched_tokens=4096,
-                tensor_parallel_size=1,
-                block_size=64)
+                tensor_parallel_size=1)
    prompts = ["Just say hello!"]
    outputs = llm.generate(prompts, sampling_params=sampling_params)

@@ -70,10 +71,10 @@ def test_model_from_modelscope(monkeypatch: pytest.MonkeyPatch):
    # model: https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary
    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_MODELSCOPE", "True")
-        if not gpuname.startswith('BW'):
-            llm = LLM(model=os.path.join(models_path_prefix, "qwen/Qwen1.5-0.5B-Chat"))
-        else:
+        if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND:
            llm = LLM(model=os.path.join(models_path_prefix, "qwen/Qwen1.5-0.5B-Chat"), block_size=64)
+        else:
+            llm = LLM(model=os.path.join(models_path_prefix, "qwen/Qwen1.5-0.5B-Chat"))

        prompts = [
            "Hello, my name is",

--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -852,8 +852,6 @@ class ROCmFlashAttentionImpl(AttentionImpl):
            else:
                # prefix-enabled attention -
                # not applicable for encoder-only models
-                # if not envs.VLLM_USE_TRITON_PREFIX_FLASH_ATTN:
-                #         self.fa_prefix_attn_func = vllm_flash_attn_varlen_func
                if envs.VLLM_USE_TRITON_PREFIX_FLASH_ATTN or gpuname.startswith('BW'):   
                    version_key = triton_key()
                    if self.attn_type != AttentionType.ENCODER_ONLY: