Remove unused backends from tests

308e5937 · zhuwenwen · 00f18159
Commit 308e5937 authored Nov 21, 2024 by zhuwenwen
4 changed files
--- a/tests/metrics/test_metrics.py
+++ b/tests/metrics/test_metrics.py
@@ -10,6 +10,7 @@ from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.engine.metrics import RayPrometheusStatLogger
 from vllm.sampling_params import SamplingParams
+import vllm.envs as envs

 from ..conftest import cleanup

@@ -19,7 +20,7 @@ MODELS = [


 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("dtype", [("float" if envs.VLLM_USE_TRITON_FLASH_ATTN else "half")])
 @pytest.mark.parametrize("max_tokens", [128])
 def test_metric_counter_prompt_tokens(
    vllm_runner,
@@ -54,7 +55,7 @@ def test_metric_counter_prompt_tokens(


 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("dtype", [("float" if envs.VLLM_USE_TRITON_FLASH_ATTN else "half")])
 @pytest.mark.parametrize("max_tokens", [128])
 def test_metric_counter_generation_tokens(
    vllm_runner,
@@ -86,7 +87,7 @@ def test_metric_counter_generation_tokens(


 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("dtype", [("float" if envs.VLLM_USE_TRITON_FLASH_ATTN else "half")])
 @pytest.mark.parametrize(
    "served_model_name",
    [None, [], ["ModelName0"], ["ModelName0", "ModelName1", "ModelName2"]])

--- a/tests/multi_step/test_correctness_async_llm.py
+++ b/tests/multi_step/test_correctness_async_llm.py
@@ -36,7 +36,8 @@ DEFAULT_SERVER_ARGS: List[str] = [
 @pytest.mark.parametrize("num_prompts", NUM_PROMPTS)
 @pytest.mark.parametrize("num_logprobs", [5])
 @pytest.mark.parametrize("is_async", [True])
-@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
+# @pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
+@pytest.mark.parametrize("attention_backend", ["FLASH_ATTN"])
 @pytest.mark.asyncio
 async def test_multi_step(
    example_prompts,

--- a/tests/prefix_caching/test_prefix_caching.py
+++ b/tests/prefix_caching/test_prefix_caching.py
@@ -93,7 +93,8 @@ def test_eviction(num_blocks: int, ):


 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"])
+# @pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"])
+@pytest.mark.parametrize("backend", ["FLASH_ATTN"])
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [5])
 @pytest.mark.parametrize("cached_position", [0, 1])

--- a/tests/samplers/test_rejection_sampler.py
+++ b/tests/samplers/test_rejection_sampler.py
@@ -240,7 +240,8 @@ def test_compare_nonflashinfer_backend(k: int, vocab_size: int,
            for i in range(batch_size)
        }

-    for use_flashinfer in [True, False]:
+    # for use_flashinfer in [True, False]:
+    for use_flashinfer in [False]:
        rejection_sampler = RejectionSampler(use_flashinfer=use_flashinfer)
        rejection_sampler.init_gpu_tensors(device=device)
        # We use seeded sequences to ensure the same tokens are accepted