Commit 308e5937 authored by zhuwenwen
Browse files

remove unused backend

parent 00f18159
...@@ -10,6 +10,7 @@ from vllm.engine.arg_utils import AsyncEngineArgs ...@@ -10,6 +10,7 @@ from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.metrics import RayPrometheusStatLogger from vllm.engine.metrics import RayPrometheusStatLogger
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
import vllm.envs as envs
from ..conftest import cleanup from ..conftest import cleanup
...@@ -19,7 +20,7 @@ MODELS = [ ...@@ -19,7 +20,7 @@ MODELS = [
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("dtype", [("float" if envs.VLLM_USE_TRITON_FLASH_ATTN else "half")])
@pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("max_tokens", [128])
def test_metric_counter_prompt_tokens( def test_metric_counter_prompt_tokens(
vllm_runner, vllm_runner,
...@@ -54,7 +55,7 @@ def test_metric_counter_prompt_tokens( ...@@ -54,7 +55,7 @@ def test_metric_counter_prompt_tokens(
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("dtype", [("float" if envs.VLLM_USE_TRITON_FLASH_ATTN else "half")])
@pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("max_tokens", [128])
def test_metric_counter_generation_tokens( def test_metric_counter_generation_tokens(
vllm_runner, vllm_runner,
...@@ -86,7 +87,7 @@ def test_metric_counter_generation_tokens( ...@@ -86,7 +87,7 @@ def test_metric_counter_generation_tokens(
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("dtype", [("float" if envs.VLLM_USE_TRITON_FLASH_ATTN else "half")])
@pytest.mark.parametrize( @pytest.mark.parametrize(
"served_model_name", "served_model_name",
[None, [], ["ModelName0"], ["ModelName0", "ModelName1", "ModelName2"]]) [None, [], ["ModelName0"], ["ModelName0", "ModelName1", "ModelName2"]])
......
...@@ -36,7 +36,8 @@ DEFAULT_SERVER_ARGS: List[str] = [ ...@@ -36,7 +36,8 @@ DEFAULT_SERVER_ARGS: List[str] = [
@pytest.mark.parametrize("num_prompts", NUM_PROMPTS) @pytest.mark.parametrize("num_prompts", NUM_PROMPTS)
@pytest.mark.parametrize("num_logprobs", [5]) @pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("is_async", [True]) @pytest.mark.parametrize("is_async", [True])
@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"]) # @pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
@pytest.mark.parametrize("attention_backend", ["FLASH_ATTN"])
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_multi_step( async def test_multi_step(
example_prompts, example_prompts,
......
...@@ -93,7 +93,8 @@ def test_eviction(num_blocks: int, ): ...@@ -93,7 +93,8 @@ def test_eviction(num_blocks: int, ):
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"]) # @pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"])
@pytest.mark.parametrize("backend", ["FLASH_ATTN"])
@pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [5]) @pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("cached_position", [0, 1]) @pytest.mark.parametrize("cached_position", [0, 1])
......
...@@ -240,7 +240,8 @@ def test_compare_nonflashinfer_backend(k: int, vocab_size: int, ...@@ -240,7 +240,8 @@ def test_compare_nonflashinfer_backend(k: int, vocab_size: int,
for i in range(batch_size) for i in range(batch_size)
} }
for use_flashinfer in [True, False]: # for use_flashinfer in [True, False]:
for use_flashinfer in [False]:
rejection_sampler = RejectionSampler(use_flashinfer=use_flashinfer) rejection_sampler = RejectionSampler(use_flashinfer=use_flashinfer)
rejection_sampler.init_gpu_tensors(device=device) rejection_sampler.init_gpu_tensors(device=device)
# We use seeded sequences to ensure the same tokens are accepted # We use seeded sequences to ensure the same tokens are accepted
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment