Commit 308e5937 authored by zhuwenwen's avatar zhuwenwen
Browse files

remove unused backend

parent 00f18159
......@@ -10,6 +10,7 @@ from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.metrics import RayPrometheusStatLogger
from vllm.sampling_params import SamplingParams
import vllm.envs as envs
from ..conftest import cleanup
......@@ -19,7 +20,7 @@ MODELS = [
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("dtype", [("float" if envs.VLLM_USE_TRITON_FLASH_ATTN else "half")])
@pytest.mark.parametrize("max_tokens", [128])
def test_metric_counter_prompt_tokens(
vllm_runner,
......@@ -54,7 +55,7 @@ def test_metric_counter_prompt_tokens(
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("dtype", [("float" if envs.VLLM_USE_TRITON_FLASH_ATTN else "half")])
@pytest.mark.parametrize("max_tokens", [128])
def test_metric_counter_generation_tokens(
vllm_runner,
......@@ -86,7 +87,7 @@ def test_metric_counter_generation_tokens(
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize("dtype", [("float" if envs.VLLM_USE_TRITON_FLASH_ATTN else "half")])
@pytest.mark.parametrize(
"served_model_name",
[None, [], ["ModelName0"], ["ModelName0", "ModelName1", "ModelName2"]])
......
......@@ -36,7 +36,8 @@ DEFAULT_SERVER_ARGS: List[str] = [
@pytest.mark.parametrize("num_prompts", NUM_PROMPTS)
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("is_async", [True])
@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
# @pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
@pytest.mark.parametrize("attention_backend", ["FLASH_ATTN"])
@pytest.mark.asyncio
async def test_multi_step(
example_prompts,
......
......@@ -93,7 +93,8 @@ def test_eviction(num_blocks: int, ):
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"])
# @pytest.mark.parametrize("backend", ["FLASH_ATTN", "FLASHINFER", "XFORMERS"])
@pytest.mark.parametrize("backend", ["FLASH_ATTN"])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("cached_position", [0, 1])
......
......@@ -240,7 +240,8 @@ def test_compare_nonflashinfer_backend(k: int, vocab_size: int,
for i in range(batch_size)
}
for use_flashinfer in [True, False]:
# for use_flashinfer in [True, False]:
for use_flashinfer in [False]:
rejection_sampler = RejectionSampler(use_flashinfer=use_flashinfer)
rejection_sampler.init_gpu_tensors(device=device)
# We use seeded sequences to ensure the same tokens are accepted
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment