Commit dbd62f84 authored by zhuwenwen
Browse files

[test]fix basic_correctness and benchmarks

parent 0e8619b8
......@@ -16,6 +16,8 @@ from ..models.utils import check_outputs_equal
from ..utils import multi_gpu_test
import os
from ..utils import models_path_prefix
from vllm.utils import gpuname
import vllm.envs as envs
MODELS = [
os.path.join(models_path_prefix, "google/gemma-2-2b-it"),
......@@ -35,7 +37,11 @@ def v1(run_with_both_engines):
def test_vllm_gc_ed():
"""Verify vllm instance is GC'ed when it is deleted"""
if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND:
llm = LLM(os.path.join(models_path_prefix, "distilbert/distilgpt2"), block_size=64)
else:
llm = LLM(os.path.join(models_path_prefix, "distilbert/distilgpt2"))
weak_llm = weakref.ref(llm)
del llm
# If there's any circular reference to vllm, this fails
......@@ -79,6 +85,16 @@ def test_models(
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND:
with VllmRunner(model,
max_model_len=8192,
dtype=dtype,
enforce_eager=enforce_eager,
gpu_memory_utilization=0.7,
block_size=64) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts,
max_tokens)
else:
with VllmRunner(model,
max_model_len=8192,
dtype=dtype,
......
......@@ -21,6 +21,8 @@ from ..models.utils import check_logprobs_close, check_outputs_equal
from ..utils import multi_gpu_test
import os
from ..utils import models_path_prefix
from vllm.utils import gpuname
import vllm.envs as envs
if TYPE_CHECKING:
from .conftest import HfRunner, VllmRunner
......@@ -50,7 +52,7 @@ def use_v0_only(monkeypatch: pytest.MonkeyPatch):
# NOTE: Increasing this in this suite will fail CI because we currently cannot
# reset the distributed env properly. Use a value > 1 only when you are testing.
@pytest.mark.parametrize("tensor_parallel_size", [1])
@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"] if not current_platform.is_rocm() else ["FLASH_ATTN"])
def test_models(
hf_runner: HfRunner,
vllm_runner: VllmRunner,
......@@ -85,6 +87,7 @@ def test_models(
tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager,
max_num_seqs=max_num_seqs,
block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts,
max_tokens)
......@@ -100,7 +103,7 @@ def test_models(
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"] if not current_platform.is_rocm() else ["FLASH_ATTN"])
def test_models_distributed(
hf_runner: HfRunner,
vllm_runner: VllmRunner,
......@@ -142,6 +145,7 @@ def test_models_distributed(
enable_chunked_prefill=enable_chunked_prefill,
max_num_batched_tokens=max_num_batched_tokens,
distributed_executor_backend=distributed_executor_backend,
block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(
example_prompts,
......@@ -267,6 +271,7 @@ def test_with_prefix_caching(
tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager,
max_num_seqs=max_num_seqs,
block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
) as vllm_model:
outputs[enable] = []
for prompt in full_prompts:
......
# SPDX-License-Identifier: Apache-2.0
import os
import pytest
import torch
......@@ -7,8 +8,7 @@ from vllm import LLM, SamplingParams
from vllm.device_allocator.cumem import CuMemAllocator
from vllm.utils import GiB_bytes
from ..utils import create_new_process_for_each_test
from ..utils import create_new_process_for_each_test, models_path_prefix
@create_new_process_for_each_test()
def test_python_error():
......@@ -119,9 +119,9 @@ def test_cumem_with_cudagraph():
"model, use_v1",
[
# sleep mode with safetensors
("meta-llama/Llama-3.2-1B", True),
(os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B"), True),
# sleep mode with pytorch checkpoint
("facebook/opt-125m", False),
(os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B"), False),
])
def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
with monkeypatch.context() as m:
......
......@@ -2,8 +2,10 @@
import subprocess
import pytest
import os
from ..utils import models_path_prefix
MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
MODEL_NAME = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct")
@pytest.mark.benchmark
......
......@@ -2,10 +2,11 @@
import subprocess
import pytest
import os
from ..utils import RemoteOpenAIServer
from ..utils import RemoteOpenAIServer, models_path_prefix
MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
MODEL_NAME = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct")
@pytest.fixture(scope="module")
......
# SPDX-License-Identifier: Apache-2.0
import subprocess
import os
import pytest
MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
from ..utils import models_path_prefix
MODEL_NAME = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct")
@pytest.mark.benchmark
......
......@@ -13,8 +13,9 @@ import torch
from vllm import LLM, SamplingParams
from utils import models_path_prefix
from .utils import models_path_prefix
from vllm.utils import SUPPORT_TC, gpuname
import vllm.envs as envs
@pytest.mark.skip(reason="In V1, we reject tokens > max_seq_len")
......@@ -37,15 +38,15 @@ def test_max_tokens_none():
sampling_params = SamplingParams(temperature=0.01,
top_p=0.1,
max_tokens=None)
if not gpuname.startswith('BW'):
if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND:
llm = LLM(model=os.path.join(models_path_prefix, "distilbert/distilgpt2"),
max_num_batched_tokens=4096,
tensor_parallel_size=1)
tensor_parallel_size=1,
block_size=64)
else:
llm = LLM(model=os.path.join(models_path_prefix, "distilbert/distilgpt2"),
max_num_batched_tokens=4096,
tensor_parallel_size=1,
block_size=64)
tensor_parallel_size=1)
prompts = ["Just say hello!"]
outputs = llm.generate(prompts, sampling_params=sampling_params)
......@@ -70,10 +71,10 @@ def test_model_from_modelscope(monkeypatch: pytest.MonkeyPatch):
# model: https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary
with monkeypatch.context() as m:
m.setenv("VLLM_USE_MODELSCOPE", "True")
if not gpuname.startswith('BW'):
llm = LLM(model=os.path.join(models_path_prefix, "qwen/Qwen1.5-0.5B-Chat"))
else:
if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND:
llm = LLM(model=os.path.join(models_path_prefix, "qwen/Qwen1.5-0.5B-Chat"), block_size=64)
else:
llm = LLM(model=os.path.join(models_path_prefix, "qwen/Qwen1.5-0.5B-Chat"))
prompts = [
"Hello, my name is",
......
......@@ -852,8 +852,6 @@ class ROCmFlashAttentionImpl(AttentionImpl):
else:
# prefix-enabled attention -
# not applicable for encoder-only models
# if not envs.VLLM_USE_TRITON_PREFIX_FLASH_ATTN:
# self.fa_prefix_attn_func = vllm_flash_attn_varlen_func
if envs.VLLM_USE_TRITON_PREFIX_FLASH_ATTN or gpuname.startswith('BW'):
version_key = triton_key()
if self.attn_type != AttentionType.ENCODER_ONLY:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment