Commit dbd62f84 authored by zhuwenwen
Browse files

[test]fix basic_correctness and benchmarks

parent 0e8619b8
...@@ -16,6 +16,8 @@ from ..models.utils import check_outputs_equal ...@@ -16,6 +16,8 @@ from ..models.utils import check_outputs_equal
from ..utils import multi_gpu_test from ..utils import multi_gpu_test
import os import os
from ..utils import models_path_prefix from ..utils import models_path_prefix
from vllm.utils import gpuname
import vllm.envs as envs
MODELS = [ MODELS = [
os.path.join(models_path_prefix, "google/gemma-2-2b-it"), os.path.join(models_path_prefix, "google/gemma-2-2b-it"),
...@@ -35,7 +37,11 @@ def v1(run_with_both_engines): ...@@ -35,7 +37,11 @@ def v1(run_with_both_engines):
def test_vllm_gc_ed(): def test_vllm_gc_ed():
"""Verify vllm instance is GC'ed when it is deleted""" """Verify vllm instance is GC'ed when it is deleted"""
llm = LLM(os.path.join(models_path_prefix, "distilbert/distilgpt2")) if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND:
llm = LLM(os.path.join(models_path_prefix, "distilbert/distilgpt2"), block_size=64)
else:
llm = LLM(os.path.join(models_path_prefix, "distilbert/distilgpt2"))
weak_llm = weakref.ref(llm) weak_llm = weakref.ref(llm)
del llm del llm
# If there's any circular reference to vllm, this fails # If there's any circular reference to vllm, this fails
...@@ -79,13 +85,23 @@ def test_models( ...@@ -79,13 +85,23 @@ def test_models(
with hf_runner(model, dtype=dtype) as hf_model: with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
with VllmRunner(model, if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND:
with VllmRunner(model,
max_model_len=8192, max_model_len=8192,
dtype=dtype, dtype=dtype,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
gpu_memory_utilization=0.7) as vllm_model: gpu_memory_utilization=0.7,
vllm_outputs = vllm_model.generate_greedy(example_prompts, block_size=64) as vllm_model:
max_tokens) vllm_outputs = vllm_model.generate_greedy(example_prompts,
max_tokens)
else:
with VllmRunner(model,
max_model_len=8192,
dtype=dtype,
enforce_eager=enforce_eager,
gpu_memory_utilization=0.7) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts,
max_tokens)
check_outputs_equal( check_outputs_equal(
outputs_0_lst=hf_outputs, outputs_0_lst=hf_outputs,
...@@ -159,4 +175,4 @@ def test_models( ...@@ -159,4 +175,4 @@ def test_models(
# outputs_1_lst=vllm_outputs, # outputs_1_lst=vllm_outputs,
# name_0="hf", # name_0="hf",
# name_1="vllm", # name_1="vllm",
# ) # )
\ No newline at end of file
...@@ -21,6 +21,8 @@ from ..models.utils import check_logprobs_close, check_outputs_equal ...@@ -21,6 +21,8 @@ from ..models.utils import check_logprobs_close, check_outputs_equal
from ..utils import multi_gpu_test from ..utils import multi_gpu_test
import os import os
from ..utils import models_path_prefix from ..utils import models_path_prefix
from vllm.utils import gpuname
import vllm.envs as envs
if TYPE_CHECKING: if TYPE_CHECKING:
from .conftest import HfRunner, VllmRunner from .conftest import HfRunner, VllmRunner
...@@ -50,7 +52,7 @@ def use_v0_only(monkeypatch: pytest.MonkeyPatch): ...@@ -50,7 +52,7 @@ def use_v0_only(monkeypatch: pytest.MonkeyPatch):
# NOTE: Increasing this in this suite will fail CI because we currently cannot # NOTE: Increasing this in this suite will fail CI because we currently cannot
# reset distributed env properly. Use a value > 1 just when you test. # reset distributed env properly. Use a value > 1 just when you test.
@pytest.mark.parametrize("tensor_parallel_size", [1]) @pytest.mark.parametrize("tensor_parallel_size", [1])
@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"]) @pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"] if not current_platform.is_rocm() else ["FLASH_ATTN"])
def test_models( def test_models(
hf_runner: HfRunner, hf_runner: HfRunner,
vllm_runner: VllmRunner, vllm_runner: VllmRunner,
...@@ -85,6 +87,7 @@ def test_models( ...@@ -85,6 +87,7 @@ def test_models(
tensor_parallel_size=tensor_parallel_size, tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
max_num_seqs=max_num_seqs, max_num_seqs=max_num_seqs,
block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
) as vllm_model: ) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, vllm_outputs = vllm_model.generate_greedy(example_prompts,
max_tokens) max_tokens)
...@@ -100,7 +103,7 @@ def test_models( ...@@ -100,7 +103,7 @@ def test_models(
@multi_gpu_test(num_gpus=2) @multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"]) @pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"]) @pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"] if not current_platform.is_rocm() else ["FLASH_ATTN"])
def test_models_distributed( def test_models_distributed(
hf_runner: HfRunner, hf_runner: HfRunner,
vllm_runner: VllmRunner, vllm_runner: VllmRunner,
...@@ -142,6 +145,7 @@ def test_models_distributed( ...@@ -142,6 +145,7 @@ def test_models_distributed(
enable_chunked_prefill=enable_chunked_prefill, enable_chunked_prefill=enable_chunked_prefill,
max_num_batched_tokens=max_num_batched_tokens, max_num_batched_tokens=max_num_batched_tokens,
distributed_executor_backend=distributed_executor_backend, distributed_executor_backend=distributed_executor_backend,
block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
) as vllm_model: ) as vllm_model:
vllm_outputs = vllm_model.generate_greedy( vllm_outputs = vllm_model.generate_greedy(
example_prompts, example_prompts,
...@@ -267,6 +271,7 @@ def test_with_prefix_caching( ...@@ -267,6 +271,7 @@ def test_with_prefix_caching(
tensor_parallel_size=tensor_parallel_size, tensor_parallel_size=tensor_parallel_size,
enforce_eager=enforce_eager, enforce_eager=enforce_eager,
max_num_seqs=max_num_seqs, max_num_seqs=max_num_seqs,
block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
) as vllm_model: ) as vllm_model:
outputs[enable] = [] outputs[enable] = []
for prompt in full_prompts: for prompt in full_prompts:
...@@ -338,4 +343,4 @@ def test_with_prefix_caching_cpu( ...@@ -338,4 +343,4 @@ def test_with_prefix_caching_cpu(
chunk_size, chunk_size,
1, 1,
dtype, dtype,
) )
\ No newline at end of file
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import os
import pytest import pytest
import torch import torch
...@@ -7,8 +8,7 @@ from vllm import LLM, SamplingParams ...@@ -7,8 +8,7 @@ from vllm import LLM, SamplingParams
from vllm.device_allocator.cumem import CuMemAllocator from vllm.device_allocator.cumem import CuMemAllocator
from vllm.utils import GiB_bytes from vllm.utils import GiB_bytes
from ..utils import create_new_process_for_each_test from ..utils import create_new_process_for_each_test, models_path_prefix
@create_new_process_for_each_test() @create_new_process_for_each_test()
def test_python_error(): def test_python_error():
...@@ -119,9 +119,9 @@ def test_cumem_with_cudagraph(): ...@@ -119,9 +119,9 @@ def test_cumem_with_cudagraph():
"model, use_v1", "model, use_v1",
[ [
# sleep mode with safetensors # sleep mode with safetensors
("meta-llama/Llama-3.2-1B", True), (os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B"), True),
# sleep mode with pytorch checkpoint # sleep mode with pytorch checkpoint
("facebook/opt-125m", False), (os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B"), False),
]) ])
def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool): def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
with monkeypatch.context() as m: with monkeypatch.context() as m:
...@@ -175,4 +175,4 @@ def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool): ...@@ -175,4 +175,4 @@ def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
output3 = llm.generate(prompt, sampling_params) output3 = llm.generate(prompt, sampling_params)
# cmp output # cmp output
assert output[0].outputs[0].text == output3[0].outputs[0].text assert output[0].outputs[0].text == output3[0].outputs[0].text
\ No newline at end of file
...@@ -2,8 +2,10 @@ ...@@ -2,8 +2,10 @@
import subprocess import subprocess
import pytest import pytest
import os
from ..utils import models_path_prefix
MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct" MODEL_NAME = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct")
@pytest.mark.benchmark @pytest.mark.benchmark
...@@ -16,4 +18,4 @@ def test_bench_latency(): ...@@ -16,4 +18,4 @@ def test_bench_latency():
print(result.stdout) print(result.stdout)
print(result.stderr) print(result.stderr)
assert result.returncode == 0, f"Benchmark failed: {result.stderr}" assert result.returncode == 0, f"Benchmark failed: {result.stderr}"
\ No newline at end of file
...@@ -2,10 +2,11 @@ ...@@ -2,10 +2,11 @@
import subprocess import subprocess
import pytest import pytest
import os
from ..utils import RemoteOpenAIServer from ..utils import RemoteOpenAIServer, models_path_prefix
MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct" MODEL_NAME = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct")
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
...@@ -41,4 +42,4 @@ def test_bench_serve(server): ...@@ -41,4 +42,4 @@ def test_bench_serve(server):
print(result.stdout) print(result.stdout)
print(result.stderr) print(result.stderr)
assert result.returncode == 0, f"Benchmark failed: {result.stderr}" assert result.returncode == 0, f"Benchmark failed: {result.stderr}"
\ No newline at end of file
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import subprocess import subprocess
import os
import pytest import pytest
MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct" from ..utils import models_path_prefix
MODEL_NAME = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct")
@pytest.mark.benchmark @pytest.mark.benchmark
...@@ -16,4 +19,4 @@ def test_bench_throughput(): ...@@ -16,4 +19,4 @@ def test_bench_throughput():
print(result.stdout) print(result.stdout)
print(result.stderr) print(result.stderr)
assert result.returncode == 0, f"Benchmark failed: {result.stderr}" assert result.returncode == 0, f"Benchmark failed: {result.stderr}"
\ No newline at end of file
...@@ -13,8 +13,9 @@ import torch ...@@ -13,8 +13,9 @@ import torch
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from utils import models_path_prefix from .utils import models_path_prefix
from vllm.utils import SUPPORT_TC, gpuname from vllm.utils import SUPPORT_TC, gpuname
import vllm.envs as envs
@pytest.mark.skip(reason="In V1, we reject tokens > max_seq_len") @pytest.mark.skip(reason="In V1, we reject tokens > max_seq_len")
...@@ -37,15 +38,15 @@ def test_max_tokens_none(): ...@@ -37,15 +38,15 @@ def test_max_tokens_none():
sampling_params = SamplingParams(temperature=0.01, sampling_params = SamplingParams(temperature=0.01,
top_p=0.1, top_p=0.1,
max_tokens=None) max_tokens=None)
if not gpuname.startswith('BW'): if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND:
llm = LLM(model=os.path.join(models_path_prefix, "distilbert/distilgpt2"), llm = LLM(model=os.path.join(models_path_prefix, "distilbert/distilgpt2"),
max_num_batched_tokens=4096, max_num_batched_tokens=4096,
tensor_parallel_size=1) tensor_parallel_size=1,
block_size=64)
else: else:
llm = LLM(model=os.path.join(models_path_prefix, "distilbert/distilgpt2"), llm = LLM(model=os.path.join(models_path_prefix, "distilbert/distilgpt2"),
max_num_batched_tokens=4096, max_num_batched_tokens=4096,
tensor_parallel_size=1, tensor_parallel_size=1)
block_size=64)
prompts = ["Just say hello!"] prompts = ["Just say hello!"]
outputs = llm.generate(prompts, sampling_params=sampling_params) outputs = llm.generate(prompts, sampling_params=sampling_params)
...@@ -70,10 +71,10 @@ def test_model_from_modelscope(monkeypatch: pytest.MonkeyPatch): ...@@ -70,10 +71,10 @@ def test_model_from_modelscope(monkeypatch: pytest.MonkeyPatch):
# model: https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary # model: https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary
with monkeypatch.context() as m: with monkeypatch.context() as m:
m.setenv("VLLM_USE_MODELSCOPE", "True") m.setenv("VLLM_USE_MODELSCOPE", "True")
if not gpuname.startswith('BW'): if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND:
llm = LLM(model=os.path.join(models_path_prefix, "qwen/Qwen1.5-0.5B-Chat"))
else:
llm = LLM(model=os.path.join(models_path_prefix, "qwen/Qwen1.5-0.5B-Chat"), block_size=64) llm = LLM(model=os.path.join(models_path_prefix, "qwen/Qwen1.5-0.5B-Chat"), block_size=64)
else:
llm = LLM(model=os.path.join(models_path_prefix, "qwen/Qwen1.5-0.5B-Chat"))
prompts = [ prompts = [
"Hello, my name is", "Hello, my name is",
......
...@@ -852,8 +852,6 @@ class ROCmFlashAttentionImpl(AttentionImpl): ...@@ -852,8 +852,6 @@ class ROCmFlashAttentionImpl(AttentionImpl):
else: else:
# prefix-enabled attention - # prefix-enabled attention -
# not applicable for encoder-only models # not applicable for encoder-only models
# if not envs.VLLM_USE_TRITON_PREFIX_FLASH_ATTN:
# self.fa_prefix_attn_func = vllm_flash_attn_varlen_func
if envs.VLLM_USE_TRITON_PREFIX_FLASH_ATTN or gpuname.startswith('BW'): if envs.VLLM_USE_TRITON_PREFIX_FLASH_ATTN or gpuname.startswith('BW'):
version_key = triton_key() version_key = triton_key()
if self.attn_type != AttentionType.ENCODER_ONLY: if self.attn_type != AttentionType.ENCODER_ONLY:
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment