Commit ced28510 authored by zhuwenwen
Browse files

[tests] fix tests of core, engine and detokenizer

parent 734a433d
......@@ -29,18 +29,18 @@ class TestSetting:
"test_setting",
[
# basic llama model
TestSetting(
model="meta-llama/Llama-3.2-1B-Instruct",
model_args=[],
pp_size=2,
tp_size=2,
attn_backend="FLASHINFER",
method="generate",
fullgraph=True,
),
# TestSetting(
# model=os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
# model_args=[],
# pp_size=2,
# tp_size=2,
# attn_backend="FLASHINFER",
# method="generate",
# fullgraph=True,
# ),
# llama model with quantization
TestSetting(
model="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
model=os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ"),
model_args=["--quantization", "gptq"],
pp_size=1,
tp_size=1,
......@@ -50,7 +50,7 @@ class TestSetting:
),
# MoE model
TestSetting(
model="ibm/PowerMoE-3b",
model=os.path.join(models_path_prefix, "ibm/PowerMoE-3b"),
model_args=[],
pp_size=1,
tp_size=2,
......@@ -60,7 +60,7 @@ class TestSetting:
),
# embedding model
TestSetting(
model="BAAI/bge-multilingual-gemma2",
model=os.path.join(models_path_prefix, "BAAI/bge-multilingual-gemma2"),
model_args=["--task", "embed", "--dtype", "bfloat16"],
pp_size=1,
tp_size=1,
......@@ -69,18 +69,18 @@ class TestSetting:
fullgraph=True,
),
# encoder-based embedding model (BERT)
TestSetting(
model="BAAI/bge-base-en-v1.5",
model_args=["--task", "embed"],
pp_size=1,
tp_size=1,
attn_backend="XFORMERS",
method="encode",
fullgraph=True,
),
# TestSetting(
# model=os.path.join(models_path_prefix, "BAAI/bge-base-en-v1.5"),
# model_args=["--task", "embed"],
# pp_size=1,
# tp_size=1,
# attn_backend="XFORMERS",
# method="encode",
# fullgraph=True,
# ),
# vision language model
TestSetting(
model="microsoft/Phi-3.5-vision-instruct",
model=os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct"),
model_args=["--trust-remote-code", "--max-model-len", "2048"],
pp_size=2,
tp_size=1,
......
......@@ -9,6 +9,8 @@ from vllm import SamplingParams
from .conftest import get_token_ids_from_llm_generator
import os
from ....utils import models_path_prefix
import vllm.envs as envs
from vllm.utils import SUPPORT_TC, gpuname
@pytest.mark.parametrize(
......@@ -21,7 +23,7 @@ from ....utils import models_path_prefix
"enforce_eager": True,
# Allow only 5 sequences of ~1024 tokens in worst case.
"block_size": 16,
"block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
"num_gpu_blocks_override": 5 * (64 + 1),
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
......@@ -104,19 +106,19 @@ def test_block_manager_with_preemption(baseline_llm_generator,
"per_test_common_llm_kwargs",
[
{
"block_size": 16,
"block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
# Allow only 2 sequences of ~128 tokens in worst case.
# Note 8 = 128/block_size
"num_gpu_blocks_override": 2 * (8 + 1),
},
{
"block_size": 8,
# {
# "block_size": 8,
# Allow only 2 sequences of ~128 tokens in worst case.
# Note 16 = 128/block_size
"num_gpu_blocks_override": 2 * (16 + 2),
}
# # Allow only 2 sequences of ~128 tokens in worst case.
# # Note 16 = 128/block_size
# "num_gpu_blocks_override": 2 * (16 + 2),
# }
])
@pytest.mark.parametrize("baseline_llm_kwargs", [{
"num_lookahead_slots": 0,
......@@ -197,15 +199,15 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
])
@pytest.mark.parametrize("per_test_common_llm_kwargs",
[{
"block_size": 16,
"block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
"max_num_batched_tokens": 2,
"max_num_seqs": 2,
}, {
"block_size": 16,
"block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
"max_num_batched_tokens": 3,
"max_num_seqs": 2,
}, {
"block_size": 16,
"block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
"max_num_batched_tokens": 256,
"max_num_seqs": 10,
}])
......@@ -271,7 +273,7 @@ def test_chunked_prefill_block_manager(baseline_llm_generator,
"enforce_eager": True,
# Allow only 5 sequences of ~1024 tokens in worst case.
"block_size": 16,
"block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
"num_gpu_blocks_override": 5 * (64 + 1),
# Enable prefill cache
......@@ -352,7 +354,7 @@ def test_block_manager_prefix_caching_enabled_with_preemption(
"enforce_eager": True,
# Allow only 5 sequences of ~1024 tokens in worst case.
"block_size": 16,
"block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
"num_gpu_blocks_override": 5 * (64 + 1),
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
......@@ -427,7 +429,7 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
# we keep the blocks small, so that hit eviction quickly
"max_model_len": 48,
"block_size": 16,
"block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
"num_gpu_blocks_override": 3,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
......
......@@ -2,6 +2,7 @@
from unittest.mock import MagicMock
import os
import pytest # noqa
from vllm.config import CacheConfig, SchedulerConfig
......@@ -12,6 +13,9 @@ from vllm.sampling_params import SamplingParams
from vllm.sequence import Logprob, SequenceGroup
from .utils import create_dummy_prompt
from ..utils import models_path_prefix
from vllm.utils import SUPPORT_TC, gpuname
import vllm.envs as envs
def get_sequence_groups(scheduler_output):
......@@ -830,7 +834,7 @@ def test_prefix_caching_with_concurrent_partial_prefills():
assert out.num_batched_tokens == 44
@pytest.mark.parametrize("model", ["facebook/opt-125m"])
@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/opt-125m")])
@pytest.mark.parametrize("max_num_partial_prefills", [2, 4, 8])
def test_chunked_prefill_with_actual_engine(model: str,
max_num_partial_prefills: int):
......@@ -847,6 +851,7 @@ def test_chunked_prefill_with_actual_engine(model: str,
max_num_seqs=8,
enable_chunked_prefill=True,
gpu_memory_utilization=0.8,
block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
)
engine = LLMEngine.from_engine_args(engine_args)
......
......@@ -9,6 +9,8 @@ from vllm.engine.llm_engine import LLMEngine
from vllm.platforms import current_platform
from vllm.sequence import SequenceGroup
from ..utils import models_path_prefix
from vllm.utils import SUPPORT_TC, gpuname
import vllm.envs as envs
MODEL = os.path.join(models_path_prefix, "JackFram/llama-160m")
......@@ -37,7 +39,8 @@ def test_num_computed_tokens_update(num_scheduler_steps: int,
gpu_memory_utilization=0.7,
num_scheduler_steps=num_scheduler_steps,
enable_chunked_prefill=enable_chunked_prefill,
enforce_eager=enforce_eager)
enforce_eager=enforce_eager,
block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16)
engine: LLMEngine = runner.model.llm_engine
# In multi-step + chunked-prefill there is no separate single prompt step.
......
# SPDX-License-Identifier: Apache-2.0
import os
import pytest
from vllm.entrypoints.llm import LLM
from vllm.sampling_params import SamplingParams
from ..utils import models_path_prefix
import vllm.envs as envs
from vllm.utils import SUPPORT_TC, gpuname
@pytest.mark.skip_v1
@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "distilbert/distilgpt2")])
def test_computed_prefix_blocks(model: str):
# This test checks if the engine generates completions both with and
# without optional detokenization, that detokenization includes text
......@@ -18,7 +22,7 @@ def test_computed_prefix_blocks(model: str):
"paper clips? Is there an easy to follow video tutorial available "
"online for free?")
llm = LLM(model=model)
llm = LLM(model=model, block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16)
sampling_params = SamplingParams(max_tokens=10,
temperature=0.0,
detokenize=False)
......
......@@ -2,11 +2,13 @@
from typing import Any, Optional
import os
import pytest
from vllm import LLM, SamplingParams, envs
from ..utils import models_path_prefix
MODEL = "meta-llama/llama-2-7b-hf"
MODEL = os.path.join(models_path_prefix, "meta-llama/llama-2-7b-hf")
MAX_TOKENS = 200
......
# SPDX-License-Identifier: Apache-2.0
import os
import pytest
from vllm.engine.arg_utils import EngineArgs
from vllm.engine.llm_engine import LLMEngine
from vllm.sampling_params import SamplingParams
from ..utils import models_path_prefix
from vllm.utils import SUPPORT_TC, gpuname
import vllm.envs as envs
@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "distilbert/distilgpt2")])
@pytest.mark.parametrize("block_size", [64] if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else [16])
def test_computed_prefix_blocks(model: str, block_size: int):
# This test checks if we are able to run the engine to completion
# without triggering asserts.
......
......@@ -13,6 +13,8 @@ from vllm.executor.uniproc_executor import UniProcExecutor
from vllm.sampling_params import SamplingParams
import os
from ..utils import models_path_prefix
from vllm.utils import SUPPORT_TC, gpuname
import vllm.envs as envs
class Mock:
......@@ -57,6 +59,7 @@ def test_custom_executor(model, tmp_path):
model=model,
distributed_executor_backend=CustomUniExecutor,
enforce_eager=True, # reduce test time
block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
)
engine = LLMEngine.from_engine_args(engine_args)
sampling_params = SamplingParams(max_tokens=1)
......@@ -69,7 +72,7 @@ def test_custom_executor(model, tmp_path):
os.chdir(cwd)
@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "distilbert/distilgpt2")])
def test_custom_executor_async(model, tmp_path):
cwd = os.path.abspath(".")
os.chdir(tmp_path)
......@@ -80,6 +83,7 @@ def test_custom_executor_async(model, tmp_path):
model=model,
distributed_executor_backend=CustomUniExecutorAsync,
enforce_eager=True, # reduce test time
block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
)
engine = AsyncLLMEngine.from_engine_args(engine_args)
sampling_params = SamplingParams(max_tokens=1)
......@@ -96,7 +100,7 @@ def test_custom_executor_async(model, tmp_path):
os.chdir(cwd)
@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "distilbert/distilgpt2")])
def test_respect_ray(model):
# even for TP=1 and PP=1,
# if users specify ray, we should use ray.
......@@ -106,6 +110,7 @@ def test_respect_ray(model):
model=model,
distributed_executor_backend="ray",
enforce_eager=True, # reduce test time
block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
)
engine = LLMEngine.from_engine_args(engine_args)
assert engine.model_executor.uses_ray
\ No newline at end of file
# SPDX-License-Identifier: Apache-2.0
import os
import pytest
from ..conftest import IMAGE_ASSETS
......
# SPDX-License-Identifier: Apache-2.0
import os
import pytest
from vllm.entrypoints.llm import LLM
from vllm.sampling_params import SamplingParams
from ..utils import models_path_prefix
from vllm.utils import SUPPORT_TC, gpuname
import vllm.envs as envs
@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "distilbert/distilgpt2")])
def test_skip_tokenizer_initialization(model: str):
# This test checks if the flag skip_tokenizer_init skips the initialization
# of tokenizer and detokenizer. The generated output is expected to contain
......@@ -14,6 +18,7 @@ def test_skip_tokenizer_initialization(model: str):
llm = LLM(
model=model,
skip_tokenizer_init=True,
block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
)
sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True)
......
# SPDX-License-Identifier: Apache-2.0
import os
from vllm import SamplingParams
from vllm.config import LoadFormat
from ..utils import models_path_prefix
test_model = "openai-community/gpt2"
test_model = os.path.join(models_path_prefix, "openai-community/gpt2")
prompts = [
"Hello, my name is",
......
......@@ -2,9 +2,11 @@
import glob
import tempfile
import os
import huggingface_hub.constants
import torch
from ..utils import models_path_prefix
from vllm.model_executor.model_loader.weight_utils import (
download_weights_from_hf, fastsafetensors_weights_iterator,
......@@ -14,7 +16,7 @@ from vllm.model_executor.model_loader.weight_utils import (
def test_fastsafetensors_model_loader():
with tempfile.TemporaryDirectory() as tmpdir:
huggingface_hub.constants.HF_HUB_OFFLINE = False
download_weights_from_hf("openai-community/gpt2",
download_weights_from_hf(os.path.join(models_path_prefix, "openai-community/gpt2"),
allow_patterns=["*.safetensors"],
cache_dir=tmpdir)
safetensors = glob.glob(f"{tmpdir}/**/*.safetensors", recursive=True)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment