Commit ced28510 authored by zhuwenwen's avatar zhuwenwen
Browse files

[tests] fix tests of core, engine and detokenizer

parent 734a433d
......@@ -29,18 +29,18 @@ class TestSetting:
"test_setting",
[
# basic llama model
TestSetting(
model="meta-llama/Llama-3.2-1B-Instruct",
model_args=[],
pp_size=2,
tp_size=2,
attn_backend="FLASHINFER",
method="generate",
fullgraph=True,
),
# TestSetting(
# model=os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
# model_args=[],
# pp_size=2,
# tp_size=2,
# attn_backend="FLASHINFER",
# method="generate",
# fullgraph=True,
# ),
# llama model with quantization
TestSetting(
model="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ",
model=os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ"),
model_args=["--quantization", "gptq"],
pp_size=1,
tp_size=1,
......@@ -50,7 +50,7 @@ class TestSetting:
),
# MoE model
TestSetting(
model="ibm/PowerMoE-3b",
model=os.path.join(models_path_prefix, "ibm/PowerMoE-3b"),
model_args=[],
pp_size=1,
tp_size=2,
......@@ -60,7 +60,7 @@ class TestSetting:
),
# embedding model
TestSetting(
model="BAAI/bge-multilingual-gemma2",
model=os.path.join(models_path_prefix, "BAAI/bge-multilingual-gemma2"),
model_args=["--task", "embed", "--dtype", "bfloat16"],
pp_size=1,
tp_size=1,
......@@ -69,18 +69,18 @@ class TestSetting:
fullgraph=True,
),
# encoder-based embedding model (BERT)
TestSetting(
model="BAAI/bge-base-en-v1.5",
model_args=["--task", "embed"],
pp_size=1,
tp_size=1,
attn_backend="XFORMERS",
method="encode",
fullgraph=True,
),
# TestSetting(
# model=os.path.join(models_path_prefix, "BAAI/bge-base-en-v1.5"),
# model_args=["--task", "embed"],
# pp_size=1,
# tp_size=1,
# attn_backend="XFORMERS",
# method="encode",
# fullgraph=True,
# ),
# vision language model
TestSetting(
model="microsoft/Phi-3.5-vision-instruct",
model=os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct"),
model_args=["--trust-remote-code", "--max-model-len", "2048"],
pp_size=2,
tp_size=1,
......@@ -146,4 +146,4 @@ def test_compile_correctness(
all_envs[-1][
"VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE"] = "0" # type: ignore
compare_all_settings(model, all_args * 3, all_envs, method=method)
compare_all_settings(model, all_args * 3, all_envs, method=method)
\ No newline at end of file
......@@ -9,6 +9,8 @@ from vllm import SamplingParams
from .conftest import get_token_ids_from_llm_generator
import os
from ....utils import models_path_prefix
import vllm.envs as envs
from vllm.utils import SUPPORT_TC, gpuname
@pytest.mark.parametrize(
......@@ -21,7 +23,7 @@ from ....utils import models_path_prefix
"enforce_eager": True,
# Allow only 5 sequences of ~1024 tokens in worst case.
"block_size": 16,
"block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
"num_gpu_blocks_override": 5 * (64 + 1),
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
......@@ -104,19 +106,19 @@ def test_block_manager_with_preemption(baseline_llm_generator,
"per_test_common_llm_kwargs",
[
{
"block_size": 16,
"block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
# Allow only 2 sequences of ~128 tokens in worst case.
# Note 8 = 128/block_size
"num_gpu_blocks_override": 2 * (8 + 1),
},
{
"block_size": 8,
# {
# "block_size": 8,
# Allow only 2 sequences of ~128 tokens in worst case.
# Note 16 = 128/block_size
"num_gpu_blocks_override": 2 * (16 + 2),
}
# # Allow only 2 sequences of ~128 tokens in worst case.
# # Note 16 = 128/block_size
# "num_gpu_blocks_override": 2 * (16 + 2),
# }
])
@pytest.mark.parametrize("baseline_llm_kwargs", [{
"num_lookahead_slots": 0,
......@@ -197,15 +199,15 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
])
@pytest.mark.parametrize("per_test_common_llm_kwargs",
[{
"block_size": 16,
"block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
"max_num_batched_tokens": 2,
"max_num_seqs": 2,
}, {
"block_size": 16,
"block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
"max_num_batched_tokens": 3,
"max_num_seqs": 2,
}, {
"block_size": 16,
"block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
"max_num_batched_tokens": 256,
"max_num_seqs": 10,
}])
......@@ -271,7 +273,7 @@ def test_chunked_prefill_block_manager(baseline_llm_generator,
"enforce_eager": True,
# Allow only 5 sequences of ~1024 tokens in worst case.
"block_size": 16,
"block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
"num_gpu_blocks_override": 5 * (64 + 1),
# Enable prefill cache
......@@ -352,7 +354,7 @@ def test_block_manager_prefix_caching_enabled_with_preemption(
"enforce_eager": True,
# Allow only 5 sequences of ~1024 tokens in worst case.
"block_size": 16,
"block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
"num_gpu_blocks_override": 5 * (64 + 1),
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
......@@ -427,7 +429,7 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
# we keep the blocks small, so that hit eviction quickly
"max_model_len": 48,
"block_size": 16,
"block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
"num_gpu_blocks_override": 3,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
......@@ -477,4 +479,4 @@ def test_auto_prefix_caching_after_evition_start(baseline_llm_generator,
test_token_ids):
assert expected_token_ids == actual_token_ids
assert baseline_token_ids == test_token_ids
assert baseline_token_ids == test_token_ids
\ No newline at end of file
......@@ -2,6 +2,7 @@
from unittest.mock import MagicMock
import os
import pytest # noqa
from vllm.config import CacheConfig, SchedulerConfig
......@@ -12,6 +13,9 @@ from vllm.sampling_params import SamplingParams
from vllm.sequence import Logprob, SequenceGroup
from .utils import create_dummy_prompt
from ..utils import models_path_prefix
from vllm.utils import SUPPORT_TC, gpuname
import vllm.envs as envs
def get_sequence_groups(scheduler_output):
......@@ -830,7 +834,7 @@ def test_prefix_caching_with_concurrent_partial_prefills():
assert out.num_batched_tokens == 44
@pytest.mark.parametrize("model", ["facebook/opt-125m"])
@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/opt-125m")])
@pytest.mark.parametrize("max_num_partial_prefills", [2, 4, 8])
def test_chunked_prefill_with_actual_engine(model: str,
max_num_partial_prefills: int):
......@@ -847,6 +851,7 @@ def test_chunked_prefill_with_actual_engine(model: str,
max_num_seqs=8,
enable_chunked_prefill=True,
gpu_memory_utilization=0.8,
block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
)
engine = LLMEngine.from_engine_args(engine_args)
......@@ -858,4 +863,4 @@ def test_chunked_prefill_with_actual_engine(model: str,
request_outputs = engine.step()
# means all are prefilling
assert len(request_outputs) == 0
assert len(engine.scheduler[0].running) == max_num_partial_prefills
assert len(engine.scheduler[0].running) == max_num_partial_prefills
\ No newline at end of file
......@@ -9,6 +9,8 @@ from vllm.engine.llm_engine import LLMEngine
from vllm.platforms import current_platform
from vllm.sequence import SequenceGroup
from ..utils import models_path_prefix
from vllm.utils import SUPPORT_TC, gpuname
import vllm.envs as envs
MODEL = os.path.join(models_path_prefix, "JackFram/llama-160m")
......@@ -37,7 +39,8 @@ def test_num_computed_tokens_update(num_scheduler_steps: int,
gpu_memory_utilization=0.7,
num_scheduler_steps=num_scheduler_steps,
enable_chunked_prefill=enable_chunked_prefill,
enforce_eager=enforce_eager)
enforce_eager=enforce_eager,
block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16)
engine: LLMEngine = runner.model.llm_engine
# In multi-step + chunked-prefill there is no separate single prompt step.
......@@ -81,4 +84,4 @@ def test_num_computed_tokens_update(num_scheduler_steps: int,
# Test correctness of num_computed_tokens after the sequence finish.
assert seq.data.get_num_computed_tokens(
) == prompt_len + num_output_tokens - 1
) == prompt_len + num_output_tokens - 1
\ No newline at end of file
# SPDX-License-Identifier: Apache-2.0
import os
import pytest
from vllm.entrypoints.llm import LLM
from vllm.sampling_params import SamplingParams
from ..utils import models_path_prefix
import vllm.envs as envs
from vllm.utils import SUPPORT_TC, gpuname
@pytest.mark.skip_v1
@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "distilbert/distilgpt2")])
def test_computed_prefix_blocks(model: str):
# This test checks if the engine generates completions both with and
# without optional detokenization, that detokenization includes text
......@@ -18,7 +22,7 @@ def test_computed_prefix_blocks(model: str):
"paper clips? Is there an easy to follow video tutorial available "
"online for free?")
llm = LLM(model=model)
llm = LLM(model=model, block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16)
sampling_params = SamplingParams(max_tokens=10,
temperature=0.0,
detokenize=False)
......@@ -32,4 +36,4 @@ def test_computed_prefix_blocks(model: str):
assert outputs_no_detokenization.text == ''
assert outputs_with_detokenization.text != ''
assert outputs_no_detokenization.token_ids == \
outputs_with_detokenization.token_ids
outputs_with_detokenization.token_ids
\ No newline at end of file
......@@ -2,11 +2,13 @@
from typing import Any, Optional
import os
import pytest
from vllm import LLM, SamplingParams, envs
from ..utils import models_path_prefix
MODEL = "meta-llama/llama-2-7b-hf"
MODEL = os.path.join(models_path_prefix, "meta-llama/llama-2-7b-hf")
MAX_TOKENS = 200
......@@ -138,4 +140,4 @@ def test_stop_strings():
_stop_token_id(vllm_model)
_set_async_mode(vllm_model, False)
_stop_token_id(vllm_model)
_stop_token_id(vllm_model)
\ No newline at end of file
# SPDX-License-Identifier: Apache-2.0
import os
import pytest
from vllm.engine.arg_utils import EngineArgs
from vllm.engine.llm_engine import LLMEngine
from vllm.sampling_params import SamplingParams
from ..utils import models_path_prefix
from vllm.utils import SUPPORT_TC, gpuname
import vllm.envs as envs
@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
@pytest.mark.parametrize("block_size", [16])
@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "distilbert/distilgpt2")])
@pytest.mark.parametrize("block_size", [64] if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else [16])
def test_computed_prefix_blocks(model: str, block_size: int):
# This test checks if we are able to run the engine to completion
# without triggering asserts.
......@@ -33,4 +37,4 @@ def test_computed_prefix_blocks(model: str, block_size: int):
engine.add_request("0", prompt + prompt2, sampling_params)
engine.step()
engine.add_request("1", prompt, sampling_params)
engine.step()
engine.step()
\ No newline at end of file
......@@ -13,6 +13,8 @@ from vllm.executor.uniproc_executor import UniProcExecutor
from vllm.sampling_params import SamplingParams
import os
from ..utils import models_path_prefix
from vllm.utils import SUPPORT_TC, gpuname
import vllm.envs as envs
class Mock:
......@@ -57,6 +59,7 @@ def test_custom_executor(model, tmp_path):
model=model,
distributed_executor_backend=CustomUniExecutor,
enforce_eager=True, # reduce test time
block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
)
engine = LLMEngine.from_engine_args(engine_args)
sampling_params = SamplingParams(max_tokens=1)
......@@ -69,7 +72,7 @@ def test_custom_executor(model, tmp_path):
os.chdir(cwd)
@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "distilbert/distilgpt2")])
def test_custom_executor_async(model, tmp_path):
cwd = os.path.abspath(".")
os.chdir(tmp_path)
......@@ -80,6 +83,7 @@ def test_custom_executor_async(model, tmp_path):
model=model,
distributed_executor_backend=CustomUniExecutorAsync,
enforce_eager=True, # reduce test time
block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
)
engine = AsyncLLMEngine.from_engine_args(engine_args)
sampling_params = SamplingParams(max_tokens=1)
......@@ -96,7 +100,7 @@ def test_custom_executor_async(model, tmp_path):
os.chdir(cwd)
@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "distilbert/distilgpt2")])
def test_respect_ray(model):
# even for TP=1 and PP=1,
# if users specify ray, we should use ray.
......@@ -106,6 +110,7 @@ def test_respect_ray(model):
model=model,
distributed_executor_backend="ray",
enforce_eager=True, # reduce test time
block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
)
engine = LLMEngine.from_engine_args(engine_args)
assert engine.model_executor.uses_ray
assert engine.model_executor.uses_ray
\ No newline at end of file
# SPDX-License-Identifier: Apache-2.0
import os
import pytest
from ..conftest import IMAGE_ASSETS
......@@ -30,4 +31,4 @@ def test_context_length_too_short(vllm_runner, image_assets, model):
with vllm_model:
vllm_model.generate_greedy([HF_IMAGE_PROMPTS[0]],
max_tokens=1,
images=[images[0]])
images=[images[0]])
\ No newline at end of file
# SPDX-License-Identifier: Apache-2.0
import os
import pytest
from vllm.entrypoints.llm import LLM
from vllm.sampling_params import SamplingParams
from ..utils import models_path_prefix
from vllm.utils import SUPPORT_TC, gpuname
import vllm.envs as envs
@pytest.mark.parametrize("model", ["distilbert/distilgpt2"])
@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "distilbert/distilgpt2")])
def test_skip_tokenizer_initialization(model: str):
# This test checks if the flag skip_tokenizer_init skips the initialization
# of tokenizer and detokenizer. The generated output is expected to contain
......@@ -14,6 +18,7 @@ def test_skip_tokenizer_initialization(model: str):
llm = LLM(
model=model,
skip_tokenizer_init=True,
block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
)
sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True)
......@@ -26,4 +31,4 @@ def test_skip_tokenizer_initialization(model: str):
completions = outputs[0].outputs
assert len(completions) > 0
assert completions[0].text == ""
assert completions[0].token_ids
assert completions[0].token_ids
\ No newline at end of file
# SPDX-License-Identifier: Apache-2.0
import os
from vllm import SamplingParams
from vllm.config import LoadFormat
from ..utils import models_path_prefix
test_model = "openai-community/gpt2"
test_model = os.path.join(models_path_prefix, "openai-community/gpt2")
prompts = [
"Hello, my name is",
......@@ -19,4 +21,4 @@ def test_model_loader_download_files(vllm_runner):
with vllm_runner(test_model,
load_format=LoadFormat.FASTSAFETENSORS) as llm:
deserialized_outputs = llm.generate(prompts, sampling_params)
assert deserialized_outputs
assert deserialized_outputs
\ No newline at end of file
......@@ -2,9 +2,11 @@
import glob
import tempfile
import os
import huggingface_hub.constants
import torch
from ..utils import models_path_prefix
from vllm.model_executor.model_loader.weight_utils import (
download_weights_from_hf, fastsafetensors_weights_iterator,
......@@ -14,7 +16,7 @@ from vllm.model_executor.model_loader.weight_utils import (
def test_fastsafetensors_model_loader():
with tempfile.TemporaryDirectory() as tmpdir:
huggingface_hub.constants.HF_HUB_OFFLINE = False
download_weights_from_hf("openai-community/gpt2",
download_weights_from_hf(os.path.join(models_path_prefix, "openai-community/gpt2"),
allow_patterns=["*.safetensors"],
cache_dir=tmpdir)
safetensors = glob.glob(f"{tmpdir}/**/*.safetensors", recursive=True)
......@@ -43,4 +45,4 @@ def test_fastsafetensors_model_loader():
if __name__ == "__main__":
test_fastsafetensors_model_loader()
test_fastsafetensors_model_loader()
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment