"docs/vscode:/vscode.git/clone" did not exist on "70b1b330e10f5eba8bf003500834d214c8b4a559"
Commit ced28510 authored by zhuwenwen's avatar zhuwenwen
Browse files

[tests] fix tests of core, engine and detokenizer

parent 734a433d
...@@ -29,18 +29,18 @@ class TestSetting: ...@@ -29,18 +29,18 @@ class TestSetting:
"test_setting", "test_setting",
[ [
# basic llama model # basic llama model
TestSetting( # TestSetting(
model="meta-llama/Llama-3.2-1B-Instruct", # model=os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
model_args=[], # model_args=[],
pp_size=2, # pp_size=2,
tp_size=2, # tp_size=2,
attn_backend="FLASHINFER", # attn_backend="FLASHINFER",
method="generate", # method="generate",
fullgraph=True, # fullgraph=True,
), # ),
# llama model with quantization # llama model with quantization
TestSetting( TestSetting(
model="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", model=os.path.join(models_path_prefix, "TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ"),
model_args=["--quantization", "gptq"], model_args=["--quantization", "gptq"],
pp_size=1, pp_size=1,
tp_size=1, tp_size=1,
...@@ -50,7 +50,7 @@ class TestSetting: ...@@ -50,7 +50,7 @@ class TestSetting:
), ),
# MoE model # MoE model
TestSetting( TestSetting(
model="ibm/PowerMoE-3b", model=os.path.join(models_path_prefix, "ibm/PowerMoE-3b"),
model_args=[], model_args=[],
pp_size=1, pp_size=1,
tp_size=2, tp_size=2,
...@@ -60,7 +60,7 @@ class TestSetting: ...@@ -60,7 +60,7 @@ class TestSetting:
), ),
# embedding model # embedding model
TestSetting( TestSetting(
model="BAAI/bge-multilingual-gemma2", model=os.path.join(models_path_prefix, "BAAI/bge-multilingual-gemma2"),
model_args=["--task", "embed", "--dtype", "bfloat16"], model_args=["--task", "embed", "--dtype", "bfloat16"],
pp_size=1, pp_size=1,
tp_size=1, tp_size=1,
...@@ -69,18 +69,18 @@ class TestSetting: ...@@ -69,18 +69,18 @@ class TestSetting:
fullgraph=True, fullgraph=True,
), ),
# encoder-based embedding model (BERT) # encoder-based embedding model (BERT)
TestSetting( # TestSetting(
model="BAAI/bge-base-en-v1.5", # model=os.path.join(models_path_prefix, "BAAI/bge-base-en-v1.5"),
model_args=["--task", "embed"], # model_args=["--task", "embed"],
pp_size=1, # pp_size=1,
tp_size=1, # tp_size=1,
attn_backend="XFORMERS", # attn_backend="XFORMERS",
method="encode", # method="encode",
fullgraph=True, # fullgraph=True,
), # ),
# vision language model # vision language model
TestSetting( TestSetting(
model="microsoft/Phi-3.5-vision-instruct", model=os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct"),
model_args=["--trust-remote-code", "--max-model-len", "2048"], model_args=["--trust-remote-code", "--max-model-len", "2048"],
pp_size=2, pp_size=2,
tp_size=1, tp_size=1,
......
...@@ -9,6 +9,8 @@ from vllm import SamplingParams ...@@ -9,6 +9,8 @@ from vllm import SamplingParams
from .conftest import get_token_ids_from_llm_generator from .conftest import get_token_ids_from_llm_generator
import os import os
from ....utils import models_path_prefix from ....utils import models_path_prefix
import vllm.envs as envs
from vllm.utils import SUPPORT_TC, gpuname
@pytest.mark.parametrize( @pytest.mark.parametrize(
...@@ -21,7 +23,7 @@ from ....utils import models_path_prefix ...@@ -21,7 +23,7 @@ from ....utils import models_path_prefix
"enforce_eager": True, "enforce_eager": True,
# Allow only 5 sequences of ~1024 tokens in worst case. # Allow only 5 sequences of ~1024 tokens in worst case.
"block_size": 16, "block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
"num_gpu_blocks_override": 5 * (64 + 1), "num_gpu_blocks_override": 5 * (64 + 1),
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
...@@ -104,19 +106,19 @@ def test_block_manager_with_preemption(baseline_llm_generator, ...@@ -104,19 +106,19 @@ def test_block_manager_with_preemption(baseline_llm_generator,
"per_test_common_llm_kwargs", "per_test_common_llm_kwargs",
[ [
{ {
"block_size": 16, "block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
# Allow only 2 sequences of ~128 tokens in worst case. # Allow only 2 sequences of ~128 tokens in worst case.
# Note 8 = 128/block_size # Note 8 = 128/block_size
"num_gpu_blocks_override": 2 * (8 + 1), "num_gpu_blocks_override": 2 * (8 + 1),
}, },
{ # {
"block_size": 8, # "block_size": 8,
# Allow only 2 sequences of ~128 tokens in worst case. # # Allow only 2 sequences of ~128 tokens in worst case.
# Note 16 = 128/block_size # # Note 16 = 128/block_size
"num_gpu_blocks_override": 2 * (16 + 2), # "num_gpu_blocks_override": 2 * (16 + 2),
} # }
]) ])
@pytest.mark.parametrize("baseline_llm_kwargs", [{ @pytest.mark.parametrize("baseline_llm_kwargs", [{
"num_lookahead_slots": 0, "num_lookahead_slots": 0,
...@@ -197,15 +199,15 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator, ...@@ -197,15 +199,15 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
]) ])
@pytest.mark.parametrize("per_test_common_llm_kwargs", @pytest.mark.parametrize("per_test_common_llm_kwargs",
[{ [{
"block_size": 16, "block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
"max_num_batched_tokens": 2, "max_num_batched_tokens": 2,
"max_num_seqs": 2, "max_num_seqs": 2,
}, { }, {
"block_size": 16, "block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
"max_num_batched_tokens": 3, "max_num_batched_tokens": 3,
"max_num_seqs": 2, "max_num_seqs": 2,
}, { }, {
"block_size": 16, "block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
"max_num_batched_tokens": 256, "max_num_batched_tokens": 256,
"max_num_seqs": 10, "max_num_seqs": 10,
}]) }])
...@@ -271,7 +273,7 @@ def test_chunked_prefill_block_manager(baseline_llm_generator, ...@@ -271,7 +273,7 @@ def test_chunked_prefill_block_manager(baseline_llm_generator,
"enforce_eager": True, "enforce_eager": True,
# Allow only 5 sequences of ~1024 tokens in worst case. # Allow only 5 sequences of ~1024 tokens in worst case.
"block_size": 16, "block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
"num_gpu_blocks_override": 5 * (64 + 1), "num_gpu_blocks_override": 5 * (64 + 1),
# Enable prefill cache # Enable prefill cache
...@@ -352,7 +354,7 @@ def test_block_manager_prefix_caching_enabled_with_preemption( ...@@ -352,7 +354,7 @@ def test_block_manager_prefix_caching_enabled_with_preemption(
"enforce_eager": True, "enforce_eager": True,
# Allow only 5 sequences of ~1024 tokens in worst case. # Allow only 5 sequences of ~1024 tokens in worst case.
"block_size": 16, "block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
"num_gpu_blocks_override": 5 * (64 + 1), "num_gpu_blocks_override": 5 * (64 + 1),
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
...@@ -427,7 +429,7 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator, ...@@ -427,7 +429,7 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
# we keep the blocks small, so that hit eviction quickly # we keep the blocks small, so that hit eviction quickly
"max_model_len": 48, "max_model_len": 48,
"block_size": 16, "block_size": 64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
"num_gpu_blocks_override": 3, "num_gpu_blocks_override": 3,
}]) }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}]) @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
......
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
from unittest.mock import MagicMock from unittest.mock import MagicMock
import os
import pytest # noqa import pytest # noqa
from vllm.config import CacheConfig, SchedulerConfig from vllm.config import CacheConfig, SchedulerConfig
...@@ -12,6 +13,9 @@ from vllm.sampling_params import SamplingParams ...@@ -12,6 +13,9 @@ from vllm.sampling_params import SamplingParams
from vllm.sequence import Logprob, SequenceGroup from vllm.sequence import Logprob, SequenceGroup
from .utils import create_dummy_prompt from .utils import create_dummy_prompt
from ..utils import models_path_prefix
from vllm.utils import SUPPORT_TC, gpuname
import vllm.envs as envs
def get_sequence_groups(scheduler_output): def get_sequence_groups(scheduler_output):
...@@ -830,7 +834,7 @@ def test_prefix_caching_with_concurrent_partial_prefills(): ...@@ -830,7 +834,7 @@ def test_prefix_caching_with_concurrent_partial_prefills():
assert out.num_batched_tokens == 44 assert out.num_batched_tokens == 44
@pytest.mark.parametrize("model", ["facebook/opt-125m"]) @pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "facebook/opt-125m")])
@pytest.mark.parametrize("max_num_partial_prefills", [2, 4, 8]) @pytest.mark.parametrize("max_num_partial_prefills", [2, 4, 8])
def test_chunked_prefill_with_actual_engine(model: str, def test_chunked_prefill_with_actual_engine(model: str,
max_num_partial_prefills: int): max_num_partial_prefills: int):
...@@ -847,6 +851,7 @@ def test_chunked_prefill_with_actual_engine(model: str, ...@@ -847,6 +851,7 @@ def test_chunked_prefill_with_actual_engine(model: str,
max_num_seqs=8, max_num_seqs=8,
enable_chunked_prefill=True, enable_chunked_prefill=True,
gpu_memory_utilization=0.8, gpu_memory_utilization=0.8,
block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
) )
engine = LLMEngine.from_engine_args(engine_args) engine = LLMEngine.from_engine_args(engine_args)
......
...@@ -9,6 +9,8 @@ from vllm.engine.llm_engine import LLMEngine ...@@ -9,6 +9,8 @@ from vllm.engine.llm_engine import LLMEngine
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.sequence import SequenceGroup from vllm.sequence import SequenceGroup
from ..utils import models_path_prefix from ..utils import models_path_prefix
from vllm.utils import SUPPORT_TC, gpuname
import vllm.envs as envs
MODEL = os.path.join(models_path_prefix, "JackFram/llama-160m") MODEL = os.path.join(models_path_prefix, "JackFram/llama-160m")
...@@ -37,7 +39,8 @@ def test_num_computed_tokens_update(num_scheduler_steps: int, ...@@ -37,7 +39,8 @@ def test_num_computed_tokens_update(num_scheduler_steps: int,
gpu_memory_utilization=0.7, gpu_memory_utilization=0.7,
num_scheduler_steps=num_scheduler_steps, num_scheduler_steps=num_scheduler_steps,
enable_chunked_prefill=enable_chunked_prefill, enable_chunked_prefill=enable_chunked_prefill,
enforce_eager=enforce_eager) enforce_eager=enforce_eager,
block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16)
engine: LLMEngine = runner.model.llm_engine engine: LLMEngine = runner.model.llm_engine
# In multi-step + chunked-prefill there is no separate single prompt step. # In multi-step + chunked-prefill there is no separate single prompt step.
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import os
import pytest import pytest
from vllm.entrypoints.llm import LLM from vllm.entrypoints.llm import LLM
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from ..utils import models_path_prefix
import vllm.envs as envs
from vllm.utils import SUPPORT_TC, gpuname
@pytest.mark.skip_v1 @pytest.mark.skip_v1
@pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) @pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "distilbert/distilgpt2")])
def test_computed_prefix_blocks(model: str): def test_computed_prefix_blocks(model: str):
# This test checks if the engine generates completions both with and # This test checks if the engine generates completions both with and
# without optional detokenization, that detokenization includes text # without optional detokenization, that detokenization includes text
...@@ -18,7 +22,7 @@ def test_computed_prefix_blocks(model: str): ...@@ -18,7 +22,7 @@ def test_computed_prefix_blocks(model: str):
"paper clips? Is there an easy to follow video tutorial available " "paper clips? Is there an easy to follow video tutorial available "
"online for free?") "online for free?")
llm = LLM(model=model) llm = LLM(model=model, block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16)
sampling_params = SamplingParams(max_tokens=10, sampling_params = SamplingParams(max_tokens=10,
temperature=0.0, temperature=0.0,
detokenize=False) detokenize=False)
......
...@@ -2,11 +2,13 @@ ...@@ -2,11 +2,13 @@
from typing import Any, Optional from typing import Any, Optional
import os
import pytest import pytest
from vllm import LLM, SamplingParams, envs from vllm import LLM, SamplingParams, envs
from ..utils import models_path_prefix
MODEL = "meta-llama/llama-2-7b-hf" MODEL = os.path.join(models_path_prefix, "meta-llama/llama-2-7b-hf")
MAX_TOKENS = 200 MAX_TOKENS = 200
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import os
import pytest import pytest
from vllm.engine.arg_utils import EngineArgs from vllm.engine.arg_utils import EngineArgs
from vllm.engine.llm_engine import LLMEngine from vllm.engine.llm_engine import LLMEngine
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from ..utils import models_path_prefix
from vllm.utils import SUPPORT_TC, gpuname
import vllm.envs as envs
@pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) @pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "distilbert/distilgpt2")])
@pytest.mark.parametrize("block_size", [16]) @pytest.mark.parametrize("block_size", [64] if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else [16])
def test_computed_prefix_blocks(model: str, block_size: int): def test_computed_prefix_blocks(model: str, block_size: int):
# This test checks if we are able to run the engine to completion # This test checks if we are able to run the engine to completion
# without triggering asserts. # without triggering asserts.
......
...@@ -13,6 +13,8 @@ from vllm.executor.uniproc_executor import UniProcExecutor ...@@ -13,6 +13,8 @@ from vllm.executor.uniproc_executor import UniProcExecutor
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
import os import os
from ..utils import models_path_prefix from ..utils import models_path_prefix
from vllm.utils import SUPPORT_TC, gpuname
import vllm.envs as envs
class Mock: class Mock:
...@@ -57,6 +59,7 @@ def test_custom_executor(model, tmp_path): ...@@ -57,6 +59,7 @@ def test_custom_executor(model, tmp_path):
model=model, model=model,
distributed_executor_backend=CustomUniExecutor, distributed_executor_backend=CustomUniExecutor,
enforce_eager=True, # reduce test time enforce_eager=True, # reduce test time
block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
) )
engine = LLMEngine.from_engine_args(engine_args) engine = LLMEngine.from_engine_args(engine_args)
sampling_params = SamplingParams(max_tokens=1) sampling_params = SamplingParams(max_tokens=1)
...@@ -69,7 +72,7 @@ def test_custom_executor(model, tmp_path): ...@@ -69,7 +72,7 @@ def test_custom_executor(model, tmp_path):
os.chdir(cwd) os.chdir(cwd)
@pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) @pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "distilbert/distilgpt2")])
def test_custom_executor_async(model, tmp_path): def test_custom_executor_async(model, tmp_path):
cwd = os.path.abspath(".") cwd = os.path.abspath(".")
os.chdir(tmp_path) os.chdir(tmp_path)
...@@ -80,6 +83,7 @@ def test_custom_executor_async(model, tmp_path): ...@@ -80,6 +83,7 @@ def test_custom_executor_async(model, tmp_path):
model=model, model=model,
distributed_executor_backend=CustomUniExecutorAsync, distributed_executor_backend=CustomUniExecutorAsync,
enforce_eager=True, # reduce test time enforce_eager=True, # reduce test time
block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
) )
engine = AsyncLLMEngine.from_engine_args(engine_args) engine = AsyncLLMEngine.from_engine_args(engine_args)
sampling_params = SamplingParams(max_tokens=1) sampling_params = SamplingParams(max_tokens=1)
...@@ -96,7 +100,7 @@ def test_custom_executor_async(model, tmp_path): ...@@ -96,7 +100,7 @@ def test_custom_executor_async(model, tmp_path):
os.chdir(cwd) os.chdir(cwd)
@pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) @pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "distilbert/distilgpt2")])
def test_respect_ray(model): def test_respect_ray(model):
# even for TP=1 and PP=1, # even for TP=1 and PP=1,
# if users specify ray, we should use ray. # if users specify ray, we should use ray.
...@@ -106,6 +110,7 @@ def test_respect_ray(model): ...@@ -106,6 +110,7 @@ def test_respect_ray(model):
model=model, model=model,
distributed_executor_backend="ray", distributed_executor_backend="ray",
enforce_eager=True, # reduce test time enforce_eager=True, # reduce test time
block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
) )
engine = LLMEngine.from_engine_args(engine_args) engine = LLMEngine.from_engine_args(engine_args)
assert engine.model_executor.uses_ray assert engine.model_executor.uses_ray
\ No newline at end of file
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import os
import pytest import pytest
from ..conftest import IMAGE_ASSETS from ..conftest import IMAGE_ASSETS
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import os
import pytest import pytest
from vllm.entrypoints.llm import LLM from vllm.entrypoints.llm import LLM
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from ..utils import models_path_prefix
from vllm.utils import SUPPORT_TC, gpuname
import vllm.envs as envs
@pytest.mark.parametrize("model", ["distilbert/distilgpt2"]) @pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "distilbert/distilgpt2")])
def test_skip_tokenizer_initialization(model: str): def test_skip_tokenizer_initialization(model: str):
# This test checks if the flag skip_tokenizer_init skips the initialization # This test checks if the flag skip_tokenizer_init skips the initialization
# of tokenizer and detokenizer. The generated output is expected to contain # of tokenizer and detokenizer. The generated output is expected to contain
...@@ -14,6 +18,7 @@ def test_skip_tokenizer_initialization(model: str): ...@@ -14,6 +18,7 @@ def test_skip_tokenizer_initialization(model: str):
llm = LLM( llm = LLM(
model=model, model=model,
skip_tokenizer_init=True, skip_tokenizer_init=True,
block_size=64 if gpuname.startswith('BW') and envs.VLLM_FLASH_ATTN_BACKEND else 16,
) )
sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True) sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True)
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
import os
from vllm import SamplingParams from vllm import SamplingParams
from vllm.config import LoadFormat from vllm.config import LoadFormat
from ..utils import models_path_prefix
test_model = "openai-community/gpt2" test_model = os.path.join(models_path_prefix, "openai-community/gpt2")
prompts = [ prompts = [
"Hello, my name is", "Hello, my name is",
......
...@@ -2,9 +2,11 @@ ...@@ -2,9 +2,11 @@
import glob import glob
import tempfile import tempfile
import os
import huggingface_hub.constants import huggingface_hub.constants
import torch import torch
from ..utils import models_path_prefix
from vllm.model_executor.model_loader.weight_utils import ( from vllm.model_executor.model_loader.weight_utils import (
download_weights_from_hf, fastsafetensors_weights_iterator, download_weights_from_hf, fastsafetensors_weights_iterator,
...@@ -14,7 +16,7 @@ from vllm.model_executor.model_loader.weight_utils import ( ...@@ -14,7 +16,7 @@ from vllm.model_executor.model_loader.weight_utils import (
def test_fastsafetensors_model_loader(): def test_fastsafetensors_model_loader():
with tempfile.TemporaryDirectory() as tmpdir: with tempfile.TemporaryDirectory() as tmpdir:
huggingface_hub.constants.HF_HUB_OFFLINE = False huggingface_hub.constants.HF_HUB_OFFLINE = False
download_weights_from_hf("openai-community/gpt2", download_weights_from_hf(os.path.join(models_path_prefix, "openai-community/gpt2"),
allow_patterns=["*.safetensors"], allow_patterns=["*.safetensors"],
cache_dir=tmpdir) cache_dir=tmpdir)
safetensors = glob.glob(f"{tmpdir}/**/*.safetensors", recursive=True) safetensors = glob.glob(f"{tmpdir}/**/*.safetensors", recursive=True)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment