Commit 469e903b authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.8.2' into v0.8.2-dev

parents 389ebcf7 25f560a6
......@@ -36,8 +36,7 @@ def _validate_image_max_tokens_one(
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
def test_processor_max_tokens(model_id):
ctx = build_model_context(
model_name=model_id,
tokenizer_name=model_id,
model_id,
mm_processor_kwargs=None,
limit_mm_per_prompt={"image": 1},
)
......@@ -136,8 +135,7 @@ def _test_image_prompt_replacements(
@pytest.mark.parametrize("num_imgs", [1, 2])
def test_processor_prompt_replacements_regression(model_id, num_imgs):
ctx = build_model_context(
model_name=model_id,
tokenizer_name=model_id,
model_id,
mm_processor_kwargs=None,
limit_mm_per_prompt={"image": num_imgs},
)
......@@ -166,8 +164,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
@pytest.mark.parametrize("num_imgs", [1])
def test_processor_prompt_replacements_all(model_id, num_imgs):
ctx = build_model_context(
model_name=model_id,
tokenizer_name=model_id,
model_id,
mm_processor_kwargs=None,
limit_mm_per_prompt={"image": num_imgs},
)
......
......@@ -37,8 +37,7 @@ def _validate_image_max_tokens_one(
["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
def test_processor_max_tokens(model_id):
ctx = build_model_context(
model_name=model_id,
tokenizer_name=model_id,
model_id,
mm_processor_kwargs=None,
limit_mm_per_prompt={"image": 1},
)
......@@ -136,8 +135,7 @@ def _test_image_prompt_replacements(
@pytest.mark.parametrize("num_imgs", [1, 2])
def test_processor_prompt_replacements_regression(model_id, num_imgs):
ctx = build_model_context(
model_name=model_id,
tokenizer_name=model_id,
model_id,
mm_processor_kwargs=None,
limit_mm_per_prompt={"image": num_imgs},
)
......@@ -167,8 +165,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
@pytest.mark.parametrize("num_imgs", [1])
def test_processor_prompt_replacements_all(model_id, num_imgs):
ctx = build_model_context(
model_name=model_id,
tokenizer_name=model_id,
model_id,
mm_processor_kwargs=None,
limit_mm_per_prompt={"image": num_imgs},
)
......
......@@ -35,9 +35,7 @@ def test_processor_override(
from vllm.model_executor.models.phi3v import _IMAGE_TOKEN_ID
ctx = build_model_context(
model_name=model_id,
tokenizer_name=model_id,
trust_remote_code=True,
model_id,
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
limit_mm_per_prompt={"image": num_imgs},
)
......
......@@ -30,8 +30,7 @@ def test_processor_override(
):
"""Ensure Qwen2VLMultiModalProcessor handles min/max pixels properly."""
ctx = build_model_context(
model_name=model_id,
tokenizer_name=model_id,
model_id,
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
limit_mm_per_prompt={"image": num_imgs},
)
......
# SPDX-License-Identifier: Apache-2.0
from collections.abc import Mapping, Set
from dataclasses import dataclass, field
from typing import AbstractSet, Any, Literal, Mapping, Optional
from typing import Any, Literal, Optional
import pytest
from packaging.version import Version
......@@ -123,6 +124,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"FalconForCausalLM": _HfExamplesInfo("tiiuae/falcon-7b"),
"GemmaForCausalLM": _HfExamplesInfo("google/gemma-2b"),
"Gemma2ForCausalLM": _HfExamplesInfo("google/gemma-2-9b"),
"Gemma3ForCausalLM": _HfExamplesInfo("google/gemma-3-1b-it",
min_transformers_version="4.50"),
"GlmForCausalLM": _HfExamplesInfo("THUDM/glm-4-9b-chat-hf"),
"GPT2LMHeadModel": _HfExamplesInfo("gpt2"),
"GPTBigCodeForCausalLM": _HfExamplesInfo("bigcode/starcoder"),
......@@ -130,6 +133,10 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"GPTNeoXForCausalLM": _HfExamplesInfo("EleutherAI/pythia-160m"),
"GraniteForCausalLM": _HfExamplesInfo("ibm/PowerLM-3b"),
"GraniteMoeForCausalLM": _HfExamplesInfo("ibm/PowerMoE-3b"),
"GraniteMoeSharedForCausalLM": _HfExamplesInfo("ibm-research/moe-7b-1b-active-shared-experts", # noqa: E501
min_transformers_version="4.49"), # noqa: E501
"Grok1ModelForCausalLM": _HfExamplesInfo("hpcai-tech/grok-1",
trust_remote_code=True),
"InternLMForCausalLM": _HfExamplesInfo("internlm/internlm-chat-7b",
trust_remote_code=True),
"InternLM2ForCausalLM": _HfExamplesInfo("internlm/internlm2-chat-7b",
......@@ -185,17 +192,16 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"SolarForCausalLM": _HfExamplesInfo("upstage/solar-pro-preview-instruct"),
"TeleChat2ForCausalLM": _HfExamplesInfo("Tele-AI/TeleChat2-3B",
trust_remote_code=True),
"TeleFLMForCausalLM": _HfExamplesInfo("CofeAI/FLM-2-52B-Instruct-2407",
trust_remote_code=True),
"XverseForCausalLM": _HfExamplesInfo("xverse/XVERSE-7B-Chat",
is_available_online=False,
trust_remote_code=True),
"Zamba2ForCausalLM": _HfExamplesInfo("Zyphra/Zamba2-7B-instruct",
min_transformers_version="4.49"),
# [Encoder-decoder]
"BartModel": _HfExamplesInfo("facebook/bart-base"),
"BartForConditionalGeneration": _HfExamplesInfo("facebook/bart-large-cnn"),
# Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer
# Therefore, we borrow the BartTokenizer from the original Bart model
"Florence2ForConditionalGeneration": _HfExamplesInfo("microsoft/Florence-2-base", # noqa: E501
tokenizer="facebook/bart-base",
trust_remote_code=True), # noqa: E501
}
_EMBEDDING_EXAMPLE_MODELS = {
......@@ -214,7 +220,7 @@ _EMBEDDING_EXAMPLE_MODELS = {
"Qwen2ForSequenceClassification": _HfExamplesInfo("jason9693/Qwen2.5-1.5B-apeach"), # noqa: E501
"RobertaModel": _HfExamplesInfo("sentence-transformers/stsb-roberta-base-v2"), # noqa: E501
"RobertaForMaskedLM": _HfExamplesInfo("sentence-transformers/all-roberta-large-v1"), # noqa: E501
"XLMRobertaModel": _HfExamplesInfo("intfloat/multilingual-e5-large"),
"XLMRobertaModel": _HfExamplesInfo("intfloat/multilingual-e5-small"),
# [Multimodal]
"LlavaNextForConditionalGeneration": _HfExamplesInfo("royokong/e5-v"),
"Phi3VForCausalLM": _HfExamplesInfo("TIGER-Lab/VLM2Vec-Full",
......@@ -241,6 +247,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"DeepseekVLV2ForCausalLM": _HfExamplesInfo("deepseek-ai/deepseek-vl2-tiny", # noqa: E501
hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]}), # noqa: E501
"FuyuForCausalLM": _HfExamplesInfo("adept/fuyu-8b"),
"Gemma3ForConditionalGeneration": _HfExamplesInfo("google/gemma-3-4b-it",
min_transformers_version="4.50"),
"GLM4VForCausalLM": _HfExamplesInfo("THUDM/glm-4v-9b",
trust_remote_code=True,
hf_overrides={"architectures": ["GLM4VForCausalLM"]}), # noqa: E501
......@@ -252,7 +260,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3", # noqa: E501
{"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}), # noqa: E501
"LlavaForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-1.5-7b-hf",
extras={"mistral": "mistral-community/pixtral-12b"}), # noqa: E501
extras={"mistral": "mistral-community/pixtral-12b", # noqa: E501
"mistral-fp8": "nm-testing/pixtral-12b-FP8-dynamic"}), # noqa: E501
"LlavaNextForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-v1.6-mistral-7b-hf"), # noqa: E501
"LlavaNextVideoForConditionalGeneration": _HfExamplesInfo("llava-hf/LLaVA-NeXT-Video-7B-hf"), # noqa: E501
"LlavaOnevisionForConditionalGeneration": _HfExamplesInfo("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"), # noqa: E501
......@@ -271,6 +280,9 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"PaliGemmaForConditionalGeneration": _HfExamplesInfo("google/paligemma-3b-mix-224", # noqa: E501
extras={"v2": "google/paligemma2-3b-ft-docci-448"}), # noqa: E501
"Phi3VForCausalLM": _HfExamplesInfo("microsoft/Phi-3-vision-128k-instruct",
trust_remote_code=True,
extras={"phi3.5": "microsoft/Phi-3.5-vision-instruct"}), # noqa: E501),
"Phi4MMForCausalLM": _HfExamplesInfo("microsoft/Phi-4-multimodal-instruct",
trust_remote_code=True),
"PixtralForConditionalGeneration": _HfExamplesInfo("mistralai/Pixtral-12B-2409", # noqa: E501
tokenizer_mode="mistral"),
......@@ -282,9 +294,14 @@ _MULTIMODAL_EXAMPLE_MODELS = {
"Qwen2VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-VL-2B-Instruct"), # noqa: E501
"Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-VL-3B-Instruct", # noqa: E501
min_transformers_version="4.49"), # noqa: E501
"UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b",
"UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b", # noqa: E501
trust_remote_code=True),
# [Encoder-decoder]
# Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer
# Therefore, we borrow the BartTokenizer from the original Bart model
"Florence2ForConditionalGeneration": _HfExamplesInfo("microsoft/Florence-2-base", # noqa: E501
tokenizer="facebook/bart-base",
trust_remote_code=True), # noqa: E501
"MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"), # noqa: E501
"WhisperForConditionalGeneration": _HfExamplesInfo("openai/whisper-large-v3"), # noqa: E501
}
......@@ -321,7 +338,7 @@ class HfExampleModels:
self.hf_models = hf_models
def get_supported_archs(self) -> AbstractSet[str]:
def get_supported_archs(self) -> Set[str]:
return self.hf_models.keys()
def get_hf_info(self, model_arch: str) -> _HfExamplesInfo:
......
......@@ -6,8 +6,9 @@ import pytest
from transformers import PretrainedConfig
from vllm import LLM
from vllm.engine.llm_engine import LLMEngine as V0LLMEngine
from vllm.v1.engine.core import EngineCore as V1EngineCore
from ..conftest import MODELS_ON_S3
from .registry import HF_EXAMPLE_MODELS
......@@ -37,17 +38,20 @@ def test_can_initialize(model_arch):
return hf_config
# Avoid calling model.forward()
def _initialize_kv_caches(self) -> None:
def _initialize_kv_caches_v0(self) -> None:
self.cache_config.num_gpu_blocks = 0
self.cache_config.num_cpu_blocks = 0
with patch.object(LLM.get_engine_class(), "_initialize_kv_caches",
_initialize_kv_caches):
model_name = model_info.default
if model_name in MODELS_ON_S3:
model_name = f"s3://vllm-ci-model-weights/{model_name.split('/')[-1]}"
def _initalize_kv_caches_v1(self, vllm_config):
# gpu_blocks (> 0), cpu_blocks
return 1, 0
with (patch.object(V0LLMEngine, "_initialize_kv_caches",
_initialize_kv_caches_v0),
patch.object(V1EngineCore, "_initialize_kv_caches",
_initalize_kv_caches_v1)):
LLM(
model_name,
model_info.default,
tokenizer=model_info.tokenizer,
tokenizer_mode=model_info.tokenizer_mode,
speculative_model=model_info.speculative_model,
......
# SPDX-License-Identifier: Apache-2.0
import os
import pytest
from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset
from ..utils import fork_new_process_for_each_test
@fork_new_process_for_each_test
def test_plugin(dummy_opt_path):
    """Check that loading the dummy OPT model fails with no plugins set.

    ``VLLM_PLUGINS`` is set to the empty string before constructing the
    ``LLM``; construction must raise, and the exception text must contain
    the "no vLLM implementation" message.
    """
    # NOTE(review): an empty value presumably disables all plugins so the
    # dummy architecture is never registered -- confirm against vLLM docs.
    os.environ["VLLM_PLUGINS"] = ""

    with pytest.raises(Exception) as raised:
        LLM(model=dummy_opt_path, load_format="dummy")

    expected_fragment = (
        "has no vLLM implementation and "
        "the Transformers implementation is not compatible with vLLM.")
    assert expected_fragment in str(raised.value)
@fork_new_process_for_each_test
def test_oot_registration_text_generation(dummy_opt_path):
    """Generate with the plugin-registered dummy OPT model.

    With ``VLLM_PLUGINS`` set to ``register_dummy_model``, every generated
    text must consist solely of repetitions of the decoded token id 0.
    """
    os.environ["VLLM_PLUGINS"] = "register_dummy_model"

    texts = ["Hello, my name is", "The text does not matter"]
    greedy = SamplingParams(temperature=0)
    engine = LLM(model=dummy_opt_path, load_format="dummy")
    token_zero_text = engine.get_tokenizer().decode(0)

    for result in engine.generate(texts, greedy):
        # make sure only the first token is generated: stripping every
        # occurrence of it must leave nothing behind
        leftover = result.outputs[0].text.replace(token_zero_text, "")
        assert leftover == ""
@fork_new_process_for_each_test
def test_oot_registration_embedding(dummy_gemma2_embedding_path):
    """Embed with the plugin-registered dummy Gemma2 embedding model.

    With ``VLLM_PLUGINS`` set to ``register_dummy_model``, every component
    of every returned embedding must equal 0.
    """
    os.environ["VLLM_PLUGINS"] = "register_dummy_model"

    texts = ["Hello, my name is", "The text does not matter"]
    engine = LLM(model=dummy_gemma2_embedding_path, load_format="dummy")

    for result in engine.embed(texts):
        assert all(component == 0 for component in result.outputs.embedding)
from ..utils import create_new_process_for_each_test
@create_new_process_for_each_test()
def test_plugin(
    monkeypatch: pytest.MonkeyPatch,
    dummy_opt_path: str,
):
    """Check that loading the dummy OPT model fails with no plugins set.

    Runs with ``VLLM_USE_V1=0`` and an empty ``VLLM_PLUGINS``; constructing
    the ``LLM`` must raise, and the exception text must contain the
    "no vLLM implementation" message.
    """
    # V1 shuts down rather than raising an error here.
    with monkeypatch.context() as env:
        env.setenv("VLLM_USE_V1", "0")
        env.setenv("VLLM_PLUGINS", "")

        with pytest.raises(Exception) as raised:
            LLM(model=dummy_opt_path, load_format="dummy")

        expected_fragment = "has no vLLM implementation and the Transformers implementation is not compatible with vLLM"  # noqa: E501
        assert expected_fragment in str(raised.value)
@create_new_process_for_each_test()
def test_oot_registration_text_generation(
    monkeypatch: pytest.MonkeyPatch,
    dummy_opt_path: str,
):
    """Generate with the plugin-registered dummy OPT model.

    With ``VLLM_PLUGINS`` set to ``register_dummy_model`` (scoped to this
    test through ``monkeypatch.context()``), every generated text must
    consist solely of repetitions of the decoded token id 0.
    """
    with monkeypatch.context() as env:
        env.setenv("VLLM_PLUGINS", "register_dummy_model")

        texts = ["Hello, my name is", "The text does not matter"]
        greedy = SamplingParams(temperature=0)
        engine = LLM(model=dummy_opt_path, load_format="dummy")
        token_zero_text = engine.get_tokenizer().decode(0)

        for result in engine.generate(texts, greedy):
            # make sure only the first token is generated: stripping every
            # occurrence of it must leave nothing behind
            leftover = result.outputs[0].text.replace(token_zero_text, "")
            assert leftover == ""
@create_new_process_for_each_test()
def test_oot_registration_embedding(
    monkeypatch: pytest.MonkeyPatch,
    dummy_gemma2_embedding_path: str,
):
    """Embed with the plugin-registered dummy Gemma2 embedding model.

    With ``VLLM_PLUGINS`` set to ``register_dummy_model`` (scoped to this
    test through ``monkeypatch.context()``), every component of every
    returned embedding must equal 0.
    """
    with monkeypatch.context() as env:
        env.setenv("VLLM_PLUGINS", "register_dummy_model")

        texts = ["Hello, my name is", "The text does not matter"]
        engine = LLM(model=dummy_gemma2_embedding_path, load_format="dummy")

        for result in engine.embed(texts):
            assert all(component == 0
                       for component in result.outputs.embedding)
image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
@fork_new_process_for_each_test
def test_oot_registration_multimodal(dummy_llava_path):
    """Generate with the plugin-registered dummy LLaVA model on image prompts.

    Two prompts, each carrying the module-level ``image``, are run with
    ``VLLM_PLUGINS`` set to ``register_dummy_model``. Every generated text
    must consist solely of repetitions of the decoded token id 0.
    """
    os.environ["VLLM_PLUGINS"] = "register_dummy_model"

    questions = ("What's in the image?<image>", "Describe the image<image>")
    requests = [{
        "prompt": question,
        "multi_modal_data": {
            "image": image
        },
    } for question in questions]

    greedy = SamplingParams(temperature=0)
    engine = LLM(model=dummy_llava_path,
                 load_format="dummy",
                 max_num_seqs=1,
                 trust_remote_code=True,
                 gpu_memory_utilization=0.98,
                 max_model_len=4096,
                 enforce_eager=True,
                 limit_mm_per_prompt={"image": 1})
    token_zero_text = engine.get_tokenizer().decode(0)

    for result in engine.generate(requests, greedy):
        # make sure only the first token is generated: stripping every
        # occurrence of it must leave nothing behind
        leftover = result.outputs[0].text.replace(token_zero_text, "")
        assert leftover == ""
@create_new_process_for_each_test()
def test_oot_registration_multimodal(
    monkeypatch: pytest.MonkeyPatch,
    dummy_llava_path: str,
):
    """Generate with the plugin-registered dummy LLaVA model on image prompts.

    Two prompts, each carrying the module-level ``image``, are run with
    ``VLLM_PLUGINS`` set to ``register_dummy_model`` (scoped to this test
    through ``monkeypatch.context()``). Every generated text must consist
    solely of repetitions of the decoded token id 0.
    """
    with monkeypatch.context() as env:
        env.setenv("VLLM_PLUGINS", "register_dummy_model")

        questions = ("What's in the image?<image>",
                     "Describe the image<image>")
        requests = [{
            "prompt": question,
            "multi_modal_data": {
                "image": image
            },
        } for question in questions]

        greedy = SamplingParams(temperature=0)
        engine = LLM(model=dummy_llava_path,
                     load_format="dummy",
                     max_num_seqs=1,
                     trust_remote_code=True,
                     gpu_memory_utilization=0.98,
                     max_model_len=4096,
                     enforce_eager=True,
                     limit_mm_per_prompt={"image": 1})
        token_zero_text = engine.get_tokenizer().decode(0)

        for result in engine.generate(requests, greedy):
            # make sure only the first token is generated: stripping every
            # occurrence of it must leave nothing behind
            leftover = result.outputs[0].text.replace(token_zero_text, "")
            assert leftover == ""
......@@ -17,7 +17,7 @@ from vllm.model_executor.models.registry import (_MULTIMODAL_MODELS,
ModelRegistry)
from vllm.platforms import current_platform
from ..utils import fork_new_process_for_each_test
from ..utils import create_new_process_for_each_test
from .registry import HF_EXAMPLE_MODELS
......@@ -45,7 +45,7 @@ def test_registry_imports(model_arch):
assert supports_multimodal(model_cls)
@fork_new_process_for_each_test
@create_new_process_for_each_test()
@pytest.mark.parametrize("model_arch,is_mm,init_cuda,is_ce", [
("LlamaForCausalLM", False, False, False),
("MllamaForConditionalGeneration", True, False, False),
......@@ -70,7 +70,7 @@ def test_registry_model_property(model_arch, is_mm, init_cuda, is_ce):
stacklevel=2)
@fork_new_process_for_each_test
@create_new_process_for_each_test()
@pytest.mark.parametrize("model_arch,is_pp,init_cuda", [
("MLPSpeculatorPreTrainedModel", False, False),
("DeepseekV2ForCausalLM", True, False),
......
......@@ -4,7 +4,6 @@
Run `pytest tests/models/test_transformers.py`.
"""
from contextlib import nullcontext
from typing import Type
import pytest
......@@ -14,8 +13,8 @@ from .utils import check_logprobs_close
def check_implementation(
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
example_prompts: list[str],
model: str,
**kwargs,
......@@ -47,8 +46,8 @@ def check_implementation(
("ArthurZ/Ilama-3.2-1B", "auto"), # CUSTOM CODE
]) # trust_remote_code=True by default
def test_models(
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
example_prompts: list[str],
model: str,
model_impl: str,
......@@ -71,8 +70,8 @@ def test_models(
@multi_gpu_test(num_gpus=2)
def test_distributed(
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
example_prompts,
):
kwargs = {"model_impl": "transformers", "tensor_parallel_size": 2}
......@@ -92,7 +91,7 @@ def test_distributed(
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
def test_quantization(
vllm_runner: Type[VllmRunner],
vllm_runner: type[VllmRunner],
example_prompts: list[str],
model: str,
quantization_kwargs: dict[str, str],
......
# SPDX-License-Identifier: Apache-2.0
import warnings
from typing import Dict, List, Optional, Sequence, Tuple, Union
from collections.abc import Sequence
from typing import Any, Optional, Union
import torch
......@@ -9,7 +10,9 @@ from vllm.config import ModelConfig, TaskOption
from vllm.inputs import InputContext
from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs
TokensText = Tuple[List[int], str]
from .registry import HF_EXAMPLE_MODELS
TokensText = tuple[list[int], str]
def check_outputs_equal(
......@@ -46,7 +49,7 @@ def check_outputs_equal(
# * List of top sample logprobs for each sampled token
#
# Assumes prompt logprobs were not requested.
TokensTextLogprobs = Tuple[List[int], str, Optional[Union[List[Dict[int,
TokensTextLogprobs = tuple[list[int], str, Optional[Union[list[dict[int,
float]],
SampleLogprobs]]]
......@@ -57,8 +60,8 @@ TokensTextLogprobs = Tuple[List[int], str, Optional[Union[List[Dict[int,
# * Optional list of top sample logprobs for each sampled token
#
# Assumes prompt logprobs were not requested.
TextTextLogprobs = Tuple[List[str], str, Optional[Union[List[Dict[str, float]],
List[Dict[str,
TextTextLogprobs = tuple[list[str], str, Optional[Union[list[dict[str, float]],
list[dict[str,
Logprob]]]]]
# Representation of generated sequence as a tuple of
......@@ -68,9 +71,9 @@ TextTextLogprobs = Tuple[List[str], str, Optional[Union[List[Dict[str, float]],
# * Optional list of top prompt logprobs for each prompt token
#
# Allows prompt logprobs to be requested.
TokensTextLogprobsPromptLogprobs = Tuple[
List[int], str, Optional[Union[List[Dict[int, float]], SampleLogprobs]],
Optional[Union[List[Optional[Dict[int, float]]], PromptLogprobs]]]
TokensTextLogprobsPromptLogprobs = tuple[
list[int], str, Optional[Union[list[dict[int, float]], SampleLogprobs]],
Optional[Union[list[Optional[dict[int, float]]], PromptLogprobs]]]
def check_logprobs_close(
......@@ -249,21 +252,17 @@ def check_logprobs_close(
def build_model_context(
model_name: str,
model_id: str,
task: TaskOption = "auto",
tokenizer_name: Optional[str] = None,
trust_remote_code: bool = False,
dtype: Optional[Union[str, torch.dtype]] = None,
mm_processor_kwargs: Optional[Dict] = None,
limit_mm_per_prompt: Optional[Dict] = None,
dtype: Union[str, torch.dtype] = "auto",
mm_processor_kwargs: Optional[dict[str, Any]] = None,
limit_mm_per_prompt: Optional[dict[str, int]] = None,
disable_mm_preprocessor_cache: bool = True,
):
"""Creates an InputContext for a given model.
Args:
model_name: Name of the model being considered.
tokenizer_name: Name of the tokenizer being considered.
trust_remote_code: Whether or not to allow loading remote code.
model_id: ID of the model being considered.
mm_processor_kwargs: optional processor kwargs to be leveraged
in the input processor, mapper, dummy data creation, etc.
limit_mm_per_prompt: Multimodal limits.
......@@ -271,21 +270,21 @@ def build_model_context(
Returns:
InputContext for the model being considered.
"""
if tokenizer_name is None:
tokenizer_name = model_name
if dtype is None:
dtype = "half"
model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
model_config = ModelConfig(
model_name,
model_id,
task=task,
tokenizer=tokenizer_name,
tokenizer_mode="auto",
trust_remote_code=trust_remote_code,
tokenizer=model_info.tokenizer or model_id,
tokenizer_mode=model_info.tokenizer_mode,
trust_remote_code=model_info.trust_remote_code,
dtype=dtype,
seed=0,
mm_processor_kwargs=mm_processor_kwargs,
limit_mm_per_prompt=limit_mm_per_prompt,
disable_mm_preprocessor_cache=disable_mm_preprocessor_cache,
hf_overrides=model_info.hf_overrides,
)
return InputContext(model_config)
# SPDX-License-Identifier: Apache-2.0
import pytest
@pytest.fixture(scope="function", autouse=True)
def use_v0_only(monkeypatch: pytest.MonkeyPatch) -> None:
    """Force ``VLLM_USE_V1=0`` for every test in this module.

    The module is V0 only, so this autouse fixture sets the variable
    through ``monkeypatch`` before each test function runs.
    """
    monkeypatch.setenv("VLLM_USE_V1", "0")
......@@ -13,7 +13,7 @@ from vllm.engine.arg_utils import AsyncEngineArgs
from ..utils import models_path_prefix
MODEL = os.path.join(models_path_prefix, "gemma-1.1-2b-it")
ENGINE_ARGS = AsyncEngineArgs(model=MODEL, load_format="runai_streamer")
ENGINE_ARGS = AsyncEngineArgs(model=MODEL)
RAISED_ERROR = KeyError
RAISED_VALUE = "foo"
EXPECTED_TOKENS = 250
......
......@@ -19,14 +19,13 @@ from vllm.engine.multiprocessing.engine import MQLLMEngine
from vllm.entrypoints.openai.api_server import build_async_engine_client
from vllm.entrypoints.openai.cli_args import make_arg_parser
from vllm.lora.request import LoRARequest
from vllm.sequence import SequenceGroupMetadata
from vllm.usage.usage_lib import UsageContext
from vllm.utils import FlexibleArgumentParser
from ..utils import models_path_prefix
MODEL = os.path.join(models_path_prefix, "gemma-1.1-2b-it")
ENGINE_ARGS = AsyncEngineArgs(model=MODEL,
load_format="runai_streamer",
enforce_eager=True)
ENGINE_ARGS = AsyncEngineArgs(model=MODEL, enforce_eager=True)
RAISED_ERROR = KeyError
RAISED_VALUE = "foo"
......@@ -238,25 +237,28 @@ async def test_bad_request(tmp_socket):
@pytest.mark.asyncio
async def test_mp_crash_detection(monkeypatch):
async def test_mp_crash_detection(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m:
parser = FlexibleArgumentParser(description="vLLM's remote OpenAI server.")
parser = make_arg_parser(parser)
args = parser.parse_args([])
parser = FlexibleArgumentParser(
description="vLLM's remote OpenAI server.")
parser = make_arg_parser(parser)
args = parser.parse_args([])
# When LLMEngine is loaded, it will crash.
def mock_init():
raise ValueError
# When LLMEngine is loaded, it will crash.
def mock_init():
raise ValueError
monkeypatch.setattr(LLMEngine, "__init__", mock_init)
m.setattr(LLMEngine, "__init__", mock_init)
start = time.perf_counter()
async with build_async_engine_client(args):
pass
end = time.perf_counter()
start = time.perf_counter()
async with build_async_engine_client(args):
pass
end = time.perf_counter()
assert end - start < 60, ("Expected vLLM to gracefully shutdown in <60s "
"if there is an error in the startup.")
assert end - start < 60, (
"Expected vLLM to gracefully shutdown in <60s "
"if there is an error in the startup.")
@pytest.mark.asyncio
......@@ -296,3 +298,80 @@ async def test_engine_process_death(tmp_socket):
await client.check_health()
client.close()
def run_with_evil_input_processing(engine_args: AsyncEngineArgs,
                                   ipc_path: str):
    """Simulate an exception while preparing inputs for the model.

    In the wild, this could be something like a multimodal input processor
    failing on invalid image data.

    Args:
        engine_args: Arguments used to build the ``MQLLMEngine``.
        ipc_path: IPC path handed to ``MQLLMEngine.from_engine_args``.
    """

    # Hook that fails for any sequence group whose request id starts
    # with "evil"; all other requests pass through untouched.
    def fail_on_evil_request(_, seq_group_metadata: SequenceGroupMetadata):
        if seq_group_metadata.request_id.startswith("evil"):
            raise RAISED_ERROR(RAISED_VALUE)

    # Make engine.
    mq_engine = MQLLMEngine.from_engine_args(
        engine_args=engine_args,
        usage_context=UsageContext.UNKNOWN_CONTEXT,
        ipc_path=ipc_path)
    driver = mq_engine.engine.model_executor.driver_worker
    model_runner = driver.worker.model_runner

    # Raise error in the model runner when adding a sequence group.
    # See class ModelInputForGPUBuilder
    model_runner.builder.per_seq_group_compute_fns.append(
        fail_on_evil_request)

    # Run engine.
    mq_engine.start()
@pytest.mark.asyncio
async def test_failed_inputs(tmp_socket):
    """Failing requests must not take down the engine or healthy requests.

    The engine is started with ``run_with_evil_input_processing``. Ten
    requests whose ids start with "evil" and ten with plain UUID ids are
    run concurrently: the former must raise ``RAISED_ERROR``, the latter
    must complete, and the client must stay healthy afterwards.
    """
    with RemoteMQLLMEngine(engine_args=ENGINE_ARGS,
                           ipc_path=tmp_socket,
                           run_fn=run_with_evil_input_processing) as engine:

        client = await engine.make_client()
        assert client.is_running

        # Engine should be healthy
        await client.check_health()

        async def consume_evil_request():
            stream = client.generate(
                prompt="Hello my name is",
                sampling_params=SamplingParams(max_tokens=10),
                request_id="evil" + str(uuid.uuid4()))
            async for _ in stream:
                pass

        async def consume_good_request():
            stream = client.generate(
                prompt="Hello my name is",
                sampling_params=SamplingParams(max_tokens=10),
                request_id=str(uuid.uuid4()))
            async for _ in stream:
                pass

        # Good requests are scheduled first, then the bad ones.
        good_tasks = [
            asyncio.create_task(consume_good_request()) for _ in range(10)
        ]
        evil_tasks = [
            asyncio.create_task(consume_evil_request()) for _ in range(10)
        ]

        # Collect failures as results so gather itself does not raise.
        await asyncio.gather(*evil_tasks, return_exceptions=True)
        await asyncio.gather(*good_tasks)

        # All the bad inputs should have raised
        for finished in evil_tasks:
            with pytest.raises(RAISED_ERROR):
                finished.result()

        # But all good inputs should have still succeeded
        for finished in good_tasks:
            finished.result()

        # And the engine should remain healthy
        assert not client.errored
        await client.check_health()

        client.close()
......@@ -17,9 +17,7 @@ NUM_EXPECTED_TOKENS = 10
NUM_REQUESTS = 10000
# Scenarios to test for num generated token.
ENGINE_ARGS = AsyncEngineArgs(model=MODEL,
load_format="runai_streamer",
disable_log_requests=True)
ENGINE_ARGS = AsyncEngineArgs(model=MODEL, disable_log_requests=True)
@pytest.fixture(scope="function")
......
......@@ -2,7 +2,7 @@
import asyncio
import multiprocessing
from typing import Callable, Tuple, Union
from typing import Callable, Union
from vllm import SamplingParams
from vllm.engine.arg_utils import AsyncEngineArgs
......@@ -16,7 +16,7 @@ async def generate(
client: MQLLMEngineClient,
request_id: str,
num_tokens: int,
return_output: bool = False) -> Union[RequestOutput, Tuple[int, str]]:
return_output: bool = False) -> Union[RequestOutput, tuple[int, str]]:
final_output = None
count = 0
......
# SPDX-License-Identifier: Apache-2.0
# Test the AsyncLLMEngine with multi-step-decoding
from typing import List, Optional
from typing import Optional
import pytest
import os
from tests.kernels.utils import override_backend_env_variable
from vllm.utils import STR_BACKEND_ENV_VAR
from ..models.utils import check_logprobs_close
from ..utils import (completions_with_server_args, get_client_text_generations,
......@@ -18,7 +18,7 @@ MODELS = [
NUM_SCHEDULER_STEPS = [8] # Multi-step decoding steps
NUM_PROMPTS = [10]
DEFAULT_SERVER_ARGS: List[str] = [
DEFAULT_SERVER_ARGS: list[str] = [
"--distributed-executor-backend",
"ray",
"--gpu-memory-utilization",
......@@ -54,7 +54,7 @@ async def test_multi_step(
num_logprobs: Optional[int],
attention_backend: str,
enable_chunked_prefill: bool,
monkeypatch,
monkeypatch: pytest.MonkeyPatch,
) -> None:
"""Test vLLM engine with multi-step scheduling in an OpenAI-protocol
client/server environment.
......@@ -84,67 +84,70 @@ async def test_multi_step(
pytest.skip("Multi-step with Chunked-Prefill only supports"
"PP=1 and FLASH_ATTN backend")
override_backend_env_variable(monkeypatch, attention_backend)
prompts = example_prompts
if len(prompts) < num_prompts:
prompts = prompts * ((num_prompts // len(prompts)) + 1)
prompts = prompts[:num_prompts]
assert len(prompts) == num_prompts
server_args = DEFAULT_SERVER_ARGS + ["--enforce-eager"]
ms_server_args = DEFAULT_SERVER_ARGS + \
["--num-scheduler-steps", f"{num_scheduler_steps}"]
if not is_async:
ms_server_args += ["--disable-async-output-proc"]
if eager_mode:
ms_server_args.append("--enforce-eager")
if enable_chunked_prefill:
ms_server_args.append("--enable-chunked-prefill")
distributed_args = [
"--tensor-parallel-size",
str(tp_size),
"--pipeline-parallel-size",
str(pp_size),
]
# Spin up client/server & issue completion API requests.
# Default `max_wait_seconds` is 240 but was empirically
# raised 5x to 1200 *just for this test* due to
# observed timeouts in GHA CI
ref_completions = await completions_with_server_args(
prompts,
model,
server_args + distributed_args,
num_logprobs,
max_wait_seconds=5 * 240)
test_completions = await completions_with_server_args(
prompts,
model,
ms_server_args + distributed_args,
num_logprobs,
max_wait_seconds=5 * 240)
# Assert multi-step scheduling produces identical tokens
# to single-step scheduling.
ref_generations = get_client_text_generations(ref_completions)
test_generations = get_client_text_generations(test_completions)
assert ref_generations == test_generations
# Assert multi-step scheduling produces nearly-identical logprobs
# to single-step scheduling.
ref_text_logprobs = get_client_text_logprob_generations(ref_completions)
test_text_logprobs = get_client_text_logprob_generations(test_completions)
check_logprobs_close(
outputs_0_lst=ref_text_logprobs,
outputs_1_lst=test_text_logprobs,
name_0="hf",
name_1="vllm",
)
with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
prompts = example_prompts
if len(prompts) < num_prompts:
prompts = prompts * ((num_prompts // len(prompts)) + 1)
prompts = prompts[:num_prompts]
assert len(prompts) == num_prompts
server_args = DEFAULT_SERVER_ARGS + ["--enforce-eager"]
ms_server_args = DEFAULT_SERVER_ARGS + \
["--num-scheduler-steps", f"{num_scheduler_steps}"]
if not is_async:
ms_server_args += ["--disable-async-output-proc"]
if eager_mode:
ms_server_args.append("--enforce-eager")
if enable_chunked_prefill:
ms_server_args.append("--enable-chunked-prefill")
distributed_args = [
"--tensor-parallel-size",
str(tp_size),
"--pipeline-parallel-size",
str(pp_size),
]
# Spin up client/server & issue completion API requests.
# Default `max_wait_seconds` is 240 but was empirically
# raised 5x to 1200 *just for this test* due to
# observed timeouts in GHA CI
ref_completions = await completions_with_server_args(
prompts,
model,
server_args + distributed_args,
num_logprobs,
max_wait_seconds=5 * 240)
test_completions = await completions_with_server_args(
prompts,
model,
ms_server_args + distributed_args,
num_logprobs,
max_wait_seconds=5 * 240)
# Assert multi-step scheduling produces identical tokens
# to single-step scheduling.
ref_generations = get_client_text_generations(ref_completions)
test_generations = get_client_text_generations(test_completions)
assert ref_generations == test_generations
# Assert multi-step scheduling produces nearly-identical logprobs
# to single-step scheduling.
ref_text_logprobs = get_client_text_logprob_generations(
ref_completions)
test_text_logprobs = get_client_text_logprob_generations(
test_completions)
check_logprobs_close(
outputs_0_lst=ref_text_logprobs,
outputs_1_lst=test_text_logprobs,
name_0="hf",
name_1="vllm",
)
@pytest.mark.parametrize(("tp_size, pp_size"), [
......@@ -154,7 +157,7 @@ async def test_multi_step(
async def test_multi_step_pp_smoke(
tp_size: int,
pp_size: int,
monkeypatch,
monkeypatch: pytest.MonkeyPatch,
) -> None:
"""
Smoke test for the vLLM engine with multi-step scheduling in an
......@@ -176,54 +179,55 @@ async def test_multi_step_pp_smoke(
attention_backend = "FLASH_ATTN"
max_num_seqs = 3
override_backend_env_variable(monkeypatch, attention_backend)
# Prompt from the ShareGPT dataset
prompts = [
"in the jtbd context whats a push?", # codespell:ignore
"in the jtbd context whats a push?", # codespell:ignore
"in the jtbd context whats a push?", # codespell:ignore
"in the jtbd context whats a push?", # codespell:ignore
]
# Use varying max_tokens to introduce scheduling randomness.
max_tokens = [10 * i for i in range(1, len(prompts) + 1)]
assert len(prompts) == len(max_tokens)
test_args = [
"--tensor-parallel-size",
str(tp_size), "--pipeline-parallel-size",
str(pp_size), "--max-num-seqs",
str(max_num_seqs)
]
server_args = DEFAULT_SERVER_ARGS + test_args
ms_server_args = DEFAULT_SERVER_ARGS + \
["--num-scheduler-steps", f"{num_scheduler_steps}"] + \
test_args
# Spin up client/server & issue completion API requests.
# Default `max_wait_seconds` is 240 but was empirically
# raised 5x to 1200 *just for this test* due to
# observed timeouts in GHA CI
ref_completions = await completions_with_server_args(
prompts=prompts,
model_name=model,
server_cli_args=server_args,
num_logprobs=None,
max_wait_seconds=5 * 240,
max_tokens=max_tokens)
test_completions = await completions_with_server_args(
prompts=prompts,
model_name=model,
server_cli_args=ms_server_args,
num_logprobs=None,
max_wait_seconds=5 * 240,
max_tokens=max_tokens)
# Assert multi-step scheduling produces identical tokens
# to single-step scheduling.
ref_generations = get_client_text_generations(ref_completions)
test_generations = get_client_text_generations(test_completions)
assert ref_generations == test_generations
with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
# Prompt from the ShareGPT dataset
prompts = [
"in the jtbd context whats a push?", # codespell:ignore
"in the jtbd context whats a push?", # codespell:ignore
"in the jtbd context whats a push?", # codespell:ignore
"in the jtbd context whats a push?", # codespell:ignore
]
# Use varying max_tokens to introduce scheduling randomness.
max_tokens = [10 * i for i in range(1, len(prompts) + 1)]
assert len(prompts) == len(max_tokens)
test_args = [
"--tensor-parallel-size",
str(tp_size), "--pipeline-parallel-size",
str(pp_size), "--max-num-seqs",
str(max_num_seqs)
]
server_args = DEFAULT_SERVER_ARGS + test_args
ms_server_args = DEFAULT_SERVER_ARGS + \
["--num-scheduler-steps", f"{num_scheduler_steps}"] + \
test_args
# Spin up client/server & issue completion API requests.
# Default `max_wait_seconds` is 240 but was empirically
# raised 5x to 1200 *just for this test* due to
# observed timeouts in GHA CI
ref_completions = await completions_with_server_args(
prompts=prompts,
model_name=model,
server_cli_args=server_args,
num_logprobs=None,
max_wait_seconds=5 * 240,
max_tokens=max_tokens)
test_completions = await completions_with_server_args(
prompts=prompts,
model_name=model,
server_cli_args=ms_server_args,
num_logprobs=None,
max_wait_seconds=5 * 240,
max_tokens=max_tokens)
# Assert multi-step scheduling produces identical tokens
# to single-step scheduling.
ref_generations = get_client_text_generations(ref_completions)
test_generations = get_client_text_generations(test_completions)
assert ref_generations == test_generations
......@@ -8,7 +8,7 @@ from typing import Optional
import pytest
import os
from tests.kernels.utils import override_backend_env_variable
from vllm.utils import STR_BACKEND_ENV_VAR
from ..models.utils import check_logprobs_close, check_outputs_equal
from ..utils import models_path_prefix
......@@ -44,7 +44,7 @@ def test_multi_step_llm(
num_prompts: int,
num_logprobs: Optional[int],
attention_backend: str,
monkeypatch,
monkeypatch: pytest.MonkeyPatch,
) -> None:
"""Test vLLM engine with multi-step scheduling via sync LLM Engine.
......@@ -72,48 +72,49 @@ def test_multi_step_llm(
num_logprobs: corresponds to the `logprobs` argument to the OpenAI
completions endpoint; `None` -> 1 logprob returned.
"""
override_backend_env_variable(monkeypatch, attention_backend)
prompts = example_prompts
if len(prompts) < num_prompts:
prompts = prompts * ((num_prompts // len(prompts)) + 1)
prompts = prompts[:num_prompts]
assert len(prompts) == num_prompts
with vllm_runner(
model,
dtype=dtype,
enforce_eager=enforce_eager,
gpu_memory_utilization=0.7,
tensor_parallel_size=tp_size,
enable_chunked_prefill=enable_chunked_prefill,
num_scheduler_steps=num_scheduler_steps,
) as vllm_model:
vllm_outputs = (vllm_model.generate_greedy(prompts, max_tokens)
if num_logprobs is None else
vllm_model.generate_greedy_logprobs(
prompts, max_tokens, num_logprobs))
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = (hf_model.generate_greedy(prompts, max_tokens)
if num_logprobs is None else
hf_model.generate_greedy_logprobs_limit(
prompts, max_tokens, num_logprobs))
if num_logprobs is None:
check_outputs_equal(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
else:
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
prompts = example_prompts
if len(prompts) < num_prompts:
prompts = prompts * ((num_prompts // len(prompts)) + 1)
prompts = prompts[:num_prompts]
assert len(prompts) == num_prompts
with vllm_runner(
model,
dtype=dtype,
enforce_eager=enforce_eager,
gpu_memory_utilization=0.7,
tensor_parallel_size=tp_size,
enable_chunked_prefill=enable_chunked_prefill,
num_scheduler_steps=num_scheduler_steps,
) as vllm_model:
vllm_outputs = (vllm_model.generate_greedy(prompts, max_tokens)
if num_logprobs is None else
vllm_model.generate_greedy_logprobs(
prompts, max_tokens, num_logprobs))
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = (hf_model.generate_greedy(prompts, max_tokens)
if num_logprobs is None else
hf_model.generate_greedy_logprobs_limit(
prompts, max_tokens, num_logprobs))
if num_logprobs is None:
check_outputs_equal(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
else:
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
@pytest.mark.parametrize("model", MODELS)
......@@ -138,7 +139,7 @@ def test_multi_step_llm_w_prompt_logprobs(
num_logprobs: Optional[int],
num_prompt_logprobs: Optional[int],
attention_backend: str,
monkeypatch,
monkeypatch: pytest.MonkeyPatch,
) -> None:
"""Test prompt logprobs with multi-step scheduling via sync LLM Engine.
......@@ -168,47 +169,48 @@ def test_multi_step_llm_w_prompt_logprobs(
note that this argument is not supported by the
OpenAI completions endpoint.
"""
override_backend_env_variable(monkeypatch, attention_backend)
prompts = example_prompts
if len(prompts) < num_prompts:
prompts = prompts * ((num_prompts // len(prompts)) + 1)
prompts = prompts[:num_prompts]
assert len(prompts) == num_prompts
with vllm_runner(
model,
dtype=dtype,
enforce_eager=enforce_eager,
gpu_memory_utilization=0.7,
tensor_parallel_size=tp_size,
num_scheduler_steps=num_scheduler_steps,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
prompts,
max_tokens,
num_logprobs,
num_prompt_logprobs=num_prompt_logprobs)
with vllm_runner(
model,
dtype=dtype,
enforce_eager=enforce_eager,
gpu_memory_utilization=0.7,
tensor_parallel_size=tp_size,
) as vllm_model:
single_step_vllm_outputs = vllm_model.generate_greedy_logprobs(
prompts,
max_tokens,
num_logprobs,
num_prompt_logprobs=num_prompt_logprobs)
check_logprobs_close(
outputs_0_lst=single_step_vllm_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
prompts = example_prompts
if len(prompts) < num_prompts:
prompts = prompts * ((num_prompts // len(prompts)) + 1)
prompts = prompts[:num_prompts]
assert len(prompts) == num_prompts
with vllm_runner(
model,
dtype=dtype,
enforce_eager=enforce_eager,
gpu_memory_utilization=0.7,
tensor_parallel_size=tp_size,
num_scheduler_steps=num_scheduler_steps,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
prompts,
max_tokens,
num_logprobs,
num_prompt_logprobs=num_prompt_logprobs)
with vllm_runner(
model,
dtype=dtype,
enforce_eager=enforce_eager,
gpu_memory_utilization=0.7,
tensor_parallel_size=tp_size,
) as vllm_model:
single_step_vllm_outputs = vllm_model.generate_greedy_logprobs(
prompts,
max_tokens,
num_logprobs,
num_prompt_logprobs=num_prompt_logprobs)
check_logprobs_close(
outputs_0_lst=single_step_vllm_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
@pytest.mark.parametrize("model", MODELS)
......@@ -232,7 +234,7 @@ def test_multi_step_llm_chunked_prefill_prefix_cache(
num_prompts: int,
num_logprobs: Optional[int],
attention_backend: str,
monkeypatch,
monkeypatch: pytest.MonkeyPatch,
) -> None:
"""Test vLLM engine with multi-step+"single-step chunked prefill"+APC.
......@@ -295,77 +297,78 @@ def test_multi_step_llm_chunked_prefill_prefix_cache(
#
# The Incorrect scheduling behavior - if it occurs - will cause an exception
# in the model runner resulting from `do_sample=False`.
override_backend_env_variable(monkeypatch, attention_backend)
assert len(example_prompts) >= 2
challenge_prompts = copy.deepcopy(example_prompts)
challenge_prompts[0] = ('vLLM is a high-throughput and memory-efficient '
'inference and serving engine for LLMs.\n'
) # 24 tok
challenge_prompts[1] = (
'Briefly describe the major milestones in the '
'development of artificial intelligence from 1950 to 2020.\n'
) # 30 tok
# If necessary, adjust the length of `challenge_prompts` to match
# `num_prompts`
if len(challenge_prompts) < num_prompts:
challenge_prompts = (challenge_prompts *
((num_prompts // len(challenge_prompts)) + 1))
challenge_prompts = challenge_prompts[:num_prompts]
assert len(challenge_prompts) == num_prompts
# Single-step scheduler baseline
with vllm_runner(
model,
dtype=dtype,
enforce_eager=enforce_eager,
gpu_memory_utilization=0.7,
tensor_parallel_size=tp_size,
num_scheduler_steps=num_scheduler_steps,
max_model_len=48,
max_num_batched_tokens=48,
max_num_seqs=4,
block_size=16,
) as vllm_model:
outputs_baseline = (vllm_model.generate_greedy(
challenge_prompts, max_tokens) if num_logprobs is None else
vllm_model.generate_greedy_logprobs(
challenge_prompts, max_tokens, num_logprobs))
# multi-step+"single-step chunked prefill"+APC
with vllm_runner(
model,
dtype=dtype,
enforce_eager=enforce_eager,
gpu_memory_utilization=0.7,
tensor_parallel_size=tp_size,
enable_chunked_prefill=True,
enable_prefix_caching=True,
num_scheduler_steps=num_scheduler_steps,
max_model_len=48,
max_num_batched_tokens=48,
max_num_seqs=4,
block_size=16,
) as vllm_model:
outputs_w_features = (vllm_model.generate_greedy(
challenge_prompts, max_tokens) if num_logprobs is None else
vllm_model.generate_greedy_logprobs(
challenge_prompts, max_tokens, num_logprobs))
if num_logprobs is None:
# No-logprobs test
check_outputs_equal(
outputs_0_lst=outputs_baseline,
outputs_1_lst=outputs_w_features,
name_0="multi-step",
name_1="multi-step+features",
)
else:
# Yes-logprobs test
check_logprobs_close(
outputs_0_lst=outputs_baseline,
outputs_1_lst=outputs_w_features,
name_0="multi-step",
name_1="multi-step+features",
)
with monkeypatch.context() as m:
m.setenv(STR_BACKEND_ENV_VAR, attention_backend)
assert len(example_prompts) >= 2
challenge_prompts = copy.deepcopy(example_prompts)
challenge_prompts[0] = (
'vLLM is a high-throughput and memory-efficient '
'inference and serving engine for LLMs.\n') # 24 tok
challenge_prompts[1] = (
'Briefly describe the major milestones in the '
'development of artificial intelligence from 1950 to 2020.\n'
) # 30 tok
# If necessary, adjust the length of `challenge_prompts` to match
# `num_prompts`
if len(challenge_prompts) < num_prompts:
challenge_prompts = (challenge_prompts *
((num_prompts // len(challenge_prompts)) + 1))
challenge_prompts = challenge_prompts[:num_prompts]
assert len(challenge_prompts) == num_prompts
# Single-step scheduler baseline
with vllm_runner(
model,
dtype=dtype,
enforce_eager=enforce_eager,
gpu_memory_utilization=0.7,
tensor_parallel_size=tp_size,
num_scheduler_steps=num_scheduler_steps,
max_model_len=48,
max_num_batched_tokens=48,
max_num_seqs=4,
block_size=16,
) as vllm_model:
outputs_baseline = (
vllm_model.generate_greedy(challenge_prompts, max_tokens) if
num_logprobs is None else vllm_model.generate_greedy_logprobs(
challenge_prompts, max_tokens, num_logprobs))
# multi-step+"single-step chunked prefill"+APC
with vllm_runner(
model,
dtype=dtype,
enforce_eager=enforce_eager,
gpu_memory_utilization=0.7,
tensor_parallel_size=tp_size,
enable_chunked_prefill=True,
enable_prefix_caching=True,
num_scheduler_steps=num_scheduler_steps,
max_model_len=48,
max_num_batched_tokens=48,
max_num_seqs=4,
block_size=16,
) as vllm_model:
outputs_w_features = (
vllm_model.generate_greedy(challenge_prompts, max_tokens) if
num_logprobs is None else vllm_model.generate_greedy_logprobs(
challenge_prompts, max_tokens, num_logprobs))
if num_logprobs is None:
# No-logprobs test
check_outputs_equal(
outputs_0_lst=outputs_baseline,
outputs_1_lst=outputs_w_features,
name_0="multi-step",
name_1="multi-step+features",
)
else:
# Yes-logprobs test
check_logprobs_close(
outputs_0_lst=outputs_baseline,
outputs_1_lst=outputs_w_features,
name_0="multi-step",
name_1="multi-step+features",
)
......@@ -7,18 +7,24 @@ from unittest.mock import MagicMock
import numpy as np
import pytest
import torch
from transformers import ProcessorMixin
from vllm.config import ModelConfig
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.inputs import (MultiModalFieldElem, MultiModalKwargs,
MultiModalKwargsItem,
MultiModalSharedField)
# yapf conflicts with isort for this block
# yapf: disable
from vllm.multimodal.processing import (PlaceholderFeaturesInfo,
PromptReplacement,
ProcessingCache, PromptIndexTargets,
PromptInsertion, PromptReplacement,
apply_text_matches,
apply_token_matches,
find_mm_placeholders,
find_text_matches, find_token_matches,
iter_token_matches,
replace_text_matches,
replace_token_matches)
# yapf: enable
from vllm.multimodal.profiling import MultiModalProfiler
......@@ -89,6 +95,58 @@ def test_iter_token_matches(token_ids, match_ids, expected):
assert all(match_len == len(match_ids) for match_len in match_lens)
# yapf: disable
@pytest.mark.parametrize(
("token_ids", "match_ids", "new_ids", "expected"),
[
([], [], [-1], []),
([], [32000], [-1], []),
(
[32000, 32000, 32000],
[32000],
[-1],
[-1, -1, -1],
),
(
[32000, 32000, 32000],
[32000, 32000],
[-1],
[-1, 32000],
),
(
[32000, 32000, 32000],
[32000, 32000, 32000],
[-1],
[-1],
),
(
[9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918],
[28747, 32000],
[-1],
[9833, -1, 32000, 32000, 9833, -1, 32000, 918],
),
(
[9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918],
[28747, 32000, 32000, 32000],
[-1],
[9833, -1, 9833, 28747, 32000, 32000, 918],
),
(
[9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918],
[28747, 0, 32000],
[-1],
[9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918],
),
],
)
# yapf: enable
def test_replace_token_matches(token_ids, match_ids, new_ids, expected):
    """Check `replace_token_matches` against hand-written expectations.

    Each parametrized case supplies the input `token_ids`, the `match_ids`
    sub-sequence to look for, the `new_ids` passed as the replacement, and
    the `expected` token list the call must return.
    """
    # `expected` values are manually constructed in the parametrization
    assert replace_token_matches(token_ids, match_ids, new_ids) == expected
# yapf: disable
@pytest.mark.parametrize(
("prompt", "target_by_key", "expected_by_key"),
......@@ -98,11 +156,21 @@ def test_iter_token_matches(token_ids, match_ids, expected):
{
"pattern_1": [],
"pattern_2": [32000],
"pattern_3": PromptIndexTargets.start(),
"pattern_4": PromptIndexTargets.prefix([32000]),
"pattern_5": PromptIndexTargets.end(),
},
{
"pattern_1": [],
"pattern_2": [],
}
"pattern_3": [
{ "start_idx": 0, "end_idx": 0 },
],
"pattern_4": [],
"pattern_5": [
{ "start_idx": 0, "end_idx": 0 },
],
},
),
(
[32000, 32000, 32000, 32000],
......@@ -110,6 +178,9 @@ def test_iter_token_matches(token_ids, match_ids, expected):
"pattern_1": [32000],
"pattern_2": [32000, 32000],
"pattern_3": [32000, 32000, 32000],
"pattern_4": PromptIndexTargets.start(),
"pattern_5": PromptIndexTargets.prefix([32000]),
"pattern_6": PromptIndexTargets.end(),
},
{
"pattern_1": [
......@@ -125,6 +196,15 @@ def test_iter_token_matches(token_ids, match_ids, expected):
"pattern_3": [
{ "start_idx": 0, "end_idx": 3 },
],
"pattern_4": [
{ "start_idx": 0, "end_idx": 0 },
],
"pattern_5": [
{ "start_idx": 1, "end_idx": 1 },
],
"pattern_6": [
{ "start_idx": 4, "end_idx": 4 },
],
},
),
(
......@@ -133,6 +213,9 @@ def test_iter_token_matches(token_ids, match_ids, expected):
"pattern_1": [28747, 32000],
"pattern_2": [28747, 32000, 32000, 32000],
"pattern_3": [28747, 0, 32000],
"pattern_4": PromptIndexTargets.start(),
"pattern_5": PromptIndexTargets.prefix([28747, 32000]),
"pattern_6": PromptIndexTargets.end(),
},
{
"pattern_1": [
......@@ -143,20 +226,33 @@ def test_iter_token_matches(token_ids, match_ids, expected):
{ "start_idx": 1, "end_idx": 5 },
],
"pattern_3": [],
"pattern_4": [
{ "start_idx": 0, "end_idx": 0 },
],
"pattern_5": [],
"pattern_6": [
{ "start_idx": 10, "end_idx": 10 },
],
},
),
],
)
@pytest.mark.parametrize("update_type", [PromptInsertion, PromptReplacement])
# yapf: enable
def test_find_token_matches(prompt, target_by_key, expected_by_key):
def test_find_token_matches(
prompt,
target_by_key,
expected_by_key,
update_type,
):
# Should not be used since there is nothing to convert to token IDs
mock_tokenizer = cast(AnyTokenizer, object())
prompt_repls = [
PromptReplacement(key, target, []).bind(mock_tokenizer)
prompt_updates = [
update_type(key, target, []).bind(mock_tokenizer)
for key, target in target_by_key.items()
]
result = find_token_matches(prompt, prompt_repls)
result = find_token_matches(prompt, prompt_updates)
# Only displayed on error
print("result:", result)
......@@ -183,10 +279,20 @@ def test_find_token_matches(prompt, target_by_key, expected_by_key):
{
"pattern_1": "",
"pattern_2": "<image>",
"pattern_3": PromptIndexTargets.start(),
"pattern_4": PromptIndexTargets.prefix("<image>"),
"pattern_5": PromptIndexTargets.end(),
},
{
"pattern_1": [{ "start_idx": 0, "end_idx": 0 }],
"pattern_2": [],
"pattern_3": [
{ "start_idx": 0, "end_idx": 0 },
],
"pattern_4": [],
"pattern_5": [
{ "start_idx": 0, "end_idx": 0 },
],
}
),
(
......@@ -195,6 +301,9 @@ def test_find_token_matches(prompt, target_by_key, expected_by_key):
"pattern_1": "<image>",
"pattern_2": "<image><image>",
"pattern_3": "<image><image><image>",
"pattern_4": PromptIndexTargets.start(),
"pattern_5": PromptIndexTargets.prefix("<image>"),
"pattern_6": PromptIndexTargets.end(),
},
{
"pattern_1": [
......@@ -210,6 +319,15 @@ def test_find_token_matches(prompt, target_by_key, expected_by_key):
"pattern_3": [
{ "start_idx": 0, "end_idx": 21 },
],
"pattern_4": [
{ "start_idx": 0, "end_idx": 0 },
],
"pattern_5": [
{ "start_idx": 7, "end_idx": 7 },
],
"pattern_6": [
{ "start_idx": 28, "end_idx": 28 },
],
},
),
(
......@@ -218,6 +336,9 @@ def test_find_token_matches(prompt, target_by_key, expected_by_key):
"pattern_1": "Image:<image>",
"pattern_2": "Image:<image><image><image>",
"pattern_3": "Image:<unk><image>",
"pattern_4": PromptIndexTargets.start(),
"pattern_5": PromptIndexTargets.prefix("Image:<image>"),
"pattern_6": PromptIndexTargets.end(),
},
{
"pattern_1": [
......@@ -228,6 +349,15 @@ def test_find_token_matches(prompt, target_by_key, expected_by_key):
{ "start_idx": 0, "end_idx": 27 },
],
"pattern_3": [],
"pattern_4": [
{ "start_idx": 0, "end_idx": 0 },
],
"pattern_5": [
{ "start_idx": 13, "end_idx": 13 },
],
"pattern_6": [
{ "start_idx": 48, "end_idx": 48 },
],
},
),
# Test regex escape
......@@ -254,16 +384,22 @@ def test_find_token_matches(prompt, target_by_key, expected_by_key):
),
],
)
@pytest.mark.parametrize("update_type", [PromptInsertion, PromptReplacement])
# yapf: enable
def test_find_text_matches(prompt, target_by_key, expected_by_key):
def test_find_text_matches(
prompt,
target_by_key,
expected_by_key,
update_type,
):
# Should not be used since there is nothing to convert to text
mock_tokenizer = cast(AnyTokenizer, object())
prompt_repls = [
PromptReplacement(key, target, []).bind(mock_tokenizer)
prompt_updates = [
update_type(key, target, []).bind(mock_tokenizer)
for key, target in target_by_key.items()
]
result = find_text_matches(prompt, prompt_repls)
result = find_text_matches(prompt, prompt_updates)
# Only displayed on error
print("result:", result)
......@@ -281,7 +417,7 @@ def test_find_text_matches(prompt, target_by_key, expected_by_key):
# yapf: disable
@pytest.mark.parametrize(
("prompt", "target_by_key", "repl_by_key"),
("prompt", "target_by_key", "repl_by_key", "expected_by_update_type_mm_count"), # noqa: E501
[
(
"Image:<image>Image:<image><image>!",
......@@ -300,58 +436,160 @@ def test_find_text_matches(prompt, target_by_key, expected_by_key):
# Test dynamic replacement (beyond the form of `unit * count`)
"pattern_3": "?!?",
},
{
PromptInsertion: {
0: "Image:<image>Image:<image><image>!",
1: "Image:<image><image><image>Image:<image><image>!?!?",
2: "Image:<image><image><image><image><image>Image:<image><image>!?!??!?", # noqa: E501
},
PromptReplacement: {
0: "Image:<image>Image:<image><image>!",
1: "<image><image>Image:<image><image>?!?",
2: "<image><image><image><image><image>?!?",
},
},
),
# Test index targets
(
"",
{
"pattern_1": PromptIndexTargets.start(),
"pattern_2": PromptIndexTargets.prefix("<image>"),
"pattern_3": PromptIndexTargets.end(),
},
{
"pattern_1": "1",
"pattern_2": "2",
"pattern_3": "3",
},
{
PromptInsertion: {
0: "",
1: "13",
2: "1133",
},
PromptReplacement: {
0: "",
1: "13",
2: "1133",
},
},
),
(
"<image>",
{
"pattern_1": PromptIndexTargets.start(),
"pattern_2": PromptIndexTargets.prefix("<image>"),
"pattern_3": PromptIndexTargets.end(),
},
{
"pattern_1": "1",
"pattern_2": "2",
"pattern_3": "3",
},
{
PromptInsertion: {
0: "<image>",
1: "1<image>23",
2: "11<image>2233",
},
PromptReplacement: {
0: "<image>",
1: "1<image>23",
2: "11<image>2233",
},
},
),
# Test different replacement per item
(
"<image><image><image>",
{
"pattern_1": "<image>",
},
{
"pattern_1": lambda idx: str(idx + 1),
},
{
PromptInsertion: {
0: "<image><image><image>",
1: "<image>1<image><image>",
2: "<image>12<image><image>",
},
PromptReplacement: {
0: "<image><image><image>",
1: "1<image><image>",
2: "12<image>",
},
},
),
(
"<image><image><image>",
{
"pattern_1": PromptIndexTargets.prefix("<image>"),
},
{
"pattern_1": lambda idx: str(idx + 1),
},
{
PromptInsertion: {
0: "<image><image><image>",
1: "<image>1<image><image>",
2: "<image>12<image><image>",
},
PromptReplacement: {
0: "<image><image><image>",
1: "<image>1<image><image>",
2: "<image>12<image><image>",
},
},
),
]
)
@pytest.mark.parametrize(
("mm_count", "expected"),
[
(0, "Image:<image>Image:<image><image>!"),
(1, "<image><image>Image:<image><image>?!?"),
(2, "<image><image><image><image><image>?!?"),
]
)
# yapf: enable
def test_find_replace_text(
def test_find_update_text(
prompt,
target_by_key,
repl_by_key,
mm_count,
expected,
expected_by_update_type_mm_count,
):
# Should not be used since there is nothing to convert to text
mock_tokenizer = cast(AnyTokenizer, object())
mm_prompt_repls = {
key: [
PromptReplacement(key, target,
repl_by_key[key]).bind(mock_tokenizer)
]
for key, target in target_by_key.items()
}
mm_matches = {
key: find_text_matches(prompt, prompt_repls)
for key, prompt_repls in mm_prompt_repls.items()
}
result = replace_text_matches(
prompt,
mm_matches,
{key: mm_count
for key in repl_by_key},
)
# Only displayed on error
print("mm_matches:", mm_matches)
print("result:", result)
# Manually constructed results
assert result == expected
for (
update_type,
expected_by_mm_count,
) in expected_by_update_type_mm_count.items():
mm_prompt_updates = {
key:
[update_type(key, target, repl_by_key[key]).bind(mock_tokenizer)]
for key, target in target_by_key.items()
}
mm_matches = {
key: find_text_matches(prompt, updates)
for key, updates in mm_prompt_updates.items()
}
for mm_count, expected in expected_by_mm_count.items():
result = apply_text_matches(
prompt,
mm_matches,
{key: mm_count
for key in repl_by_key},
)
# Only displayed on error
print("update_type:", update_type)
print("mm_count:", mm_count)
print("mm_matches:", mm_matches)
print("result:", result)
# Manually constructed results
assert result == expected
# yapf: disable
@pytest.mark.parametrize(
("prompt", "target_by_key", "repl_by_key"),
("prompt", "target_by_key", "repl_by_key", "expected_by_update_type_mm_count"), # noqa: E501
[
# Tokenized test cases of `test_find_update_text`
# using the vocab of llava-hf/llava-v1.6-mistral-7b-hf
......@@ -372,53 +610,155 @@ def test_find_replace_text(
# Test dynamic replacement (beyond the form of `unit * count`)
"pattern_3": [1550, 918, 1550],
},
{
PromptInsertion: {
0: [1, 9833, 28747, 32000, 9833, 28747, 32000, 32000, 918],
1: [1, 9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918, 1550, 918, 1550], # noqa: E501
2: [1, 9833, 28747, 32000, 32000, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918, 1550, 918, 1550, 1550, 918, 1550], # noqa: E501
},
PromptReplacement: {
0: [1, 9833, 28747, 32000, 9833, 28747, 32000, 32000, 918],
1: [1, 32000, 32000, 9833, 28747, 32000, 32000, 1550, 918, 1550], # noqa: E501
2: [1, 32000, 32000, 32000, 32000, 32000, 1550, 918, 1550],
},
},
),
# Test index targets
(
[],
{
"pattern_1": PromptIndexTargets.start(),
"pattern_2": PromptIndexTargets.prefix([32000]),
"pattern_3": PromptIndexTargets.end(),
},
{
"pattern_1": [-1],
"pattern_2": [-2],
"pattern_3": [-3],
},
{
PromptInsertion: {
0: [],
1: [-1, -3],
2: [-1, -1, -3, -3],
},
PromptReplacement: {
0: [],
1: [-1, -3],
2: [-1, -1, -3, -3],
},
},
),
(
[32000],
{
"pattern_1": PromptIndexTargets.start(),
"pattern_2": PromptIndexTargets.prefix([32000]),
"pattern_3": PromptIndexTargets.end(),
},
{
"pattern_1": [-1],
"pattern_2": [-2],
"pattern_3": [-3],
},
{
PromptInsertion: {
0: [32000],
1: [-1, 32000, -2, -3],
2: [-1, -1, 32000, -2, -2, -3, -3],
},
PromptReplacement: {
0: [32000],
1: [-1, 32000, -2, -3],
2: [-1, -1, 32000, -2, -2, -3, -3],
},
},
),
# Test different replacement per item
(
[32000, 32000, 32000],
{
"pattern_1": [32000],
},
{
"pattern_1": lambda idx: [-(idx + 1)],
},
{
PromptInsertion: {
0: [32000, 32000, 32000],
1: [32000, -1, 32000, 32000],
2: [32000, -1, -2, 32000, 32000],
},
PromptReplacement: {
0: [32000, 32000, 32000],
1: [-1, 32000, 32000],
2: [-1, -2, 32000],
},
},
),
(
[32000, 32000, 32000],
{
"pattern_1": PromptIndexTargets.prefix([32000]),
},
{
"pattern_1": lambda idx: [-(idx + 1)],
},
{
PromptInsertion: {
0: [32000, 32000, 32000],
1: [32000, -1, 32000, 32000],
2: [32000, -1, -2, 32000, 32000],
},
PromptReplacement: {
0: [32000, 32000, 32000],
1: [32000, -1, 32000, 32000],
2: [32000, -1, -2, 32000, 32000],
},
},
),
]
)
@pytest.mark.parametrize(
("mm_count", "expected"),
[
(0, [1, 9833, 28747, 32000, 9833, 28747, 32000, 32000, 918]),
(1, [1, 32000, 32000, 9833, 28747, 32000, 32000, 1550, 918, 1550]),
(2, [1, 32000, 32000, 32000, 32000, 32000, 1550, 918, 1550]),
]
)
# yapf: enable
def test_find_replace_tokens(
def test_find_update_tokens(
prompt,
target_by_key,
repl_by_key,
mm_count,
expected,
expected_by_update_type_mm_count,
):
# Should not be used since there is nothing to convert to tokens
mock_tokenizer = cast(AnyTokenizer, object())
mm_prompt_repls = {
key: [
PromptReplacement(key, target,
repl_by_key[key]).bind(mock_tokenizer)
]
for key, target in target_by_key.items()
}
mm_matches = {
key: find_token_matches(prompt, prompt_repls)
for key, prompt_repls in mm_prompt_repls.items()
}
result = replace_token_matches(
prompt,
mm_matches,
{key: mm_count
for key in repl_by_key},
)
# Only displayed on error
print("mm_matches:", mm_matches)
print("result:", result)
# Manually constructed results
assert result == expected
for (
update_type,
expected_by_mm_count,
) in expected_by_update_type_mm_count.items():
mm_prompt_updates = {
key:
[update_type(key, target, repl_by_key[key]).bind(mock_tokenizer)]
for key, target in target_by_key.items()
}
mm_matches = {
key: find_token_matches(prompt, updates)
for key, updates in mm_prompt_updates.items()
}
for mm_count, expected in expected_by_mm_count.items():
result = apply_token_matches(
prompt,
mm_matches,
{key: mm_count
for key in repl_by_key},
)
# Only displayed on error
print("update_type:", update_type)
print("mm_count:", mm_count)
print("mm_matches:", mm_matches)
print("result:", result)
# Manually constructed results
assert result == expected
# yapf: disable
......@@ -524,22 +864,24 @@ def test_find_replace_tokens(
),
]
)
@pytest.mark.parametrize("update_type", [PromptInsertion, PromptReplacement])
# yapf: enable
def test_find_mm_placeholders(
repl_by_key,
prompt,
expected,
update_type,
):
# Should not be used since there is nothing to convert to tokens
mock_tokenizer = cast(AnyTokenizer, object())
mm_prompt_repls = {
key: [PromptReplacement(key, [], repl).bind(mock_tokenizer)]
mm_prompt_updates = {
key: [update_type(key, [], repl).bind(mock_tokenizer)]
for key, repl in repl_by_key.items()
}
result = find_mm_placeholders(
mm_prompt_repls,
mm_prompt_updates,
prompt,
# Effectively match all occurrences in the prompt
{key: 3
......@@ -553,8 +895,46 @@ def test_find_mm_placeholders(
assert result == expected
def _dummy_elem(modality: str, key: str, size: int):
    """Build a placeholder `MultiModalFieldElem` for cache-size tests.

    The element carries an uninitialized int8 tensor with `size` entries
    and a `MultiModalSharedField(1)` field; only `modality`, `key` and
    `size` vary between calls.
    """
    # Contents are irrelevant here, so the tensor is left uninitialized
    payload = torch.empty((size, ), dtype=torch.int8)
    shared_field = MultiModalSharedField(1)
    return MultiModalFieldElem(
        modality=modality,
        key=key,
        data=payload,
        field=shared_field,
    )
def _dummy_item(modality: str, size_by_key: dict[str, int]):
    """Build a `MultiModalKwargsItem` from dummy elements of one modality.

    `size_by_key` maps each element key to the `size` handed to
    `_dummy_elem`; one element is created per entry, in mapping order.
    """
    elems = []
    for key, size in size_by_key.items():
        elems.append(_dummy_elem(modality, key, size))
    return MultiModalKwargsItem.from_elems(elems)
def _dummy_kw(size_by_key_modality: dict[str, dict[str, int]]):
    """Build a `MultiModalKwargs` holding one dummy item per modality.

    `size_by_key_modality` maps modality -> {key: size}; each inner mapping
    is turned into an item via `_dummy_item`, in mapping order.
    """
    items = []
    for modality, size_by_key in size_by_key_modality.items():
        items.append(_dummy_item(modality, size_by_key))
    return MultiModalKwargs.from_items(items)
# yapf: disable
@pytest.mark.parametrize(
"model_id", ["s3://vllm-ci-model-weights/llava-v1.6-mistral-7b-hf"])
("item", "expected_size"),
[
(_dummy_item("a", {"a1": 100}), 100),
(_dummy_item("a", {"a1": 100, "a2": 110}), 210),
(_dummy_kw({"a": {"a1": 100, "a2": 110}, "b": {"b1": 120, "b2": 130}}), 460), # noqa: E501
],
)
# yapf: enable
def test_cache_item_size(item, expected_size):
    """Check the size an LRU processing cache reports for a single entry.

    A cache is obtained from `ProcessingCache.get_lru_cache` with a
    capacity of 2048 for `type(item)`; after storing `item` under one key,
    `currsize` must equal `expected_size`.
    """
    lru = ProcessingCache.get_lru_cache(2048, type(item))
    # The key itself is irrelevant; only the stored value is measured here
    lru[""] = item
    assert lru.currsize == expected_size
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
@pytest.mark.parametrize(
("limit", "num_supported", "is_valid"),
[(0, 0, True), (0, 1, True), (1, 0, False), (1, 1, True), (1, 2, True),
......@@ -570,7 +950,7 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="half",
dtype="auto",
revision=None,
limit_mm_per_prompt=limit_mm_per_prompt,
)
......@@ -590,11 +970,10 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
exc_ctx = pytest.raises(ValueError, match="this model only supports")
with exc_ctx:
profiler.get_dummy_data(model_config.max_model_len)
profiler.get_decoder_dummy_data(model_config.max_model_len)
@pytest.mark.parametrize(
"model_id", ["s3://vllm-ci-model-weights/llava-v1.6-mistral-7b-hf"])
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
@pytest.mark.parametrize(
("num_images", "limit", "is_valid"),
[(0, 0, True), (0, 1, True), (1, 0, False), (1, 1, True), (1, 2, True),
......@@ -610,7 +989,7 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="half",
dtype="auto",
revision=None,
limit_mm_per_prompt=limit_mm_per_prompt,
)
......@@ -683,7 +1062,7 @@ def test_hf_processor_kwargs(model_id, call_kwargs, expected_kwargs):
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
dtype="half",
dtype="auto",
revision=None,
)
......
......@@ -4,7 +4,7 @@ import base64
import mimetypes
import os
from tempfile import NamedTemporaryFile, TemporaryDirectory
from typing import TYPE_CHECKING, Dict, NamedTuple, Optional, Tuple
from typing import TYPE_CHECKING, NamedTuple, Optional
import numpy as np
import pytest
......@@ -33,7 +33,7 @@ TEST_IMAGE_URLS = [
@pytest.fixture(scope="module")
def url_images() -> Dict[str, Image.Image]:
def url_images() -> dict[str, Image.Image]:
connector = MediaConnector()
return {
......@@ -42,7 +42,7 @@ def url_images() -> Dict[str, Image.Image]:
}
def get_supported_suffixes() -> Tuple[str, ...]:
def get_supported_suffixes() -> tuple[str, ...]:
# We should at least test the file types mentioned in GPT-4 with Vision
OPENAI_SUPPORTED_SUFFIXES = ('.png', '.jpeg', '.jpg', '.webp', '.gif')
......@@ -69,7 +69,7 @@ async def test_fetch_image_http(image_url: str):
@pytest.mark.asyncio
@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
@pytest.mark.parametrize("suffix", get_supported_suffixes())
async def test_fetch_image_base64(url_images: Dict[str, Image.Image],
async def test_fetch_image_base64(url_images: dict[str, Image.Image],
image_url: str, suffix: str):
connector = MediaConnector()
url_image = url_images[image_url]
......
# SPDX-License-Identifier: Apache-2.0
import pytest
import torch
import torch.nn.functional as F
from vllm.model_executor.layers.activation import FastGELU, SiluAndMul
from vllm.platforms import current_platform
@pytest.mark.parametrize("activation", ["silu_and_mul", "gelu_fast"])
@pytest.mark.parametrize("num_tokens,d,dtype", [
    (7, 512, torch.half),
    (7, 512, torch.float),
    (83, 512, torch.half),
])
@torch.inference_mode()
def test_act_and_mul(
    activation: str,
    num_tokens: int,
    d: int,
    dtype: torch.dtype,
) -> None:
    """Compare a layer's ``forward_neuron`` output with a CPU reference.

    A random ``(num_tokens, 2 * d)`` tensor of the given ``dtype`` is created
    on CPU and moved to the XLA device. The selected activation layer is run
    through ``forward_neuron`` on that device, and the result is compared on
    CPU against a reference with ``atol=0.01`` and ``rtol=0.0``.

    The reference is ``SiluAndMul.forward_native`` for ``"silu_and_mul"`` and
    ``torch.nn.functional.gelu`` for ``"gelu_fast"``.

    Raises:
        NotImplementedError: If ``activation`` is neither of the two names
            above.
    """
    # Imported lazily so that merely collecting this module does not
    # require torch_xla to be importable.
    import torch_xla.core.xla_model as xm

    xla_device = xm.xla_device()
    current_platform.seed_everything(0)
    # The random input is generated on CPU and only then moved to XLA.
    torch.set_default_device("cpu")
    x = torch.randn(num_tokens, 2 * d, dtype=dtype).to(device=xla_device)

    # Guard clause: reject anything other than the two known activations.
    if activation not in ("silu_and_mul", "gelu_fast"):
        raise NotImplementedError(
            f"activation {activation} is not implemented.")

    if activation == "silu_and_mul":
        layer = SiluAndMul()
        reference_fn = layer.forward_native
    else:
        layer = FastGELU()
        reference_fn = F.gelu

    assert x.is_xla, "input tensor under testing is expected to be XLA tensor."

    device_layer = layer.to(device=xla_device)
    actual = device_layer.forward_neuron(x)
    expected = reference_fn(x.cpu())
    torch.testing.assert_close(actual.cpu(), expected, atol=0.01, rtol=0.0)
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment