Commit 2216a4e5 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge remote-tracking branch 'mirror/main'

parents ad385667 51c24c97
import contextlib
import gc
import tempfile
from collections import OrderedDict
from typing import Dict, List, TypedDict
from unittest.mock import MagicMock, patch
import pytest
import ray
import torch
import torch.nn as nn
from huggingface_hub import snapshot_download
import vllm
from vllm.config import LoRAConfig
from vllm.distributed import (destroy_distributed_environment,
destroy_model_parallel,
from vllm.distributed import (cleanup_dist_env_and_memory,
init_distributed_environment,
initialize_model_parallel)
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
......@@ -48,16 +44,6 @@ LONG_LORA_INFOS: List[ContextIDInfo] = [{
}]
def cleanup():
destroy_model_parallel()
destroy_distributed_environment()
with contextlib.suppress(AssertionError):
torch.distributed.destroy_process_group()
gc.collect()
torch.cuda.empty_cache()
ray.shutdown()
@pytest.fixture()
def should_do_global_cleanup_after_test(request) -> bool:
"""Allow subdirectories to skip global cleanup by overriding this fixture.
......@@ -72,7 +58,7 @@ def should_do_global_cleanup_after_test(request) -> bool:
def cleanup_fixture(should_do_global_cleanup_after_test: bool):
yield
if should_do_global_cleanup_after_test:
cleanup()
cleanup_dist_env_and_memory(shutdown_ray=True)
@pytest.fixture
......@@ -87,7 +73,7 @@ def dist_init():
)
initialize_model_parallel(1, 1)
yield
cleanup()
cleanup_dist_env_and_memory(shutdown_ray=True)
@pytest.fixture
......@@ -238,7 +224,7 @@ def long_context_lora_files_32k():
def long_context_infos(long_context_lora_files_16k_1,
long_context_lora_files_16k_2,
long_context_lora_files_32k):
cleanup()
cleanup_dist_env_and_memory(shutdown_ray=True)
infos: Dict[int, ContextInfo] = {}
for lora_checkpoint_info in LONG_LORA_INFOS:
lora_id = lora_checkpoint_info["lora_id"]
......@@ -259,7 +245,7 @@ def long_context_infos(long_context_lora_files_16k_1,
@pytest.fixture
def llama_2_7b_engine_extra_embeddings():
cleanup()
cleanup_dist_env_and_memory(shutdown_ray=True)
get_model_old = get_model
def get_model_patched(*, model_config, device_config, **kwargs):
......@@ -272,7 +258,7 @@ def llama_2_7b_engine_extra_embeddings():
engine = vllm.LLM("meta-llama/Llama-2-7b-hf", enable_lora=False)
yield engine.llm_engine
del engine
cleanup()
cleanup_dist_env_and_memory(shutdown_ray=True)
@pytest.fixture
......
......@@ -3,10 +3,9 @@ from typing import List
import pytest
import vllm
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.lora.request import LoRARequest
from .conftest import cleanup
MODEL_PATH = "baichuan-inc/Baichuan-7B"
PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:""" # noqa: E501
......@@ -80,7 +79,7 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files,
output_tp1 = do_sample(llm_tp1, baichuan_lora_files, lora_id=1)
del llm_tp1
cleanup()
cleanup_dist_env_and_memory()
llm_tp2 = vllm.LLM(MODEL_PATH,
enable_lora=True,
......@@ -93,7 +92,7 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files,
output_tp2 = do_sample(llm_tp2, baichuan_lora_files, lora_id=2)
del llm_tp2
cleanup()
cleanup_dist_env_and_memory()
assert output_tp1 == output_tp2
......@@ -108,6 +107,6 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files,
output_tp4 = do_sample(llm_tp4, baichuan_lora_files, lora_id=2)
del llm_tp4
cleanup()
cleanup_dist_env_and_memory()
assert output_tp1 == output_tp4
......@@ -4,10 +4,9 @@ import pytest
import ray
import vllm
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.lora.request import LoRARequest
from .conftest import cleanup
MODEL_PATH = "meta-llama/Llama-2-7b-hf"
......@@ -93,7 +92,7 @@ def test_llama_tensor_parallel_equality(sql_lora_files, num_gpus_available):
output_tp1 = do_sample(llm_tp1, sql_lora_files, lora_id=1)
del llm_tp1
cleanup()
cleanup_dist_env_and_memory()
llm_tp2 = vllm.LLM(MODEL_PATH,
enable_lora=True,
......@@ -103,7 +102,7 @@ def test_llama_tensor_parallel_equality(sql_lora_files, num_gpus_available):
output_tp2 = do_sample(llm_tp2, sql_lora_files, lora_id=1)
del llm_tp2
cleanup()
cleanup_dist_env_and_memory()
assert output_tp1 == output_tp2
......@@ -115,7 +114,7 @@ def test_llama_tensor_parallel_equality(sql_lora_files, num_gpus_available):
output_tp4 = do_sample(llm_tp4, sql_lora_files, lora_id=1)
del llm_tp4
cleanup()
cleanup_dist_env_and_memory()
assert output_tp1 == output_tp4
......
......@@ -28,9 +28,15 @@ sampling_params = SamplingParams(
def _create_lora_request(lora_id, long_context_infos):
context_len = long_context_infos[lora_id]["context_length"]
scaling_factor = context_len_to_scaling_factor[context_len]
return LoRARequest(context_len, lora_id,
long_context_infos[lora_id]["lora"], None,
4096 * scaling_factor)
return LoRARequest(
# There are 2 LoRAs for 16K, we need to add lora_id to indicate
# they are different LoRAs.
context_len + str(lora_id),
lora_id,
long_context_infos[lora_id]["lora"],
None,
4096 * scaling_factor,
)
def evaluate_json_response(model_response, golden_response):
......@@ -108,14 +114,17 @@ def lora_llm(long_context_infos):
for info in long_context_infos.values()
]
llm = vllm.LLM("meta-llama/Llama-2-13b-chat-hf",
enable_lora=True,
max_num_seqs=16,
max_loras=2,
long_lora_scaling_factors=tuple(scaling_factors),
max_num_batched_tokens=4096 * 8,
tensor_parallel_size=4,
distributed_executor_backend="mp")
llm = vllm.LLM(
"meta-llama/Llama-2-13b-chat-hf",
enable_lora=True,
max_num_seqs=16,
max_loras=2,
long_lora_scaling_factors=tuple(scaling_factors),
max_num_batched_tokens=4096 * 8,
tensor_parallel_size=4,
# FIXME enable async output processor
disable_async_output_proc=True,
distributed_executor_backend="mp")
yield llm
del llm
......
......@@ -61,6 +61,7 @@ def test_minicpmv_lora(minicpmv_lora_files):
max_loras=4,
max_lora_rank=64,
trust_remote_code=True,
gpu_memory_utilization=0.97 # This model is pretty big for CI gpus
)
output1 = do_sample(llm, minicpmv_lora_files, lora_id=1)
......
......@@ -6,11 +6,10 @@ from typing import List
import pytest
import vllm
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.lora.request import LoRARequest
from vllm.utils import is_hip
from .conftest import cleanup
@dataclass
class ModelWithQuantization:
......@@ -160,7 +159,7 @@ def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model,
print("removing lora")
del llm
cleanup()
cleanup_dist_env_and_memory()
@pytest.mark.parametrize("model", MODELS)
......@@ -181,7 +180,7 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available,
output_tp1 = do_sample(llm_tp1, tinyllama_lora_files, lora_id=1)
del llm_tp1
cleanup()
cleanup_dist_env_and_memory()
llm_tp2 = vllm.LLM(
model=model.model_path,
......@@ -194,6 +193,6 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available,
output_tp2 = do_sample(llm_tp2, tinyllama_lora_files, lora_id=1)
del llm_tp2
cleanup()
cleanup_dist_env_and_memory()
assert output_tp1 == output_tp2
......@@ -15,7 +15,8 @@ def test_worker_apply_lora(sql_lora_files):
worker = Worker(
model_config=ModelConfig(
"meta-llama/Llama-2-7b-hf",
"meta-llama/Llama-2-7b-hf",
task="auto",
tokenizer="meta-llama/Llama-2-7b-hf",
tokenizer_mode="auto",
trust_remote_code=False,
seed=0,
......@@ -27,7 +28,7 @@ def test_worker_apply_lora(sql_lora_files):
load_format="dummy",
),
parallel_config=ParallelConfig(1, 1, False),
scheduler_config=SchedulerConfig(32, 32, 32),
scheduler_config=SchedulerConfig("generate", 32, 32, 32),
device_config=DeviceConfig("cuda"),
cache_config=CacheConfig(block_size=16,
gpu_memory_utilization=1.,
......
......@@ -6,13 +6,12 @@ import ray
from prometheus_client import REGISTRY
from vllm import EngineArgs, LLMEngine
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.metrics import RayPrometheusStatLogger
from vllm.sampling_params import SamplingParams
from ..conftest import cleanup
MODELS = [
"facebook/opt-125m",
]
......@@ -85,6 +84,45 @@ def test_metric_counter_generation_tokens(
f"metric: {metric_count!r}")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [128, 129])
@pytest.mark.parametrize("disable_async_output_proc", [True, False])
def test_metric_counter_generation_tokens_multi_step(
vllm_runner,
example_prompts,
model: str,
max_tokens: int,
disable_async_output_proc: bool,
) -> None:
num_scheduler_steps = 8
with vllm_runner(
model,
disable_log_stats=False,
gpu_memory_utilization=0.4,
num_scheduler_steps=num_scheduler_steps,
disable_async_output_proc=disable_async_output_proc,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
tokenizer = vllm_model.model.get_tokenizer()
stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus']
metric_count = stat_logger.metrics.counter_generation_tokens.labels(
**stat_logger.labels)._value.get()
vllm_generation_count = 0
for i in range(len(example_prompts)):
vllm_output_ids, vllm_output_str = vllm_outputs[i]
prompt_ids = tokenizer.encode(example_prompts[i])
# vllm_output_ids contains both prompt tokens and generation tokens.
# We're interested only in the count of the generation tokens.
vllm_generation_count += len(vllm_output_ids) - len(prompt_ids)
# The multi-step scheduling will continue to execute forward even when
# encountering EOS, leading to slightly imprecise metrics.
assert abs(vllm_generation_count - metric_count) <\
len(example_prompts) * num_scheduler_steps, \
(f"generation token count: {vllm_generation_count!r}\n"
f"metric: {metric_count!r}")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize(
......@@ -307,7 +345,7 @@ def test_metric_spec_decode_interval(
finally:
del engine
cleanup()
cleanup_dist_env_and_memory()
def assert_metrics(engine: LLMEngine, disable_log_stats: bool,
......
import os
from typing import List
import pytest
from vllm.model_executor.custom_op import CustomOp
from vllm.model_executor.layers.activation import (GeluAndMul,
ReLUSquaredActivation,
SiluAndMul)
from vllm.model_executor.layers.layernorm import RMSNorm
# Registered subclass for test
@CustomOp.register("relu3")
class Relu3(ReLUSquaredActivation):
pass
@pytest.mark.parametrize(
"env, torch_level, ops_enabled, default_on",
[
# Default values based on compile level
("", 0, [True] * 4, True),
("", 1, [True] * 4, True),
("", 2, [True] * 4, True), # All by default
("", 3, [False] * 4, False),
("", 4, [False] * 4, False), # None by default
# Explicitly enabling/disabling
#
# Default: all
#
# All but SiluAndMul
("+rms_norm,-silu_and_mul", 0, [1, 0, 1, 1], True),
# Only ReLU3
("none,-rms_norm,+relu3", 0, [0, 0, 0, 1], False),
# All but SiluAndMul
("all,-silu_and_mul", 1, [1, 0, 1, 1], True),
# All but ReLU3 (even if ReLU2 is on)
("-relu3,relu2", 1, [1, 1, 1, 0], True),
# GeluAndMul and SiluAndMul
("none,-relu3,+gelu_and_mul,+silu_and_mul", 2, [0, 1, 1, 0], False),
# All but RMSNorm
("-rms_norm", 2, [0, 1, 1, 1], True),
#
# Default: none
#
# Only ReLU3
("-silu_and_mul,+relu3", 3, [0, 0, 0, 1], False),
# All but RMSNorm
("all,-rms_norm", 4, [0, 1, 1, 1], True),
])
def test_enabled_ops(env: str, torch_level: int, ops_enabled: List[int],
default_on: bool):
os.environ["VLLM_CUSTOM_OPS"] = env
os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(torch_level)
# Reset default_on (computed once):
CustomOp.default_on.cache_clear()
assert CustomOp.default_on() == default_on
ops_enabled = [bool(x) for x in ops_enabled]
assert RMSNorm(1024).enabled() == ops_enabled[0]
assert CustomOp.op_registry["rms_norm"].enabled() == ops_enabled[0]
assert SiluAndMul().enabled() == ops_enabled[1]
assert CustomOp.op_registry["silu_and_mul"].enabled() == ops_enabled[1]
assert GeluAndMul().enabled() == ops_enabled[2]
assert CustomOp.op_registry["gelu_and_mul"].enabled() == ops_enabled[2]
# If registered, subclasses should follow their own name
assert Relu3().enabled() == ops_enabled[3]
assert CustomOp.op_registry["relu3"].enabled() == ops_enabled[3]
# Unregistered subclass
class SiluAndMul2(SiluAndMul):
pass
# Subclasses should not require registration
assert SiluAndMul2().enabled() == SiluAndMul().enabled()
@pytest.mark.parametrize(
"env", ["all,none", "all,+rms_norm,all", "+rms_norm,-rms_norm"])
def test_enabled_ops_invalid(env: str):
os.environ["VLLM_CUSTOM_OPS"] = env
CustomOp.default_on.cache_clear()
with pytest.raises(AssertionError):
RMSNorm(1024).enabled()
......@@ -21,10 +21,14 @@ MODELS = [
]
if not current_platform.is_cpu():
# MiniCPM requires fused_moe which is not supported by CPU
MODELS.append("openbmb/MiniCPM3-4B")
MODELS += [
# fused_moe which not supported on CPU
"openbmb/MiniCPM3-4B",
# Head size isn't supported on CPU
"h2oai/h2o-danube3-4b-base",
]
#TODO: remove this after CPU float16 support ready
# TODO: remove this after CPU float16 support ready
target_dtype = "float" if current_platform.is_cpu() else "half"
......
"""Compare the outputs of HF and vLLM when using greedy sampling.
This tests danube3 separately because its head size isn't supported on CPU yet.
Run `pytest tests/models/test_danube3_4b.py`.
"""
import pytest
from ...utils import check_outputs_equal
MODELS = ["h2oai/h2o-danube3-4b-base"]
target_dtype = "half"
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [32])
def test_models(
hf_runner,
vllm_runner,
example_prompts,
model: str,
dtype: str,
max_tokens: int,
) -> None:
with hf_runner(model, dtype=dtype) as hf_model:
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
with vllm_runner(model, dtype=dtype) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
check_outputs_equal(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", [target_dtype])
def test_model_print(
vllm_runner,
model: str,
dtype: str,
) -> None:
with vllm_runner(model, dtype=dtype) as vllm_model:
# This test is for verifying whether the model's extra_repr
# can be printed correctly.
print(vllm_model.model.llm_engine.model_executor.driver_worker.
model_runner.model)
......@@ -10,7 +10,7 @@ from vllm.worker.model_runner import _get_graph_batch_size
from ...utils import check_outputs_equal
MODELS = ["state-spaces/mamba-130m-hf"]
MODELS = ["state-spaces/mamba-130m-hf", "tiiuae/falcon-mamba-tiny-dev"]
# Use lower-level interfaces to create this greedy generator, as mamba will
......
......@@ -5,7 +5,7 @@ Run `pytest tests/models/test_phimoe.py`.
import pytest
import torch
from vllm.utils import is_cpu
from vllm.platforms import current_platform
from ....utils import large_gpu_test
from ...utils import check_logprobs_close
......@@ -70,7 +70,7 @@ def test_phimoe_routing_function():
assert torch.equal(topk_ids, ground_truth[test_id]["topk_ids"])
@pytest.mark.skipif(condition=is_cpu(),
@pytest.mark.skipif(condition=current_platform.is_cpu(),
reason="This test takes a lot time to run on CPU, "
"and vllm CI's disk space is not enough for this model.")
@large_gpu_test(min_gb=80)
......
......@@ -3,8 +3,8 @@ from typing import List, Optional, Tuple, Type
import pytest
from vllm.multimodal.utils import rescale_image_size
from vllm.platforms import current_platform
from vllm.sequence import SampleLogprobs
from vllm.utils import is_cpu
from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
from ...utils import check_logprobs_close
......@@ -46,7 +46,7 @@ def run_test(
All the image fixtures for the test are from IMAGE_ASSETS.
For huggingface runner, we provide the PIL images as input.
For vllm runner, we provide MultiModalDataDict objects
For vllm runner, we provide MultiModalDataDict objects
and corresponding MultiModalConfig as input.
Note, the text input is also adjusted to abide by vllm contract.
The text output is sanitized to be able to compare with hf.
......@@ -103,7 +103,7 @@ def run_test(
target_dtype = "half"
if is_cpu():
if current_platform.is_cpu():
target_dtype = "bfloat16"
......
......@@ -6,7 +6,7 @@ import torch.nn as nn
from huggingface_hub import snapshot_download
from transformers import AutoConfig, AutoModel, CLIPImageProcessor
from ....conftest import _ImageAssets, cleanup
from ....conftest import _ImageAssets
# we use snapshot_download to prevent conflicts between
# dynamic_module and trust_remote_code for hf_runner
......@@ -45,12 +45,13 @@ def run_intern_vit_test(
for pixel_value in pixel_values
]
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.model_executor.models.intern_vit import InternVisionModel
vllm_model = InternVisionModel(config)
vllm_model.load_weights(hf_model.state_dict().items())
del hf_model
cleanup()
cleanup_dist_env_and_memory()
vllm_model = vllm_model.to("cuda", dtype)
vllm_outputs_per_image = [
......@@ -58,7 +59,7 @@ def run_intern_vit_test(
for pixel_value in pixel_values
]
del vllm_model
cleanup()
cleanup_dist_env_and_memory()
cos_similar = nn.CosineSimilarity(dim=-1)
for vllm_output, hf_output in zip(vllm_outputs_per_image,
......
......@@ -7,7 +7,6 @@ from PIL.Image import Image
from transformers import AutoConfig
from vllm.multimodal.utils import rescale_image_size
from vllm.utils import is_cpu
from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
_ImageAssets)
......@@ -19,15 +18,20 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"cherry_blossom":
"<|im_start|>User\n<image>\nWhat is the season?<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
})
HF_MULTIIMAGE_IMAGE_PROMPT = "<|im_start|>User\nImage-1: <image>\nImage-2: <image>\nDescribe the two images in detail.<|im_end|>\n<|im_start|>Assistant\n" # noqa: E501
HF_MULTIIMAGE_IMAGE_PROMPT = "<|im_start|>User\nImage-1: <image>\nImage-2: <image>\nDescribe the two images in short.<|im_end|>\n<|im_start|>Assistant\n" # noqa: E501
models = [
"OpenGVLab/InternVL2-1B",
"OpenGVLab/InternVL2-2B",
# NOTE: Mono-InternVL-2B doesn't work with fp16,
# it will result NaN during inference.
# See: https://huggingface.co/OpenGVLab/Mono-InternVL-2B/discussions/9
"OpenGVLab/Mono-InternVL-2B",
# Broken due to outdated implementation of Phi-3
# See: https://huggingface.co/OpenGVLab/InternVL2-4B/discussions/3
# "OpenGVLab/InternVL2-4B",
]
target_dtype = "bfloat16"
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py
......@@ -52,9 +56,15 @@ def generate(
input_embeds = input_embeds.reshape(B, N, C)
outputs = self.language_model.generate(
forward_kwargs = dict(
inputs_embeds=input_embeds,
attention_mask=attention_mask,
)
if getattr(self, "use_visual_token_mask", False):
visual_token_mask = selected.reshape(B, N, 1).to(input_embeds.dtype)
forward_kwargs["visual_token_mask"] = visual_token_mask
outputs = self.language_model.generate(
**forward_kwargs,
**generate_kwargs,
)
......@@ -78,7 +88,7 @@ def run_test(
All the image fixtures for the test are from IMAGE_ASSETS.
For huggingface runner, we provide the PIL images as input.
For vllm runner, we provide MultiModalDataDict objects
For vllm runner, we provide MultiModalDataDict objects
and corresponding MultiModalConfig as input.
Note, the text input is also adjusted to abide by vllm contract.
The text output is sanitized to be able to compare with hf.
......@@ -243,11 +253,6 @@ def run_awq_test(
)
target_dtype = "half"
if is_cpu():
target_dtype = "bfloat16"
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
"size_factors",
......
......@@ -10,8 +10,9 @@ from vllm.inputs import InputContext, token_inputs
from vllm.model_executor.models.phi3v import _IMAGE_TOKEN_ID
from vllm.multimodal import MultiModalRegistry
from vllm.multimodal.utils import rescale_image_size
from vllm.platforms import current_platform
from vllm.sequence import SampleLogprobs
from vllm.utils import is_cpu, is_hip
from vllm.utils import is_hip
from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
_ImageAssets)
......@@ -49,7 +50,7 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
target_dtype = "half"
if is_cpu():
if current_platform.is_cpu():
target_dtype = "bfloat16"
# ROCm Triton FA can run into shared memory issues with these models,
......@@ -89,6 +90,7 @@ def run_test(
# max_model_len should be greater than image_feature_size
with vllm_runner(model,
task="generate",
max_model_len=4096,
max_num_seqs=2,
dtype=dtype,
......
......@@ -6,21 +6,31 @@ import pytest
from ..utils import check_embeddings_close
# Model, Guard
MODELS = [
"intfloat/e5-mistral-7b-instruct",
"BAAI/bge-base-en-v1.5",
"BAAI/bge-multilingual-gemma2",
]
ENCODER_ONLY = [
"BAAI/bge-base-en-v1.5",
]
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_models(
monkeypatch,
hf_runner,
vllm_runner,
example_prompts,
model: str,
model,
dtype: str,
) -> None:
if model in ENCODER_ONLY:
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS")
# The example_prompts has ending "\n", for example:
# "Write a short story about a robot that dreams for the first time.\n"
# sentence_transformers will strip the input texts, see:
......@@ -33,7 +43,7 @@ def test_models(
is_sentence_transformer=True) as hf_model:
hf_outputs = hf_model.encode(example_prompts)
with vllm_runner(model, dtype=dtype) as vllm_model:
with vllm_runner(model, dtype=dtype, max_model_len=None) as vllm_model:
vllm_outputs = vllm_model.encode(example_prompts)
check_embeddings_close(
......
......@@ -16,7 +16,8 @@ def check_embeddings_close(
for prompt_idx, (embeddings_0, embeddings_1) in enumerate(
zip(embeddings_0_lst, embeddings_1_lst)):
assert len(embeddings_0) == len(embeddings_1)
assert len(embeddings_0) == len(embeddings_1), (
f"Length mismatch: {len(embeddings_0)} vs. {len(embeddings_1)}")
sim = F.cosine_similarity(torch.tensor(embeddings_0),
torch.tensor(embeddings_1),
......
from typing import List, Type
import pytest
import torch.nn.functional as F
from transformers import AutoModelForVision2Seq
from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
from ....utils import large_gpu_test
from ..utils import check_embeddings_close
llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n' # noqa: E501
HF_TEXT_PROMPTS = [
# T -> X
llama3_template.format(
"The label of the object is stop sign\nSummary above sentence in one word: " # noqa: E501
),
# T -> X
llama3_template.format(
"cherry blossom\nSummary above sentence in one word: "),
]
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
# I -> X
"stop_sign":
llama3_template.format("<image>\nSummary above image in one word: "),
# I -> X
"cherry_blossom":
llama3_template.format("<image>\nSummary above image in one word: "),
})
MODELS = ["royokong/e5-v"]
def _run_test(
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
input_texts: List[str],
input_images: PromptImageInput,
model: str,
*,
dtype: str,
) -> None:
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).
with vllm_runner(model,
task="embedding",
dtype=dtype,
max_model_len=4096,
enforce_eager=True) as vllm_model:
vllm_outputs = vllm_model.encode(input_texts, images=input_images)
with hf_runner(model, dtype=dtype,
auto_cls=AutoModelForVision2Seq) as hf_model:
# Patch the issue where image_token_id
# exceeds the maximum allowed vocab size
hf_model.model.resize_token_embeddings(
hf_model.model.language_model.vocab_size + 1)
all_inputs = hf_model.get_inputs(input_texts, images=input_images)
all_outputs = []
for inputs in all_inputs:
# Based on: https://huggingface.co/royokong/e5-v
outputs = hf_model.model(
**hf_model.wrap_device(inputs,
device=hf_model.model.device.type),
return_dict=True,
output_hidden_states=True,
)
pooled_output = F.normalize(outputs.hidden_states[-1][0, -1, :],
dim=-1)
all_outputs.append(pooled_output.tolist())
hf_outputs = all_outputs
check_embeddings_close(
embeddings_0_lst=hf_outputs,
embeddings_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_models_text(
hf_runner,
vllm_runner,
image_assets,
model: str,
dtype: str,
) -> None:
input_texts_images = [(text, None) for text in HF_TEXT_PROMPTS]
input_texts = [text for text, _ in input_texts_images]
input_images = [image for _, image in input_texts_images]
_run_test(
hf_runner,
vllm_runner,
input_texts,
input_images, # type: ignore
model,
dtype=dtype,
)
@large_gpu_test(min_gb=48)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_models_image(
hf_runner,
vllm_runner,
image_assets,
model: str,
dtype: str,
) -> None:
input_texts_images = [
(text, asset.pil_image)
for text, asset in zip(HF_IMAGE_PROMPTS, image_assets)
]
input_texts = [text for text, _ in input_texts_images]
input_images = [image for _, image in input_texts_images]
_run_test(
hf_runner,
vllm_runner,
input_texts,
input_images,
model,
dtype=dtype,
)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment