Commit 2216a4e5 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge remote-tracking branch 'mirror/main'

parents ad385667 51c24c97
import contextlib
import gc
import tempfile import tempfile
from collections import OrderedDict from collections import OrderedDict
from typing import Dict, List, TypedDict from typing import Dict, List, TypedDict
from unittest.mock import MagicMock, patch from unittest.mock import MagicMock, patch
import pytest import pytest
import ray
import torch import torch
import torch.nn as nn import torch.nn as nn
from huggingface_hub import snapshot_download from huggingface_hub import snapshot_download
import vllm import vllm
from vllm.config import LoRAConfig from vllm.config import LoRAConfig
from vllm.distributed import (destroy_distributed_environment, from vllm.distributed import (cleanup_dist_env_and_memory,
destroy_model_parallel,
init_distributed_environment, init_distributed_environment,
initialize_model_parallel) initialize_model_parallel)
from vllm.model_executor.layers.linear import (ColumnParallelLinear, from vllm.model_executor.layers.linear import (ColumnParallelLinear,
...@@ -48,16 +44,6 @@ LONG_LORA_INFOS: List[ContextIDInfo] = [{ ...@@ -48,16 +44,6 @@ LONG_LORA_INFOS: List[ContextIDInfo] = [{
}] }]
def cleanup():
    """Tear down distributed state and free cached memory between tests.

    Runs, in order: model-parallel teardown, distributed-environment
    teardown, a best-effort ``torch.distributed`` process-group teardown,
    a garbage-collection pass, a CUDA cache flush and a Ray shutdown.
    """
    destroy_model_parallel()
    destroy_distributed_environment()
    try:
        torch.distributed.destroy_process_group()
    except AssertionError:
        # Deliberately ignored so the remaining cleanup steps still run.
        # NOTE(review): presumably raised when no process group was ever
        # initialised -- confirm against the torch.distributed docs.
        pass
    gc.collect()
    torch.cuda.empty_cache()
    ray.shutdown()
@pytest.fixture() @pytest.fixture()
def should_do_global_cleanup_after_test(request) -> bool: def should_do_global_cleanup_after_test(request) -> bool:
"""Allow subdirectories to skip global cleanup by overriding this fixture. """Allow subdirectories to skip global cleanup by overriding this fixture.
...@@ -72,7 +58,7 @@ def should_do_global_cleanup_after_test(request) -> bool: ...@@ -72,7 +58,7 @@ def should_do_global_cleanup_after_test(request) -> bool:
def cleanup_fixture(should_do_global_cleanup_after_test: bool): def cleanup_fixture(should_do_global_cleanup_after_test: bool):
yield yield
if should_do_global_cleanup_after_test: if should_do_global_cleanup_after_test:
cleanup() cleanup_dist_env_and_memory(shutdown_ray=True)
@pytest.fixture @pytest.fixture
...@@ -87,7 +73,7 @@ def dist_init(): ...@@ -87,7 +73,7 @@ def dist_init():
) )
initialize_model_parallel(1, 1) initialize_model_parallel(1, 1)
yield yield
cleanup() cleanup_dist_env_and_memory(shutdown_ray=True)
@pytest.fixture @pytest.fixture
...@@ -238,7 +224,7 @@ def long_context_lora_files_32k(): ...@@ -238,7 +224,7 @@ def long_context_lora_files_32k():
def long_context_infos(long_context_lora_files_16k_1, def long_context_infos(long_context_lora_files_16k_1,
long_context_lora_files_16k_2, long_context_lora_files_16k_2,
long_context_lora_files_32k): long_context_lora_files_32k):
cleanup() cleanup_dist_env_and_memory(shutdown_ray=True)
infos: Dict[int, ContextInfo] = {} infos: Dict[int, ContextInfo] = {}
for lora_checkpoint_info in LONG_LORA_INFOS: for lora_checkpoint_info in LONG_LORA_INFOS:
lora_id = lora_checkpoint_info["lora_id"] lora_id = lora_checkpoint_info["lora_id"]
...@@ -259,7 +245,7 @@ def long_context_infos(long_context_lora_files_16k_1, ...@@ -259,7 +245,7 @@ def long_context_infos(long_context_lora_files_16k_1,
@pytest.fixture @pytest.fixture
def llama_2_7b_engine_extra_embeddings(): def llama_2_7b_engine_extra_embeddings():
cleanup() cleanup_dist_env_and_memory(shutdown_ray=True)
get_model_old = get_model get_model_old = get_model
def get_model_patched(*, model_config, device_config, **kwargs): def get_model_patched(*, model_config, device_config, **kwargs):
...@@ -272,7 +258,7 @@ def llama_2_7b_engine_extra_embeddings(): ...@@ -272,7 +258,7 @@ def llama_2_7b_engine_extra_embeddings():
engine = vllm.LLM("meta-llama/Llama-2-7b-hf", enable_lora=False) engine = vllm.LLM("meta-llama/Llama-2-7b-hf", enable_lora=False)
yield engine.llm_engine yield engine.llm_engine
del engine del engine
cleanup() cleanup_dist_env_and_memory(shutdown_ray=True)
@pytest.fixture @pytest.fixture
......
...@@ -3,10 +3,9 @@ from typing import List ...@@ -3,10 +3,9 @@ from typing import List
import pytest import pytest
import vllm import vllm
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from .conftest import cleanup
MODEL_PATH = "baichuan-inc/Baichuan-7B" MODEL_PATH = "baichuan-inc/Baichuan-7B"
PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:""" # noqa: E501 PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. 
concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:""" # noqa: E501
...@@ -80,7 +79,7 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files, ...@@ -80,7 +79,7 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files,
output_tp1 = do_sample(llm_tp1, baichuan_lora_files, lora_id=1) output_tp1 = do_sample(llm_tp1, baichuan_lora_files, lora_id=1)
del llm_tp1 del llm_tp1
cleanup() cleanup_dist_env_and_memory()
llm_tp2 = vllm.LLM(MODEL_PATH, llm_tp2 = vllm.LLM(MODEL_PATH,
enable_lora=True, enable_lora=True,
...@@ -93,7 +92,7 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files, ...@@ -93,7 +92,7 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files,
output_tp2 = do_sample(llm_tp2, baichuan_lora_files, lora_id=2) output_tp2 = do_sample(llm_tp2, baichuan_lora_files, lora_id=2)
del llm_tp2 del llm_tp2
cleanup() cleanup_dist_env_and_memory()
assert output_tp1 == output_tp2 assert output_tp1 == output_tp2
...@@ -108,6 +107,6 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files, ...@@ -108,6 +107,6 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files,
output_tp4 = do_sample(llm_tp4, baichuan_lora_files, lora_id=2) output_tp4 = do_sample(llm_tp4, baichuan_lora_files, lora_id=2)
del llm_tp4 del llm_tp4
cleanup() cleanup_dist_env_and_memory()
assert output_tp1 == output_tp4 assert output_tp1 == output_tp4
...@@ -4,10 +4,9 @@ import pytest ...@@ -4,10 +4,9 @@ import pytest
import ray import ray
import vllm import vllm
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from .conftest import cleanup
MODEL_PATH = "meta-llama/Llama-2-7b-hf" MODEL_PATH = "meta-llama/Llama-2-7b-hf"
...@@ -93,7 +92,7 @@ def test_llama_tensor_parallel_equality(sql_lora_files, num_gpus_available): ...@@ -93,7 +92,7 @@ def test_llama_tensor_parallel_equality(sql_lora_files, num_gpus_available):
output_tp1 = do_sample(llm_tp1, sql_lora_files, lora_id=1) output_tp1 = do_sample(llm_tp1, sql_lora_files, lora_id=1)
del llm_tp1 del llm_tp1
cleanup() cleanup_dist_env_and_memory()
llm_tp2 = vllm.LLM(MODEL_PATH, llm_tp2 = vllm.LLM(MODEL_PATH,
enable_lora=True, enable_lora=True,
...@@ -103,7 +102,7 @@ def test_llama_tensor_parallel_equality(sql_lora_files, num_gpus_available): ...@@ -103,7 +102,7 @@ def test_llama_tensor_parallel_equality(sql_lora_files, num_gpus_available):
output_tp2 = do_sample(llm_tp2, sql_lora_files, lora_id=1) output_tp2 = do_sample(llm_tp2, sql_lora_files, lora_id=1)
del llm_tp2 del llm_tp2
cleanup() cleanup_dist_env_and_memory()
assert output_tp1 == output_tp2 assert output_tp1 == output_tp2
...@@ -115,7 +114,7 @@ def test_llama_tensor_parallel_equality(sql_lora_files, num_gpus_available): ...@@ -115,7 +114,7 @@ def test_llama_tensor_parallel_equality(sql_lora_files, num_gpus_available):
output_tp4 = do_sample(llm_tp4, sql_lora_files, lora_id=1) output_tp4 = do_sample(llm_tp4, sql_lora_files, lora_id=1)
del llm_tp4 del llm_tp4
cleanup() cleanup_dist_env_and_memory()
assert output_tp1 == output_tp4 assert output_tp1 == output_tp4
......
...@@ -28,9 +28,15 @@ sampling_params = SamplingParams( ...@@ -28,9 +28,15 @@ sampling_params = SamplingParams(
def _create_lora_request(lora_id, long_context_infos): def _create_lora_request(lora_id, long_context_infos):
context_len = long_context_infos[lora_id]["context_length"] context_len = long_context_infos[lora_id]["context_length"]
scaling_factor = context_len_to_scaling_factor[context_len] scaling_factor = context_len_to_scaling_factor[context_len]
return LoRARequest(context_len, lora_id, return LoRARequest(
long_context_infos[lora_id]["lora"], None, # There are 2 LoRAs for 16K, we need to add lora_id to indicate
4096 * scaling_factor) # they are different LoRAs.
context_len + str(lora_id),
lora_id,
long_context_infos[lora_id]["lora"],
None,
4096 * scaling_factor,
)
def evaluate_json_response(model_response, golden_response): def evaluate_json_response(model_response, golden_response):
...@@ -108,14 +114,17 @@ def lora_llm(long_context_infos): ...@@ -108,14 +114,17 @@ def lora_llm(long_context_infos):
for info in long_context_infos.values() for info in long_context_infos.values()
] ]
llm = vllm.LLM("meta-llama/Llama-2-13b-chat-hf", llm = vllm.LLM(
enable_lora=True, "meta-llama/Llama-2-13b-chat-hf",
max_num_seqs=16, enable_lora=True,
max_loras=2, max_num_seqs=16,
long_lora_scaling_factors=tuple(scaling_factors), max_loras=2,
max_num_batched_tokens=4096 * 8, long_lora_scaling_factors=tuple(scaling_factors),
tensor_parallel_size=4, max_num_batched_tokens=4096 * 8,
distributed_executor_backend="mp") tensor_parallel_size=4,
# FIXME enable async output processor
disable_async_output_proc=True,
distributed_executor_backend="mp")
yield llm yield llm
del llm del llm
......
...@@ -61,6 +61,7 @@ def test_minicpmv_lora(minicpmv_lora_files): ...@@ -61,6 +61,7 @@ def test_minicpmv_lora(minicpmv_lora_files):
max_loras=4, max_loras=4,
max_lora_rank=64, max_lora_rank=64,
trust_remote_code=True, trust_remote_code=True,
gpu_memory_utilization=0.97 # This model is pretty big for CI gpus
) )
output1 = do_sample(llm, minicpmv_lora_files, lora_id=1) output1 = do_sample(llm, minicpmv_lora_files, lora_id=1)
......
...@@ -6,11 +6,10 @@ from typing import List ...@@ -6,11 +6,10 @@ from typing import List
import pytest import pytest
import vllm import vllm
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.utils import is_hip from vllm.utils import is_hip
from .conftest import cleanup
@dataclass @dataclass
class ModelWithQuantization: class ModelWithQuantization:
...@@ -160,7 +159,7 @@ def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model, ...@@ -160,7 +159,7 @@ def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model,
print("removing lora") print("removing lora")
del llm del llm
cleanup() cleanup_dist_env_and_memory()
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
...@@ -181,7 +180,7 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available, ...@@ -181,7 +180,7 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available,
output_tp1 = do_sample(llm_tp1, tinyllama_lora_files, lora_id=1) output_tp1 = do_sample(llm_tp1, tinyllama_lora_files, lora_id=1)
del llm_tp1 del llm_tp1
cleanup() cleanup_dist_env_and_memory()
llm_tp2 = vllm.LLM( llm_tp2 = vllm.LLM(
model=model.model_path, model=model.model_path,
...@@ -194,6 +193,6 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available, ...@@ -194,6 +193,6 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available,
output_tp2 = do_sample(llm_tp2, tinyllama_lora_files, lora_id=1) output_tp2 = do_sample(llm_tp2, tinyllama_lora_files, lora_id=1)
del llm_tp2 del llm_tp2
cleanup() cleanup_dist_env_and_memory()
assert output_tp1 == output_tp2 assert output_tp1 == output_tp2
...@@ -15,7 +15,8 @@ def test_worker_apply_lora(sql_lora_files): ...@@ -15,7 +15,8 @@ def test_worker_apply_lora(sql_lora_files):
worker = Worker( worker = Worker(
model_config=ModelConfig( model_config=ModelConfig(
"meta-llama/Llama-2-7b-hf", "meta-llama/Llama-2-7b-hf",
"meta-llama/Llama-2-7b-hf", task="auto",
tokenizer="meta-llama/Llama-2-7b-hf",
tokenizer_mode="auto", tokenizer_mode="auto",
trust_remote_code=False, trust_remote_code=False,
seed=0, seed=0,
...@@ -27,7 +28,7 @@ def test_worker_apply_lora(sql_lora_files): ...@@ -27,7 +28,7 @@ def test_worker_apply_lora(sql_lora_files):
load_format="dummy", load_format="dummy",
), ),
parallel_config=ParallelConfig(1, 1, False), parallel_config=ParallelConfig(1, 1, False),
scheduler_config=SchedulerConfig(32, 32, 32), scheduler_config=SchedulerConfig("generate", 32, 32, 32),
device_config=DeviceConfig("cuda"), device_config=DeviceConfig("cuda"),
cache_config=CacheConfig(block_size=16, cache_config=CacheConfig(block_size=16,
gpu_memory_utilization=1., gpu_memory_utilization=1.,
......
...@@ -6,13 +6,12 @@ import ray ...@@ -6,13 +6,12 @@ import ray
from prometheus_client import REGISTRY from prometheus_client import REGISTRY
from vllm import EngineArgs, LLMEngine from vllm import EngineArgs, LLMEngine
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.metrics import RayPrometheusStatLogger from vllm.engine.metrics import RayPrometheusStatLogger
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from ..conftest import cleanup
MODELS = [ MODELS = [
"facebook/opt-125m", "facebook/opt-125m",
] ]
...@@ -85,6 +84,45 @@ def test_metric_counter_generation_tokens( ...@@ -85,6 +84,45 @@ def test_metric_counter_generation_tokens(
f"metric: {metric_count!r}") f"metric: {metric_count!r}")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [128, 129])
@pytest.mark.parametrize("disable_async_output_proc", [True, False])
def test_metric_counter_generation_tokens_multi_step(
    vllm_runner,
    example_prompts,
    model: str,
    max_tokens: int,
    disable_async_output_proc: bool,
) -> None:
    """Compare the generation-token counter with the tokens actually produced.

    The engine is run with ``num_scheduler_steps=8``; the number of generated
    tokens is recomputed from the outputs and must stay within
    ``len(example_prompts) * num_scheduler_steps`` of the Prometheus counter.
    """
    num_scheduler_steps = 8
    runner_kwargs = dict(
        disable_log_stats=False,
        gpu_memory_utilization=0.4,
        num_scheduler_steps=num_scheduler_steps,
        disable_async_output_proc=disable_async_output_proc,
    )
    with vllm_runner(model, **runner_kwargs) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)

        llm = vllm_model.model
        tokenizer = llm.get_tokenizer()
        stat_logger = llm.llm_engine.stat_loggers['prometheus']
        token_counter = stat_logger.metrics.counter_generation_tokens
        metric_count = token_counter.labels(**stat_logger.labels)._value.get()

        vllm_generation_count = 0
        for idx, prompt in enumerate(example_prompts):
            output_ids, _ = vllm_outputs[idx]
            prompt_ids = tokenizer.encode(prompt)
            # output_ids holds prompt tokens followed by generated tokens;
            # only the generated part is counted here.
            vllm_generation_count += len(output_ids) - len(prompt_ids)

    # The multi-step scheduling will continue to execute forward even when
    # encountering EOS, leading to slightly imprecise metrics.
    allowed_drift = len(example_prompts) * num_scheduler_steps
    observed_drift = abs(vllm_generation_count - metric_count)
    assert observed_drift < allowed_drift, (
        f"generation token count: {vllm_generation_count!r}\n"
        f"metric: {metric_count!r}")
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float"]) @pytest.mark.parametrize("dtype", ["float"])
@pytest.mark.parametrize( @pytest.mark.parametrize(
...@@ -307,7 +345,7 @@ def test_metric_spec_decode_interval( ...@@ -307,7 +345,7 @@ def test_metric_spec_decode_interval(
finally: finally:
del engine del engine
cleanup() cleanup_dist_env_and_memory()
def assert_metrics(engine: LLMEngine, disable_log_stats: bool, def assert_metrics(engine: LLMEngine, disable_log_stats: bool,
......
import os
from typing import List
import pytest
from vllm.model_executor.custom_op import CustomOp
from vllm.model_executor.layers.activation import (GeluAndMul,
ReLUSquaredActivation,
SiluAndMul)
from vllm.model_executor.layers.layernorm import RMSNorm
# Subclass registered under its own op name ("relu3"), used only by tests.
@CustomOp.register("relu3")
class Relu3(ReLUSquaredActivation):
    """``ReLUSquaredActivation`` subclass registered as ``"relu3"``."""
@pytest.mark.parametrize(
    "env, torch_level, ops_enabled, default_on",
    [
        # Default values based on compile level
        ("", 0, [True] * 4, True),
        ("", 1, [True] * 4, True),
        ("", 2, [True] * 4, True),  # All by default
        ("", 3, [False] * 4, False),
        ("", 4, [False] * 4, False),  # None by default
        # Explicitly enabling/disabling
        #
        # Default: all
        #
        # All but SiluAndMul
        ("+rms_norm,-silu_and_mul", 0, [1, 0, 1, 1], True),
        # Only ReLU3
        ("none,-rms_norm,+relu3", 0, [0, 0, 0, 1], False),
        # All but SiluAndMul
        ("all,-silu_and_mul", 1, [1, 0, 1, 1], True),
        # All but ReLU3 (even if ReLU2 is on)
        ("-relu3,relu2", 1, [1, 1, 1, 0], True),
        # GeluAndMul and SiluAndMul
        ("none,-relu3,+gelu_and_mul,+silu_and_mul", 2, [0, 1, 1, 0], False),
        # All but RMSNorm
        ("-rms_norm", 2, [0, 1, 1, 1], True),
        #
        # Default: none
        #
        # Only ReLU3
        ("-silu_and_mul,+relu3", 3, [0, 0, 0, 1], False),
        # All but RMSNorm
        ("all,-rms_norm", 4, [0, 1, 1, 1], True),
    ])
def test_enabled_ops(env: str, torch_level: int, ops_enabled: List[int],
                     default_on: bool):
    """Check custom-op enablement for a given env string and compile level.

    Args:
        env: Value assigned to ``VLLM_CUSTOM_OPS`` for the duration of the test.
        torch_level: Value assigned (as a string) to
            ``VLLM_TORCH_COMPILE_LEVEL`` for the duration of the test.
        ops_enabled: Expected ``enabled()`` results, in the order
            ``rms_norm``, ``silu_and_mul``, ``gelu_and_mul``, ``relu3``.
        default_on: Expected result of ``CustomOp.default_on()``.
    """
    # Local import so the block does not depend on the file's import header.
    from unittest.mock import patch

    overrides = {
        "VLLM_CUSTOM_OPS": env,
        "VLLM_TORCH_COMPILE_LEVEL": str(torch_level),
    }
    # patch.dict puts os.environ back on exit, so these overrides no longer
    # leak into whichever test runs next.
    with patch.dict(os.environ, overrides):
        # Reset default_on (computed once):
        CustomOp.default_on.cache_clear()
        try:
            assert CustomOp.default_on() == default_on

            ops_enabled = [bool(x) for x in ops_enabled]

            assert RMSNorm(1024).enabled() == ops_enabled[0]
            assert CustomOp.op_registry["rms_norm"].enabled() == ops_enabled[0]

            assert SiluAndMul().enabled() == ops_enabled[1]
            assert (CustomOp.op_registry["silu_and_mul"].enabled() ==
                    ops_enabled[1])

            assert GeluAndMul().enabled() == ops_enabled[2]
            assert (CustomOp.op_registry["gelu_and_mul"].enabled() ==
                    ops_enabled[2])

            # If registered, subclasses should follow their own name
            assert Relu3().enabled() == ops_enabled[3]
            assert CustomOp.op_registry["relu3"].enabled() == ops_enabled[3]

            # Unregistered subclass
            class SiluAndMul2(SiluAndMul):
                pass

            # Subclasses should not require registration
            assert SiluAndMul2().enabled() == SiluAndMul().enabled()
        finally:
            # Drop the value cached under the patched environment so later
            # callers recompute it from the restored one.
            CustomOp.default_on.cache_clear()
@pytest.mark.parametrize(
    "env", ["all,none", "all,+rms_norm,all", "+rms_norm,-rms_norm"])
def test_enabled_ops_invalid(env: str):
    """Check that contradictory ``VLLM_CUSTOM_OPS`` values are rejected.

    Args:
        env: Value assigned to ``VLLM_CUSTOM_OPS``; each parametrized value
            must make ``RMSNorm(1024).enabled()`` raise ``AssertionError``.
    """
    # Local import so the block does not depend on the file's import header.
    from unittest.mock import patch

    # patch.dict restores os.environ on exit, so the invalid value does not
    # leak into later tests.
    with patch.dict(os.environ, {"VLLM_CUSTOM_OPS": env}):
        CustomOp.default_on.cache_clear()
        try:
            with pytest.raises(AssertionError):
                RMSNorm(1024).enabled()
        finally:
            # Discard anything cached while the invalid value was in effect.
            CustomOp.default_on.cache_clear()
...@@ -21,10 +21,14 @@ MODELS = [ ...@@ -21,10 +21,14 @@ MODELS = [
] ]
if not current_platform.is_cpu(): if not current_platform.is_cpu():
# MiniCPM requires fused_moe which is not supported by CPU MODELS += [
MODELS.append("openbmb/MiniCPM3-4B") # fused_moe which not supported on CPU
"openbmb/MiniCPM3-4B",
# Head size isn't supported on CPU
"h2oai/h2o-danube3-4b-base",
]
#TODO: remove this after CPU float16 support ready # TODO: remove this after CPU float16 support ready
target_dtype = "float" if current_platform.is_cpu() else "half" target_dtype = "float" if current_platform.is_cpu() else "half"
......
"""Compare the outputs of HF and vLLM when using greedy sampling.
This tests danube3 separately because its head size isn't supported on CPU yet.
Run `pytest tests/models/test_danube3_4b.py`.
"""
import pytest
from ...utils import check_outputs_equal
MODELS = ["h2oai/h2o-danube3-4b-base"]
target_dtype = "half"
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [32])
def test_models(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    """Greedy-decode the same prompts with HF and vLLM and compare outputs."""
    # Reference run through the HuggingFace runner.
    with hf_runner(model, dtype=dtype) as reference_model:
        hf_outputs = reference_model.generate_greedy(example_prompts,
                                                     max_tokens)

    # Same prompts and token budget through the vLLM runner.
    with vllm_runner(model, dtype=dtype) as candidate_model:
        vllm_outputs = candidate_model.generate_greedy(example_prompts,
                                                       max_tokens)

    comparison = dict(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )
    check_outputs_equal(**comparison)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", [target_dtype])
def test_model_print(
    vllm_runner,
    model: str,
    dtype: str,
) -> None:
    """Print the loaded model to verify its ``extra_repr`` renders."""
    with vllm_runner(model, dtype=dtype) as vllm_model:
        executor = vllm_model.model.llm_engine.model_executor
        loaded_model = executor.driver_worker.model_runner.model
        # Printing fails if any module's extra_repr cannot be rendered.
        print(loaded_model)
...@@ -10,7 +10,7 @@ from vllm.worker.model_runner import _get_graph_batch_size ...@@ -10,7 +10,7 @@ from vllm.worker.model_runner import _get_graph_batch_size
from ...utils import check_outputs_equal from ...utils import check_outputs_equal
MODELS = ["state-spaces/mamba-130m-hf"] MODELS = ["state-spaces/mamba-130m-hf", "tiiuae/falcon-mamba-tiny-dev"]
# Use lower-level interfaces to create this greedy generator, as mamba will # Use lower-level interfaces to create this greedy generator, as mamba will
......
...@@ -5,7 +5,7 @@ Run `pytest tests/models/test_phimoe.py`. ...@@ -5,7 +5,7 @@ Run `pytest tests/models/test_phimoe.py`.
import pytest import pytest
import torch import torch
from vllm.utils import is_cpu from vllm.platforms import current_platform
from ....utils import large_gpu_test from ....utils import large_gpu_test
from ...utils import check_logprobs_close from ...utils import check_logprobs_close
...@@ -70,7 +70,7 @@ def test_phimoe_routing_function(): ...@@ -70,7 +70,7 @@ def test_phimoe_routing_function():
assert torch.equal(topk_ids, ground_truth[test_id]["topk_ids"]) assert torch.equal(topk_ids, ground_truth[test_id]["topk_ids"])
@pytest.mark.skipif(condition=is_cpu(), @pytest.mark.skipif(condition=current_platform.is_cpu(),
reason="This test takes a lot time to run on CPU, " reason="This test takes a lot time to run on CPU, "
"and vllm CI's disk space is not enough for this model.") "and vllm CI's disk space is not enough for this model.")
@large_gpu_test(min_gb=80) @large_gpu_test(min_gb=80)
......
...@@ -3,8 +3,8 @@ from typing import List, Optional, Tuple, Type ...@@ -3,8 +3,8 @@ from typing import List, Optional, Tuple, Type
import pytest import pytest
from vllm.multimodal.utils import rescale_image_size from vllm.multimodal.utils import rescale_image_size
from vllm.platforms import current_platform
from vllm.sequence import SampleLogprobs from vllm.sequence import SampleLogprobs
from vllm.utils import is_cpu
from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
from ...utils import check_logprobs_close from ...utils import check_logprobs_close
...@@ -46,7 +46,7 @@ def run_test( ...@@ -46,7 +46,7 @@ def run_test(
All the image fixtures for the test are from IMAGE_ASSETS. All the image fixtures for the test are from IMAGE_ASSETS.
For huggingface runner, we provide the PIL images as input. For huggingface runner, we provide the PIL images as input.
For vllm runner, we provide MultiModalDataDict objects For vllm runner, we provide MultiModalDataDict objects
and corresponding MultiModalConfig as input. and corresponding MultiModalConfig as input.
Note, the text input is also adjusted to abide by vllm contract. Note, the text input is also adjusted to abide by vllm contract.
The text output is sanitized to be able to compare with hf. The text output is sanitized to be able to compare with hf.
...@@ -103,7 +103,7 @@ def run_test( ...@@ -103,7 +103,7 @@ def run_test(
target_dtype = "half" target_dtype = "half"
if is_cpu(): if current_platform.is_cpu():
target_dtype = "bfloat16" target_dtype = "bfloat16"
......
...@@ -6,7 +6,7 @@ import torch.nn as nn ...@@ -6,7 +6,7 @@ import torch.nn as nn
from huggingface_hub import snapshot_download from huggingface_hub import snapshot_download
from transformers import AutoConfig, AutoModel, CLIPImageProcessor from transformers import AutoConfig, AutoModel, CLIPImageProcessor
from ....conftest import _ImageAssets, cleanup from ....conftest import _ImageAssets
# we use snapshot_download to prevent conflicts between # we use snapshot_download to prevent conflicts between
# dynamic_module and trust_remote_code for hf_runner # dynamic_module and trust_remote_code for hf_runner
...@@ -45,12 +45,13 @@ def run_intern_vit_test( ...@@ -45,12 +45,13 @@ def run_intern_vit_test(
for pixel_value in pixel_values for pixel_value in pixel_values
] ]
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.model_executor.models.intern_vit import InternVisionModel from vllm.model_executor.models.intern_vit import InternVisionModel
vllm_model = InternVisionModel(config) vllm_model = InternVisionModel(config)
vllm_model.load_weights(hf_model.state_dict().items()) vllm_model.load_weights(hf_model.state_dict().items())
del hf_model del hf_model
cleanup() cleanup_dist_env_and_memory()
vllm_model = vllm_model.to("cuda", dtype) vllm_model = vllm_model.to("cuda", dtype)
vllm_outputs_per_image = [ vllm_outputs_per_image = [
...@@ -58,7 +59,7 @@ def run_intern_vit_test( ...@@ -58,7 +59,7 @@ def run_intern_vit_test(
for pixel_value in pixel_values for pixel_value in pixel_values
] ]
del vllm_model del vllm_model
cleanup() cleanup_dist_env_and_memory()
cos_similar = nn.CosineSimilarity(dim=-1) cos_similar = nn.CosineSimilarity(dim=-1)
for vllm_output, hf_output in zip(vllm_outputs_per_image, for vllm_output, hf_output in zip(vllm_outputs_per_image,
......
...@@ -7,7 +7,6 @@ from PIL.Image import Image ...@@ -7,7 +7,6 @@ from PIL.Image import Image
from transformers import AutoConfig from transformers import AutoConfig
from vllm.multimodal.utils import rescale_image_size from vllm.multimodal.utils import rescale_image_size
from vllm.utils import is_cpu
from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner, from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
_ImageAssets) _ImageAssets)
...@@ -19,15 +18,20 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ ...@@ -19,15 +18,20 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"cherry_blossom": "cherry_blossom":
"<|im_start|>User\n<image>\nWhat is the season?<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501 "<|im_start|>User\n<image>\nWhat is the season?<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
}) })
HF_MULTIIMAGE_IMAGE_PROMPT = "<|im_start|>User\nImage-1: <image>\nImage-2: <image>\nDescribe the two images in detail.<|im_end|>\n<|im_start|>Assistant\n" # noqa: E501 HF_MULTIIMAGE_IMAGE_PROMPT = "<|im_start|>User\nImage-1: <image>\nImage-2: <image>\nDescribe the two images in short.<|im_end|>\n<|im_start|>Assistant\n" # noqa: E501
models = [ models = [
"OpenGVLab/InternVL2-1B", "OpenGVLab/InternVL2-1B",
"OpenGVLab/InternVL2-2B", "OpenGVLab/InternVL2-2B",
# NOTE: Mono-InternVL-2B doesn't work with fp16,
# it will result NaN during inference.
# See: https://huggingface.co/OpenGVLab/Mono-InternVL-2B/discussions/9
"OpenGVLab/Mono-InternVL-2B",
# Broken due to outdated implementation of Phi-3 # Broken due to outdated implementation of Phi-3
# See: https://huggingface.co/OpenGVLab/InternVL2-4B/discussions/3 # See: https://huggingface.co/OpenGVLab/InternVL2-4B/discussions/3
# "OpenGVLab/InternVL2-4B", # "OpenGVLab/InternVL2-4B",
] ]
target_dtype = "bfloat16"
# adapted from https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py # adapted from https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py
...@@ -52,9 +56,15 @@ def generate( ...@@ -52,9 +56,15 @@ def generate(
input_embeds = input_embeds.reshape(B, N, C) input_embeds = input_embeds.reshape(B, N, C)
outputs = self.language_model.generate( forward_kwargs = dict(
inputs_embeds=input_embeds, inputs_embeds=input_embeds,
attention_mask=attention_mask, attention_mask=attention_mask,
)
if getattr(self, "use_visual_token_mask", False):
visual_token_mask = selected.reshape(B, N, 1).to(input_embeds.dtype)
forward_kwargs["visual_token_mask"] = visual_token_mask
outputs = self.language_model.generate(
**forward_kwargs,
**generate_kwargs, **generate_kwargs,
) )
...@@ -78,7 +88,7 @@ def run_test( ...@@ -78,7 +88,7 @@ def run_test(
All the image fixtures for the test are from IMAGE_ASSETS. All the image fixtures for the test are from IMAGE_ASSETS.
For huggingface runner, we provide the PIL images as input. For huggingface runner, we provide the PIL images as input.
For vllm runner, we provide MultiModalDataDict objects For vllm runner, we provide MultiModalDataDict objects
and corresponding MultiModalConfig as input. and corresponding MultiModalConfig as input.
Note, the text input is also adjusted to abide by vllm contract. Note, the text input is also adjusted to abide by vllm contract.
The text output is sanitized to be able to compare with hf. The text output is sanitized to be able to compare with hf.
...@@ -243,11 +253,6 @@ def run_awq_test( ...@@ -243,11 +253,6 @@ def run_awq_test(
) )
target_dtype = "half"
if is_cpu():
target_dtype = "bfloat16"
@pytest.mark.parametrize("model", models) @pytest.mark.parametrize("model", models)
@pytest.mark.parametrize( @pytest.mark.parametrize(
"size_factors", "size_factors",
......
...@@ -10,8 +10,9 @@ from vllm.inputs import InputContext, token_inputs ...@@ -10,8 +10,9 @@ from vllm.inputs import InputContext, token_inputs
from vllm.model_executor.models.phi3v import _IMAGE_TOKEN_ID from vllm.model_executor.models.phi3v import _IMAGE_TOKEN_ID
from vllm.multimodal import MultiModalRegistry from vllm.multimodal import MultiModalRegistry
from vllm.multimodal.utils import rescale_image_size from vllm.multimodal.utils import rescale_image_size
from vllm.platforms import current_platform
from vllm.sequence import SampleLogprobs from vllm.sequence import SampleLogprobs
from vllm.utils import is_cpu, is_hip from vllm.utils import is_hip
from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner, from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
_ImageAssets) _ImageAssets)
...@@ -49,7 +50,7 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str, ...@@ -49,7 +50,7 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
target_dtype = "half" target_dtype = "half"
if is_cpu(): if current_platform.is_cpu():
target_dtype = "bfloat16" target_dtype = "bfloat16"
# ROCm Triton FA can run into shared memory issues with these models, # ROCm Triton FA can run into shared memory issues with these models,
...@@ -89,6 +90,7 @@ def run_test( ...@@ -89,6 +90,7 @@ def run_test(
# max_model_len should be greater than image_feature_size # max_model_len should be greater than image_feature_size
with vllm_runner(model, with vllm_runner(model,
task="generate",
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
dtype=dtype, dtype=dtype,
......
...@@ -6,21 +6,31 @@ import pytest ...@@ -6,21 +6,31 @@ import pytest
from ..utils import check_embeddings_close from ..utils import check_embeddings_close
# Model, Guard
MODELS = [ MODELS = [
"intfloat/e5-mistral-7b-instruct", "intfloat/e5-mistral-7b-instruct",
"BAAI/bge-base-en-v1.5",
"BAAI/bge-multilingual-gemma2", "BAAI/bge-multilingual-gemma2",
] ]
ENCODER_ONLY = [
"BAAI/bge-base-en-v1.5",
]
@pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("dtype", ["half"])
def test_models( def test_models(
monkeypatch,
hf_runner, hf_runner,
vllm_runner, vllm_runner,
example_prompts, example_prompts,
model: str, model,
dtype: str, dtype: str,
) -> None: ) -> None:
if model in ENCODER_ONLY:
monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS")
# The example_prompts has ending "\n", for example: # The example_prompts has ending "\n", for example:
# "Write a short story about a robot that dreams for the first time.\n" # "Write a short story about a robot that dreams for the first time.\n"
# sentence_transformers will strip the input texts, see: # sentence_transformers will strip the input texts, see:
...@@ -33,7 +43,7 @@ def test_models( ...@@ -33,7 +43,7 @@ def test_models(
is_sentence_transformer=True) as hf_model: is_sentence_transformer=True) as hf_model:
hf_outputs = hf_model.encode(example_prompts) hf_outputs = hf_model.encode(example_prompts)
with vllm_runner(model, dtype=dtype) as vllm_model: with vllm_runner(model, dtype=dtype, max_model_len=None) as vllm_model:
vllm_outputs = vllm_model.encode(example_prompts) vllm_outputs = vllm_model.encode(example_prompts)
check_embeddings_close( check_embeddings_close(
......
...@@ -16,7 +16,8 @@ def check_embeddings_close( ...@@ -16,7 +16,8 @@ def check_embeddings_close(
for prompt_idx, (embeddings_0, embeddings_1) in enumerate( for prompt_idx, (embeddings_0, embeddings_1) in enumerate(
zip(embeddings_0_lst, embeddings_1_lst)): zip(embeddings_0_lst, embeddings_1_lst)):
assert len(embeddings_0) == len(embeddings_1) assert len(embeddings_0) == len(embeddings_1), (
f"Length mismatch: {len(embeddings_0)} vs. {len(embeddings_1)}")
sim = F.cosine_similarity(torch.tensor(embeddings_0), sim = F.cosine_similarity(torch.tensor(embeddings_0),
torch.tensor(embeddings_1), torch.tensor(embeddings_1),
......
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment