Merge remote-tracking branch 'mirror/main'

2216a4e5 · zhuwenwen · ad385667 · 51c24c97 · 2216a4e5 · 2216a4e5
Commit 2216a4e5 authored Oct 23, 2024 by zhuwenwen
20 changed files
--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
-import contextlib
-import gc
 import tempfile
 from collections import OrderedDict
 from typing import Dict, List, TypedDict
 from unittest.mock import MagicMock, patch

 import pytest
-import ray
 import torch
 import torch.nn as nn
 from huggingface_hub import snapshot_download

 import vllm
 from vllm.config import LoRAConfig
-from vllm.distributed import (destroy_distributed_environment,
-                              destroy_model_parallel,
+from vllm.distributed import (cleanup_dist_env_and_memory,
                              init_distributed_environment,
                              initialize_model_parallel)
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
@@ -48,16 +44,6 @@ LONG_LORA_INFOS: List[ContextIDInfo] = [{
 }]


-def cleanup():
-    destroy_model_parallel()
-    destroy_distributed_environment()
-    with contextlib.suppress(AssertionError):
-        torch.distributed.destroy_process_group()
-    gc.collect()
-    torch.cuda.empty_cache()
-    ray.shutdown()
-
-
 @pytest.fixture()
 def should_do_global_cleanup_after_test(request) -> bool:
    """Allow subdirectories to skip global cleanup by overriding this fixture.
@@ -72,7 +58,7 @@ def should_do_global_cleanup_after_test(request) -> bool:
 def cleanup_fixture(should_do_global_cleanup_after_test: bool):
    yield
    if should_do_global_cleanup_after_test:
-        cleanup()
+        cleanup_dist_env_and_memory(shutdown_ray=True)


 @pytest.fixture
@@ -87,7 +73,7 @@ def dist_init():
    )
    initialize_model_parallel(1, 1)
    yield
-    cleanup()
+    cleanup_dist_env_and_memory(shutdown_ray=True)


 @pytest.fixture
@@ -238,7 +224,7 @@ def long_context_lora_files_32k():
 def long_context_infos(long_context_lora_files_16k_1,
                       long_context_lora_files_16k_2,
                       long_context_lora_files_32k):
-    cleanup()
+    cleanup_dist_env_and_memory(shutdown_ray=True)
    infos: Dict[int, ContextInfo] = {}
    for lora_checkpoint_info in LONG_LORA_INFOS:
        lora_id = lora_checkpoint_info["lora_id"]
@@ -259,7 +245,7 @@ def long_context_infos(long_context_lora_files_16k_1,

 @pytest.fixture
 def llama_2_7b_engine_extra_embeddings():
-    cleanup()
+    cleanup_dist_env_and_memory(shutdown_ray=True)
    get_model_old = get_model

    def get_model_patched(*, model_config, device_config, **kwargs):
@@ -272,7 +258,7 @@ def llama_2_7b_engine_extra_embeddings():
        engine = vllm.LLM("meta-llama/Llama-2-7b-hf", enable_lora=False)
    yield engine.llm_engine
    del engine
-    cleanup()
+    cleanup_dist_env_and_memory(shutdown_ray=True)


 @pytest.fixture

--- a/tests/lora/test_baichuan.py
+++ b/tests/lora/test_baichuan.py
@@ -3,10 +3,9 @@ from typing import List
 import pytest

 import vllm
+from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.lora.request import LoRARequest

-from .conftest import cleanup
-
 MODEL_PATH = "baichuan-inc/Baichuan-7B"

 PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example database, you need only to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request.\n"\n##Instruction:\nconcert_singer contains tables such as stadium, singer, concert, singer_in_concert. Table stadium has columns such as Stadium_ID, Location, Name, Capacity, Highest, Lowest, Average. Stadium_ID is the primary key.\nTable singer has columns such as Singer_ID, Name, Country, Song_Name, Song_release_year, Age, Is_male. Singer_ID is the primary key.\nTable concert has columns such as concert_ID, concert_Name, Theme, Stadium_ID, Year. concert_ID is the primary key.\nTable singer_in_concert has columns such as concert_ID, Singer_ID. concert_ID is the primary key.\nThe Stadium_ID of concert is the foreign key of Stadium_ID of stadium.\nThe Singer_ID of singer_in_concert is the foreign key of Singer_ID of singer.\nThe concert_ID of singer_in_concert is the foreign key of concert_ID of concert.\n\n###Input:\n{query}\n\n###Response:"""  # noqa: E501
@@ -80,7 +79,7 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files,
    output_tp1 = do_sample(llm_tp1, baichuan_lora_files, lora_id=1)

    del llm_tp1
-    cleanup()
+    cleanup_dist_env_and_memory()

    llm_tp2 = vllm.LLM(MODEL_PATH,
                       enable_lora=True,
@@ -93,7 +92,7 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files,
    output_tp2 = do_sample(llm_tp2, baichuan_lora_files, lora_id=2)

    del llm_tp2
-    cleanup()
+    cleanup_dist_env_and_memory()

    assert output_tp1 == output_tp2

@@ -108,6 +107,6 @@ def test_baichuan_tensor_parallel_equality(baichuan_lora_files,
    output_tp4 = do_sample(llm_tp4, baichuan_lora_files, lora_id=2)

    del llm_tp4
-    cleanup()
+    cleanup_dist_env_and_memory()

    assert output_tp1 == output_tp4
--- a/tests/lora/test_llama.py
+++ b/tests/lora/test_llama.py
@@ -4,10 +4,9 @@ import pytest
 import ray

 import vllm
+from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.lora.request import LoRARequest

-from .conftest import cleanup
-
 MODEL_PATH = "meta-llama/Llama-2-7b-hf"


@@ -93,7 +92,7 @@ def test_llama_tensor_parallel_equality(sql_lora_files, num_gpus_available):
    output_tp1 = do_sample(llm_tp1, sql_lora_files, lora_id=1)

    del llm_tp1
-    cleanup()
+    cleanup_dist_env_and_memory()

    llm_tp2 = vllm.LLM(MODEL_PATH,
                       enable_lora=True,
@@ -103,7 +102,7 @@ def test_llama_tensor_parallel_equality(sql_lora_files, num_gpus_available):
    output_tp2 = do_sample(llm_tp2, sql_lora_files, lora_id=1)

    del llm_tp2
-    cleanup()
+    cleanup_dist_env_and_memory()

    assert output_tp1 == output_tp2

@@ -115,7 +114,7 @@ def test_llama_tensor_parallel_equality(sql_lora_files, num_gpus_available):
    output_tp4 = do_sample(llm_tp4, sql_lora_files, lora_id=1)

    del llm_tp4
-    cleanup()
+    cleanup_dist_env_and_memory()

    assert output_tp1 == output_tp4


--- a/tests/lora/test_long_context.py
+++ b/tests/lora/test_long_context.py
@@ -28,9 +28,15 @@ sampling_params = SamplingParams(
 def _create_lora_request(lora_id, long_context_infos):
    context_len = long_context_infos[lora_id]["context_length"]
    scaling_factor = context_len_to_scaling_factor[context_len]
-    return LoRARequest(context_len, lora_id,
-                       long_context_infos[lora_id]["lora"], None,
-                       4096 * scaling_factor)
+    return LoRARequest(
+        # There are 2 LoRAs for 16K, we need to add lora_id to indicate
+        # they are different LoRAs.
+        context_len + str(lora_id),
+        lora_id,
+        long_context_infos[lora_id]["lora"],
+        None,
+        4096 * scaling_factor,
+    )


 def evaluate_json_response(model_response, golden_response):
@@ -108,14 +114,17 @@ def lora_llm(long_context_infos):
        for info in long_context_infos.values()
    ]

-    llm = vllm.LLM("meta-llama/Llama-2-13b-chat-hf",
-                   enable_lora=True,
-                   max_num_seqs=16,
-                   max_loras=2,
-                   long_lora_scaling_factors=tuple(scaling_factors),
-                   max_num_batched_tokens=4096 * 8,
-                   tensor_parallel_size=4,
-                   distributed_executor_backend="mp")
+    llm = vllm.LLM(
+        "meta-llama/Llama-2-13b-chat-hf",
+        enable_lora=True,
+        max_num_seqs=16,
+        max_loras=2,
+        long_lora_scaling_factors=tuple(scaling_factors),
+        max_num_batched_tokens=4096 * 8,
+        tensor_parallel_size=4,
+        # FIXME enable async output processor
+        disable_async_output_proc=True,
+        distributed_executor_backend="mp")
    yield llm
    del llm


--- a/tests/lora/test_minicpmv.py
+++ b/tests/lora/test_minicpmv.py
@@ -61,6 +61,7 @@ def test_minicpmv_lora(minicpmv_lora_files):
        max_loras=4,
        max_lora_rank=64,
        trust_remote_code=True,
+        gpu_memory_utilization=0.97  # This model is pretty big for CI gpus
    )

    output1 = do_sample(llm, minicpmv_lora_files, lora_id=1)

--- a/tests/lora/test_quant_model.py
+++ b/tests/lora/test_quant_model.py
@@ -6,11 +6,10 @@ from typing import List
 import pytest

 import vllm
+from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.lora.request import LoRARequest
 from vllm.utils import is_hip

-from .conftest import cleanup
-

 @dataclass
 class ModelWithQuantization:
@@ -160,7 +159,7 @@ def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model,
    print("removing lora")

    del llm
-    cleanup()
+    cleanup_dist_env_and_memory()


 @pytest.mark.parametrize("model", MODELS)
@@ -181,7 +180,7 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available,
    output_tp1 = do_sample(llm_tp1, tinyllama_lora_files, lora_id=1)

    del llm_tp1
-    cleanup()
+    cleanup_dist_env_and_memory()

    llm_tp2 = vllm.LLM(
        model=model.model_path,
@@ -194,6 +193,6 @@ def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available,
    output_tp2 = do_sample(llm_tp2, tinyllama_lora_files, lora_id=1)

    del llm_tp2
-    cleanup()
+    cleanup_dist_env_and_memory()

    assert output_tp1 == output_tp2
--- a/tests/lora/test_worker.py
+++ b/tests/lora/test_worker.py
@@ -15,7 +15,8 @@ def test_worker_apply_lora(sql_lora_files):
    worker = Worker(
        model_config=ModelConfig(
            "meta-llama/Llama-2-7b-hf",
-            "meta-llama/Llama-2-7b-hf",
+            task="auto",
+            tokenizer="meta-llama/Llama-2-7b-hf",
            tokenizer_mode="auto",
            trust_remote_code=False,
            seed=0,
@@ -27,7 +28,7 @@ def test_worker_apply_lora(sql_lora_files):
            load_format="dummy",
        ),
        parallel_config=ParallelConfig(1, 1, False),
-        scheduler_config=SchedulerConfig(32, 32, 32),
+        scheduler_config=SchedulerConfig("generate", 32, 32, 32),
        device_config=DeviceConfig("cuda"),
        cache_config=CacheConfig(block_size=16,
                                 gpu_memory_utilization=1.,

--- a/tests/metrics/test_metrics.py
+++ b/tests/metrics/test_metrics.py
@@ -6,13 +6,12 @@ import ray
 from prometheus_client import REGISTRY

 from vllm import EngineArgs, LLMEngine
+from vllm.distributed import cleanup_dist_env_and_memory
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.engine.metrics import RayPrometheusStatLogger
 from vllm.sampling_params import SamplingParams

-from ..conftest import cleanup
-
 MODELS = [
    "facebook/opt-125m",
 ]
@@ -85,6 +84,45 @@ def test_metric_counter_generation_tokens(
        f"metric: {metric_count!r}")


+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("max_tokens", [128, 129])
+@pytest.mark.parametrize("disable_async_output_proc", [True, False])
+def test_metric_counter_generation_tokens_multi_step(
+    vllm_runner,
+    example_prompts,
+    model: str,
+    max_tokens: int,
+    disable_async_output_proc: bool,
+) -> None:
+    num_scheduler_steps = 8
+    with vllm_runner(
+            model,
+            disable_log_stats=False,
+            gpu_memory_utilization=0.4,
+            num_scheduler_steps=num_scheduler_steps,
+            disable_async_output_proc=disable_async_output_proc,
+    ) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+        tokenizer = vllm_model.model.get_tokenizer()
+        stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus']
+        metric_count = stat_logger.metrics.counter_generation_tokens.labels(
+            **stat_logger.labels)._value.get()
+        vllm_generation_count = 0
+        for i in range(len(example_prompts)):
+            vllm_output_ids, vllm_output_str = vllm_outputs[i]
+            prompt_ids = tokenizer.encode(example_prompts[i])
+            # vllm_output_ids contains both prompt tokens and generation tokens.
+            # We're interested only in the count of the generation tokens.
+            vllm_generation_count += len(vllm_output_ids) - len(prompt_ids)
+
+    # The multi-step scheduling will continue to execute forward even when
+    # encountering EOS, leading to slightly imprecise metrics.
+    assert abs(vllm_generation_count - metric_count) <\
+        len(example_prompts) * num_scheduler_steps, \
+        (f"generation token count: {vllm_generation_count!r}\n"
+         f"metric: {metric_count!r}")
+
+
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["float"])
 @pytest.mark.parametrize(
@@ -307,7 +345,7 @@ def test_metric_spec_decode_interval(

    finally:
        del engine
-        cleanup()
+        cleanup_dist_env_and_memory()


 def assert_metrics(engine: LLMEngine, disable_log_stats: bool,

--- a/tests/model_executor/test_enabled_custom_ops.py
+++ b/tests/model_executor/test_enabled_custom_ops.py
+import os
+from typing import List
+
+import pytest
+
+from vllm.model_executor.custom_op import CustomOp
+from vllm.model_executor.layers.activation import (GeluAndMul,
+                                                   ReLUSquaredActivation,
+                                                   SiluAndMul)
+from vllm.model_executor.layers.layernorm import RMSNorm
+
+
+# Registered subclass for test
+@CustomOp.register("relu3")
+class Relu3(ReLUSquaredActivation):
+    pass
+
+
+@pytest.mark.parametrize(
+    "env, torch_level, ops_enabled, default_on",
+    [
+        # Default values based on compile level
+        ("", 0, [True] * 4, True),
+        ("", 1, [True] * 4, True),
+        ("", 2, [True] * 4, True),  # All by default
+        ("", 3, [False] * 4, False),
+        ("", 4, [False] * 4, False),  # None by default
+        # Explicitly enabling/disabling
+        #
+        # Default: all
+        #
+        # All but SiluAndMul
+        ("+rms_norm,-silu_and_mul", 0, [1, 0, 1, 1], True),
+        # Only ReLU3
+        ("none,-rms_norm,+relu3", 0, [0, 0, 0, 1], False),
+        # All but SiluAndMul
+        ("all,-silu_and_mul", 1, [1, 0, 1, 1], True),
+        # All but ReLU3 (even if ReLU2 is on)
+        ("-relu3,relu2", 1, [1, 1, 1, 0], True),
+        # GeluAndMul and SiluAndMul
+        ("none,-relu3,+gelu_and_mul,+silu_and_mul", 2, [0, 1, 1, 0], False),
+        # All but RMSNorm
+        ("-rms_norm", 2, [0, 1, 1, 1], True),
+        #
+        # Default: none
+        #
+        # Only ReLU3
+        ("-silu_and_mul,+relu3", 3, [0, 0, 0, 1], False),
+        # All but RMSNorm
+        ("all,-rms_norm", 4, [0, 1, 1, 1], True),
+    ])
+def test_enabled_ops(env: str, torch_level: int, ops_enabled: List[int],
+                     default_on: bool):
+    os.environ["VLLM_CUSTOM_OPS"] = env
+    os.environ["VLLM_TORCH_COMPILE_LEVEL"] = str(torch_level)
+
+    # Reset default_on (computed once):
+    CustomOp.default_on.cache_clear()
+
+    assert CustomOp.default_on() == default_on
+
+    ops_enabled = [bool(x) for x in ops_enabled]
+
+    assert RMSNorm(1024).enabled() == ops_enabled[0]
+    assert CustomOp.op_registry["rms_norm"].enabled() == ops_enabled[0]
+
+    assert SiluAndMul().enabled() == ops_enabled[1]
+    assert CustomOp.op_registry["silu_and_mul"].enabled() == ops_enabled[1]
+
+    assert GeluAndMul().enabled() == ops_enabled[2]
+    assert CustomOp.op_registry["gelu_and_mul"].enabled() == ops_enabled[2]
+
+    # If registered, subclasses should follow their own name
+    assert Relu3().enabled() == ops_enabled[3]
+    assert CustomOp.op_registry["relu3"].enabled() == ops_enabled[3]
+
+    # Unregistered subclass
+    class SiluAndMul2(SiluAndMul):
+        pass
+
+    # Subclasses should not require registration
+    assert SiluAndMul2().enabled() == SiluAndMul().enabled()
+
+
+@pytest.mark.parametrize(
+    "env", ["all,none", "all,+rms_norm,all", "+rms_norm,-rms_norm"])
+def test_enabled_ops_invalid(env: str):
+    os.environ["VLLM_CUSTOM_OPS"] = env
+    CustomOp.default_on.cache_clear()
+
+    with pytest.raises(AssertionError):
+        RMSNorm(1024).enabled()
--- a/tests/models/decoder_only/language/test_big_models.py
+++ b/tests/models/decoder_only/language/test_big_models.py
@@ -21,10 +21,14 @@ MODELS = [
 ]

 if not current_platform.is_cpu():
-    # MiniCPM requires fused_moe which is not supported by CPU
-    MODELS.append("openbmb/MiniCPM3-4B")
+    MODELS += [
+        # MiniCPM requires fused_moe, which is not supported on CPU
+        "openbmb/MiniCPM3-4B",
+        # Head size isn't supported on CPU
+        "h2oai/h2o-danube3-4b-base",
+    ]

-#TODO: remove this after CPU float16 support ready
+# TODO: remove this after CPU float16 support ready
 target_dtype = "float" if current_platform.is_cpu() else "half"



--- a/tests/models/decoder_only/language/test_danube3_4b.py
+++ b/tests/models/decoder_only/language/test_danube3_4b.py
-"""Compare the outputs of HF and vLLM when using greedy sampling.
-
-This tests danube3 separately because its head size isn't supported on CPU yet.
-
-Run `pytest tests/models/test_danube3_4b.py`.
-"""
-import pytest
-
-from ...utils import check_outputs_equal
-
-MODELS = ["h2oai/h2o-danube3-4b-base"]
-
-target_dtype = "half"
-
-
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", [target_dtype])
-@pytest.mark.parametrize("max_tokens", [32])
-def test_models(
-    hf_runner,
-    vllm_runner,
-    example_prompts,
-    model: str,
-    dtype: str,
-    max_tokens: int,
-) -> None:
-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-
-    with vllm_runner(model, dtype=dtype) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-
-    check_outputs_equal(
-        outputs_0_lst=hf_outputs,
-        outputs_1_lst=vllm_outputs,
-        name_0="hf",
-        name_1="vllm",
-    )
-
-
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", [target_dtype])
-def test_model_print(
-    vllm_runner,
-    model: str,
-    dtype: str,
-) -> None:
-    with vllm_runner(model, dtype=dtype) as vllm_model:
-        # This test is for verifying whether the model's extra_repr
-        # can be printed correctly.
-        print(vllm_model.model.llm_engine.model_executor.driver_worker.
-              model_runner.model)
--- a/tests/models/decoder_only/language/test_mamba.py
+++ b/tests/models/decoder_only/language/test_mamba.py
@@ -10,7 +10,7 @@ from vllm.worker.model_runner import _get_graph_batch_size

 from ...utils import check_outputs_equal

-MODELS = ["state-spaces/mamba-130m-hf"]
+MODELS = ["state-spaces/mamba-130m-hf", "tiiuae/falcon-mamba-tiny-dev"]


 # Use lower-level interfaces to create this greedy generator, as mamba will

--- a/tests/models/decoder_only/language/test_phimoe.py
+++ b/tests/models/decoder_only/language/test_phimoe.py
@@ -5,7 +5,7 @@ Run `pytest tests/models/test_phimoe.py`.
 import pytest
 import torch

-from vllm.utils import is_cpu
+from vllm.platforms import current_platform

 from ....utils import large_gpu_test
 from ...utils import check_logprobs_close
@@ -70,7 +70,7 @@ def test_phimoe_routing_function():
        assert torch.equal(topk_ids, ground_truth[test_id]["topk_ids"])


-@pytest.mark.skipif(condition=is_cpu(),
+@pytest.mark.skipif(condition=current_platform.is_cpu(),
                    reason="This test takes a lot time to run on CPU, "
                    "and vllm CI's disk space is not enough for this model.")
 @large_gpu_test(min_gb=80)

--- a/tests/models/decoder_only/vision_language/test_fuyu.py
+++ b/tests/models/decoder_only/vision_language/test_fuyu.py
@@ -3,8 +3,8 @@ from typing import List, Optional, Tuple, Type
 import pytest

 from vllm.multimodal.utils import rescale_image_size
+from vllm.platforms import current_platform
 from vllm.sequence import SampleLogprobs
-from vllm.utils import is_cpu

 from ....conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
 from ...utils import check_logprobs_close
@@ -46,7 +46,7 @@ def run_test(

    All the image fixtures for the test are from IMAGE_ASSETS.
    For huggingface runner, we provide the PIL images as input.
-    For vllm runner, we provide MultiModalDataDict objects 
+    For vllm runner, we provide MultiModalDataDict objects
    and corresponding MultiModalConfig as input.
    Note, the text input is also adjusted to abide by vllm contract.
    The text output is sanitized to be able to compare with hf.
@@ -103,7 +103,7 @@ def run_test(


 target_dtype = "half"
-if is_cpu():
+if current_platform.is_cpu():
    target_dtype = "bfloat16"



--- a/tests/models/decoder_only/vision_language/test_intern_vit.py
+++ b/tests/models/decoder_only/vision_language/test_intern_vit.py
@@ -6,7 +6,7 @@ import torch.nn as nn
 from huggingface_hub import snapshot_download
 from transformers import AutoConfig, AutoModel, CLIPImageProcessor

-from ....conftest import _ImageAssets, cleanup
+from ....conftest import _ImageAssets

 # we use snapshot_download to prevent conflicts between
 # dynamic_module and trust_remote_code for hf_runner
@@ -45,12 +45,13 @@ def run_intern_vit_test(
        for pixel_value in pixel_values
    ]

+    from vllm.distributed import cleanup_dist_env_and_memory
    from vllm.model_executor.models.intern_vit import InternVisionModel
    vllm_model = InternVisionModel(config)
    vllm_model.load_weights(hf_model.state_dict().items())

    del hf_model
-    cleanup()
+    cleanup_dist_env_and_memory()

    vllm_model = vllm_model.to("cuda", dtype)
    vllm_outputs_per_image = [
@@ -58,7 +59,7 @@ def run_intern_vit_test(
        for pixel_value in pixel_values
    ]
    del vllm_model
-    cleanup()
+    cleanup_dist_env_and_memory()

    cos_similar = nn.CosineSimilarity(dim=-1)
    for vllm_output, hf_output in zip(vllm_outputs_per_image,

--- a/tests/models/decoder_only/vision_language/test_internvl.py
+++ b/tests/models/decoder_only/vision_language/test_internvl.py
@@ -7,7 +7,6 @@ from PIL.Image import Image
 from transformers import AutoConfig

 from vllm.multimodal.utils import rescale_image_size
-from vllm.utils import is_cpu

 from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
                          _ImageAssets)
@@ -19,15 +18,20 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "cherry_blossom":
    "<|im_start|>User\n<image>\nWhat is the season?<|im_end|>\n<|im_start|>Assistant\n",  # noqa: E501
 })
-HF_MULTIIMAGE_IMAGE_PROMPT = "<|im_start|>User\nImage-1: <image>\nImage-2: <image>\nDescribe the two images in detail.<|im_end|>\n<|im_start|>Assistant\n"  # noqa: E501
+HF_MULTIIMAGE_IMAGE_PROMPT = "<|im_start|>User\nImage-1: <image>\nImage-2: <image>\nDescribe the two images in short.<|im_end|>\n<|im_start|>Assistant\n"  # noqa: E501

 models = [
    "OpenGVLab/InternVL2-1B",
    "OpenGVLab/InternVL2-2B",
+    # NOTE: Mono-InternVL-2B doesn't work with fp16;
+    # it results in NaN during inference.
+    # See: https://huggingface.co/OpenGVLab/Mono-InternVL-2B/discussions/9
+    "OpenGVLab/Mono-InternVL-2B",
    # Broken due to outdated implementation of Phi-3
    # See: https://huggingface.co/OpenGVLab/InternVL2-4B/discussions/3
    # "OpenGVLab/InternVL2-4B",
 ]
+target_dtype = "bfloat16"


 # adapted from https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py
@@ -52,9 +56,15 @@ def generate(

    input_embeds = input_embeds.reshape(B, N, C)

-    outputs = self.language_model.generate(
+    forward_kwargs = dict(
        inputs_embeds=input_embeds,
        attention_mask=attention_mask,
+    )
+    if getattr(self, "use_visual_token_mask", False):
+        visual_token_mask = selected.reshape(B, N, 1).to(input_embeds.dtype)
+        forward_kwargs["visual_token_mask"] = visual_token_mask
+    outputs = self.language_model.generate(
+        **forward_kwargs,
        **generate_kwargs,
    )

@@ -78,7 +88,7 @@ def run_test(

    All the image fixtures for the test are from IMAGE_ASSETS.
    For huggingface runner, we provide the PIL images as input.
-    For vllm runner, we provide MultiModalDataDict objects 
+    For vllm runner, we provide MultiModalDataDict objects
    and corresponding MultiModalConfig as input.
    Note, the text input is also adjusted to abide by vllm contract.
    The text output is sanitized to be able to compare with hf.
@@ -243,11 +253,6 @@ def run_awq_test(
        )


-target_dtype = "half"
-if is_cpu():
-    target_dtype = "bfloat16"
-
-
 @pytest.mark.parametrize("model", models)
 @pytest.mark.parametrize(
    "size_factors",

--- a/tests/models/decoder_only/vision_language/test_phi3v.py
+++ b/tests/models/decoder_only/vision_language/test_phi3v.py
@@ -10,8 +10,9 @@ from vllm.inputs import InputContext, token_inputs
 from vllm.model_executor.models.phi3v import _IMAGE_TOKEN_ID
 from vllm.multimodal import MultiModalRegistry
 from vllm.multimodal.utils import rescale_image_size
+from vllm.platforms import current_platform
 from vllm.sequence import SampleLogprobs
-from vllm.utils import is_cpu, is_hip
+from vllm.utils import is_hip

 from ....conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
                          _ImageAssets)
@@ -49,7 +50,7 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,


 target_dtype = "half"
-if is_cpu():
+if current_platform.is_cpu():
    target_dtype = "bfloat16"

 # ROCm Triton FA can run into shared memory issues with these models,
@@ -89,6 +90,7 @@ def run_test(

    # max_model_len should be greater than image_feature_size
    with vllm_runner(model,
+                     task="generate",
                     max_model_len=4096,
                     max_num_seqs=2,
                     dtype=dtype,

--- a/tests/models/embedding/language/test_embedding.py
+++ b/tests/models/embedding/language/test_embedding.py
@@ -6,21 +6,31 @@ import pytest

 from ..utils import check_embeddings_close

+# Model, Guard
 MODELS = [
    "intfloat/e5-mistral-7b-instruct",
+    "BAAI/bge-base-en-v1.5",
    "BAAI/bge-multilingual-gemma2",
 ]

+ENCODER_ONLY = [
+    "BAAI/bge-base-en-v1.5",
+]
+

 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half"])
 def test_models(
+    monkeypatch,
    hf_runner,
    vllm_runner,
    example_prompts,
-    model: str,
+    model,
    dtype: str,
 ) -> None:
+    if model in ENCODER_ONLY:
+        monkeypatch.setenv("VLLM_ATTENTION_BACKEND", "XFORMERS")
+
    # The example_prompts has ending "\n", for example:
    # "Write a short story about a robot that dreams for the first time.\n"
    # sentence_transformers will strip the input texts, see:
@@ -33,7 +43,7 @@ def test_models(
                   is_sentence_transformer=True) as hf_model:
        hf_outputs = hf_model.encode(example_prompts)

-    with vllm_runner(model, dtype=dtype) as vllm_model:
+    with vllm_runner(model, dtype=dtype, max_model_len=None) as vllm_model:
        vllm_outputs = vllm_model.encode(example_prompts)

    check_embeddings_close(

--- a/tests/models/embedding/utils.py
+++ b/tests/models/embedding/utils.py
@@ -16,7 +16,8 @@ def check_embeddings_close(

    for prompt_idx, (embeddings_0, embeddings_1) in enumerate(
            zip(embeddings_0_lst, embeddings_1_lst)):
-        assert len(embeddings_0) == len(embeddings_1)
+        assert len(embeddings_0) == len(embeddings_1), (
+            f"Length mismatch: {len(embeddings_0)} vs. {len(embeddings_1)}")

        sim = F.cosine_similarity(torch.tensor(embeddings_0),
                                  torch.tensor(embeddings_1),

--- a/tests/models/embedding/vision_language/test_llava_next.py
+++ b/tests/models/embedding/vision_language/test_llava_next.py
+from typing import List, Type
+
+import pytest
+import torch.nn.functional as F
+from transformers import AutoModelForVision2Seq
+
+from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
+from ....utils import large_gpu_test
+from ..utils import check_embeddings_close
+
+llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n'  # noqa: E501
+
+HF_TEXT_PROMPTS = [
+    # T -> X
+    llama3_template.format(
+        "The label of the object is stop sign\nSummary above sentence in one word: "  # noqa: E501
+    ),
+    # T -> X
+    llama3_template.format(
+        "cherry blossom\nSummary above sentence in one word: "),
+]
+
+HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
+    # I -> X
+    "stop_sign":
+    llama3_template.format("<image>\nSummary above image in one word: "),
+    # I -> X
+    "cherry_blossom":
+    llama3_template.format("<image>\nSummary above image in one word: "),
+})
+
+MODELS = ["royokong/e5-v"]
+
+
+def _run_test(
+    hf_runner: Type[HfRunner],
+    vllm_runner: Type[VllmRunner],
+    input_texts: List[str],
+    input_images: PromptImageInput,
+    model: str,
+    *,
+    dtype: str,
+) -> None:
+    # NOTE: The order matters: run vLLM first, and then run HF.
+    # vLLM needs a fresh new process without CUDA initialization.
+    # If we run HF first, CUDA will already be initialized, which
+    # hurts the multiprocessing backend with the fork method (the default).
+    with vllm_runner(model,
+                     task="embedding",
+                     dtype=dtype,
+                     max_model_len=4096,
+                     enforce_eager=True) as vllm_model:
+        vllm_outputs = vllm_model.encode(input_texts, images=input_images)
+
+    with hf_runner(model, dtype=dtype,
+                   auto_cls=AutoModelForVision2Seq) as hf_model:
+        # Patch the issue where image_token_id
+        # exceeds the maximum allowed vocab size
+        hf_model.model.resize_token_embeddings(
+            hf_model.model.language_model.vocab_size + 1)
+
+        all_inputs = hf_model.get_inputs(input_texts, images=input_images)
+
+        all_outputs = []
+        for inputs in all_inputs:
+            # Based on: https://huggingface.co/royokong/e5-v
+            outputs = hf_model.model(
+                **hf_model.wrap_device(inputs,
+                                       device=hf_model.model.device.type),
+                return_dict=True,
+                output_hidden_states=True,
+            )
+            pooled_output = F.normalize(outputs.hidden_states[-1][0, -1, :],
+                                        dim=-1)
+
+            all_outputs.append(pooled_output.tolist())
+
+        hf_outputs = all_outputs
+
+    check_embeddings_close(
+        embeddings_0_lst=hf_outputs,
+        embeddings_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+def test_models_text(
+    hf_runner,
+    vllm_runner,
+    image_assets,
+    model: str,
+    dtype: str,
+) -> None:
+    input_texts_images = [(text, None) for text in HF_TEXT_PROMPTS]
+    input_texts = [text for text, _ in input_texts_images]
+    input_images = [image for _, image in input_texts_images]
+
+    _run_test(
+        hf_runner,
+        vllm_runner,
+        input_texts,
+        input_images,  # type: ignore
+        model,
+        dtype=dtype,
+    )
+
+
+@large_gpu_test(min_gb=48)
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half"])
+def test_models_image(
+    hf_runner,
+    vllm_runner,
+    image_assets,
+    model: str,
+    dtype: str,
+) -> None:
+    input_texts_images = [
+        (text, asset.pil_image)
+        for text, asset in zip(HF_IMAGE_PROMPTS, image_assets)
+    ]
+    input_texts = [text for text, _ in input_texts_images]
+    input_images = [image for _, image in input_texts_images]
+
+    _run_test(
+        hf_runner,
+        vllm_runner,
+        input_texts,
+        input_images,
+        model,
+        dtype=dtype,
+    )