Merge tag 'v0.6.3.post1' into v0.6.3.post1-dev

6d2051cc · zhuwenwen · 2c7f740a · a2c71c54 · 6d2051cc · 6d2051cc
Commit 6d2051cc authored Oct 21, 2024 by zhuwenwen
20 changed files
--- a/tests/lora/test_mixtral.py
+++ b/tests/lora/test_mixtral.py
@@ -9,12 +9,9 @@ from vllm.lora.request import LoRARequest
 MODEL_PATH = "mistralai/Mixtral-8x7B-Instruct-v0.1"
-def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
+def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int,
-    prompts = [
+              prompts: List[str]) -> List[str]:
-        "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nSpellForce 3 is a pretty bad game. The developer Grimlore Games is clearly a bunch of no-talent hacks, and 2017 was a terrible year for games anyway. [/user] [assistant]",  # noqa: E501
-        "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nI wanted to like Grimlore Games' 2017 entry, but in SpellForce 3 they just didn't get anything right. [/user] [assistant]",  # noqa: E501
-        "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nBioShock is a good role-playing, action-adventure, shooter that released for PlayStation, Xbox, and PC in 2007. It is available on Steam, and it has a Mac release but not a Linux release. [/user] [assistant]",  # noqa: E501
-    ]
    sampling_params = vllm.SamplingParams(temperature=0, max_tokens=256)
    outputs = llm.generate(
        prompts,
@@ -33,22 +30,71 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]:
 @pytest.mark.parametrize("tp_size", [4])
 def test_mixtral_lora(mixtral_lora_files, tp_size):
+    """Original test, the LoRA model has the common target modules, not all"""
    if torch.cuda.device_count() < tp_size:
        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
-    llm = vllm.LLM(MODEL_PATH,
+    prompts = [
-                   enable_lora=True,
+        "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nSpellForce 3 is a pretty bad game. The developer Grimlore Games is clearly a bunch of no-talent hacks, and 2017 was a terrible year for games anyway. [/user] [assistant]",  # noqa: E501
-                   max_num_seqs=16,
+        "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nI wanted to like Grimlore Games' 2017 entry, but in SpellForce 3 they just didn't get anything right. [/user] [assistant]",  # noqa: E501
-                   max_loras=4,
+        "[system] Given a target sentence construct the underlying meaning representation\nof the input sentence as a single function with attributes and attribute\nvalues. This function should describe the target string accurately and the\nfunction must be one of the following ['inform', 'request', 'give_opinion',\n'confirm', 'verify_attribute', 'suggest', 'request_explanation',\n'recommend', 'request_attribute'].\n\nThe attributes must be one of the following:\n['name', 'exp_release_date', 'release_year', 'developer', 'esrb', 'rating',\n'genres', 'player_perspective', 'has_multiplayer', 'platforms',\n'available_on_steam', 'has_linux_release', 'has_mac_release', 'specifier'] [/system] [user] Here is the target sentence:\nBioShock is a good role-playing, action-adventure, shooter that released for PlayStation, Xbox, and PC in 2007. It is available on Steam, and it has a Mac release but not a Linux release. [/user] [assistant]",  # noqa: E501
-                   distributed_executor_backend="ray",
+    ]
-                   tensor_parallel_size=tp_size)
+    llm = vllm.LLM(
+        MODEL_PATH,
+        enable_lora=True,
+        max_num_seqs=16,
+        max_loras=4,
+        distributed_executor_backend="ray",
+        tensor_parallel_size=tp_size,
+    )
    expected_lora_output = [
        "give_opinion(name[SpellForce 3], release_year[2017], developer[Grimlore Games], rating[poor])",  # noqa: E501
        "give_opinion(name[SpellForce 3], developer[Grimlore Games], release_year[2017], rating[poor])",  # noqa: E501
        "inform(name[BioShock], release_year[2007], rating[good], genres[action-adventure, role-playing, shooter], platforms[PlayStation, Xbox, PC], available_on_steam[yes], has_linux_release[no], has_mac_release[yes])",  # noqa: E501
    ]
-    assert do_sample(llm, mixtral_lora_files,
+    assert do_sample(llm, mixtral_lora_files, lora_id=1,
-                     lora_id=1) == expected_lora_output
+                     prompts=prompts) == expected_lora_output
-    assert do_sample(llm, mixtral_lora_files,
+    assert do_sample(llm, mixtral_lora_files, lora_id=2,
-                     lora_id=2) == expected_lora_output
+                     prompts=prompts) == expected_lora_output
+@pytest.mark.parametrize("tp_size", [4])
+def test_mixtral_lora_all_target_modules(mixtral_lora_files_all_target_modules,
+                                         tp_size):
+    """This LoRA model has all supported Mixtral target modules"""
+    if torch.cuda.device_count() < tp_size:
+        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
+    prompts = [
+        "Question: What happens if you touch the eyes of a blind man?\nChoices:\nB: The man's sight will be restored.\nA: Nothing happens if you touch the eyes of a blind man.\nC: The man will get his eyesight back.\nD: The man will no longer be blind.\nAnswer:",  # noqa: E501
+        "Question: Which will increase the energy of motion of water molecules?\nChoices:\nA: add heat\nB: add ice\nC: remove some of the water\nD: place the water in the freezer\nAnswer:",  # noqa: E501
+        "Since Craig threw aluminum cans in the trash and Benjamin recycled, _ was environmentally irresponsible.\nChoices:\n1: Craig\n2: Benjamin\nAnswer:",  # noqa: E501
+    ]
+    llm = vllm.LLM(
+        MODEL_PATH,
+        enable_lora=True,
+        max_num_seqs=16,
+        max_loras=4,
+        distributed_executor_backend="ray",
+        tensor_parallel_size=tp_size,
+        max_lora_rank=32,
+    )
+    expected_lora_output = [
+        "A: Nothing happens if you touch the eyes of a blind man.",
+        "A: add heat",
+        "1: Craig",
+    ]
+    assert do_sample(llm,
+                     mixtral_lora_files_all_target_modules,
+                     lora_id=1,
+                     prompts=prompts) == expected_lora_output
+    assert do_sample(llm,
+                     mixtral_lora_files_all_target_modules,
+                     lora_id=2,
+                     prompts=prompts) == expected_lora_output
--- a/tests/lora/test_quant_model.py
+++ b/tests/lora/test_quant_model.py
@@ -71,10 +71,10 @@ def do_sample(llm: vllm.LLM,
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("tp_size", [1])
-def test_quant_model_lora(tinyllama_lora_files, model, tp_size):
+def test_quant_model_lora(tinyllama_lora_files, num_gpus_available, model,
-    # Cannot use as it will initialize torch.cuda too early...
+                          tp_size):
-    # if torch.cuda.device_count() < tp_size:
+    if num_gpus_available < tp_size:
-    #     pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
+        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
    llm = vllm.LLM(
        model=model.model_path,
@@ -164,11 +164,10 @@ def test_quant_model_lora(tinyllama_lora_files, model, tp_size):
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.skip("Requires multiple GPUs")
+def test_quant_model_tp_equality(tinyllama_lora_files, num_gpus_available,
-def test_quant_model_tp_equality(tinyllama_lora_files, model):
+                                 model):
-    # Cannot use as it will initialize torch.cuda too early...
+    if num_gpus_available < 2:
-    # if torch.cuda.device_count() < 2:
+        pytest.skip(f"Not enough GPUs for tensor parallelism {2}")
-    #     pytest.skip(f"Not enough GPUs for tensor parallelism {2}")
    llm_tp1 = vllm.LLM(
        model=model.model_path,

--- a/tests/lora/test_tokenizer_group.py
+++ b/tests/lora/test_tokenizer_group.py
@@ -41,7 +41,7 @@ async def test_tokenizer_group_lora(sql_lora_files, tokenizer_group_type):
            lora_request)
-def test_get_lora_tokenizer(sql_lora_files, tmpdir):
+def test_get_lora_tokenizer(sql_lora_files, tmp_path):
    lora_request = None
    tokenizer = get_lora_tokenizer(lora_request)
    assert not tokenizer
@@ -50,6 +50,6 @@ def test_get_lora_tokenizer(sql_lora_files, tmpdir):
    tokenizer = get_lora_tokenizer(lora_request)
    assert tokenizer.get_added_vocab()
-    lora_request = LoRARequest("1", 1, str(tmpdir))
+    lora_request = LoRARequest("1", 1, str(tmp_path))
    tokenizer = get_lora_tokenizer(lora_request)
    assert not tokenizer
--- a/tests/metrics/test_metrics.py
+++ b/tests/metrics/test_metrics.py
@@ -185,13 +185,14 @@ def test_metric_spec_decode(
 ) -> None:
    k = 5
-    with vllm_runner(model,
+    with vllm_runner(
-                     dtype=dtype,
+            model,
-                     disable_log_stats=False,
+            dtype=dtype,
-                     gpu_memory_utilization=0.4,
+            disable_log_stats=False,
-                     speculative_model=model,
+            gpu_memory_utilization=0.4,
-                     num_speculative_tokens=k,
+            speculative_model=model,
-                     use_v2_block_manager=True) as vllm_model:
+            num_speculative_tokens=k,
+    ) as vllm_model:
        # Force log interval to be 0 to catch all metrics.
        stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus']
@@ -242,7 +243,6 @@ def test_metric_spec_decode_interval(
                             gpu_memory_utilization=0.4,
                             speculative_model=model,
                             num_speculative_tokens=k,
-                             use_v2_block_manager=True,
                             enforce_eager=True)
    engine = LLMEngine.from_engine_args(engine_args)
@@ -326,7 +326,6 @@ def assert_metrics(engine: LLMEngine, disable_log_stats: bool,
            "vllm:e2e_request_latency_seconds",
            "vllm:request_prompt_tokens",
            "vllm:request_generation_tokens",
-            "vllm:request_params_best_of",
            "vllm:request_params_n",
        ]
        for metric_name in request_histogram_metrics:

--- a/tests/model_executor/conftest.py
+++ b/tests/model_executor/conftest.py
+import pytest
+@pytest.fixture
+def sample_regex():
+    return (r"((25[0-5]|(2[0-4]|1\d|[1-9]|)\d)\.){3}"
+            r"(25[0-5]|(2[0-4]|1\d|[1-9]|)\d)")
+@pytest.fixture
+def sample_json_schema():
+    return {
+        "type": "object",
+        "properties": {
+            "name": {
+                "type": "string"
+            },
+            "age": {
+                "type": "integer"
+            },
+            "skills": {
+                "type": "array",
+                "items": {
+                    "type": "string",
+                    "maxLength": 10
+                },
+                "minItems": 3
+            },
+            "work_history": {
+                "type": "array",
+                "items": {
+                    "type": "object",
+                    "properties": {
+                        "company": {
+                            "type": "string"
+                        },
+                        "duration": {
+                            "type": "number"
+                        },
+                        "position": {
+                            "type": "string"
+                        }
+                    },
+                    "required": ["company", "position"]
+                }
+            }
+        },
+        "required": ["name", "age", "skills", "work_history"]
+    }
--- a/tests/entrypoints/openai/test_guided_processors.py
+++ b/tests/entrypoints/openai/test_guided_processors.py
-# This unit test should be moved to a new
-# tests/test_guided_decoding directory.
 import pytest
 import torch
 from transformers import AutoTokenizer
-from vllm.entrypoints.openai.protocol import CompletionRequest
 from vllm.model_executor.guided_decoding import (
    get_guided_decoding_logits_processor)
 from vllm.model_executor.guided_decoding.outlines_logits_processors import (
    JSONLogitsProcessor, RegexLogitsProcessor)
+from vllm.sampling_params import GuidedDecodingParams
 def test_guided_logits_processors(sample_regex, sample_json_schema):
@@ -44,11 +42,9 @@ async def test_guided_logits_processor_black_box(backend: str, sample_regex,
    tokenizer = AutoTokenizer.from_pretrained('HuggingFaceH4/zephyr-7b-beta')
    token_ids = tokenizer.encode(
        f"Give an example IPv4 address with this regex: {sample_regex}")
-    regex_request = CompletionRequest(model='test',
+    regex_request = GuidedDecodingParams(regex=sample_regex, backend=backend)
-                                      prompt=token_ids,
-                                      guided_regex=sample_regex)
    regex_lp = await get_guided_decoding_logits_processor(
-        backend, regex_request, tokenizer)
+        regex_request, tokenizer)
    assert regex_lp is not None
    tensor = torch.rand(32000)
    original_tensor = torch.clone(tensor)
@@ -59,14 +55,31 @@ async def test_guided_logits_processor_black_box(backend: str, sample_regex,
    token_ids = tokenizer.encode(
        f"Give an employee profile that fits this schema: {sample_json_schema}"
    )
-    json_request = CompletionRequest(model='test',
+    json_request = GuidedDecodingParams(json=sample_json_schema,
-                                     prompt=token_ids,
+                                        backend=backend)
-                                     guided_json=sample_json_schema)
    json_lp = await get_guided_decoding_logits_processor(
-        backend, json_request, tokenizer)
+        json_request, tokenizer)
    assert json_lp is not None
    tensor = torch.rand(32000)
    original_tensor = torch.clone(tensor)
    tensor = json_lp(token_ids, tensor)
    assert tensor.shape == original_tensor.shape
    assert not torch.allclose(tensor, original_tensor)
+def test_multiple_guided_options_not_allowed(sample_json_schema, sample_regex):
+    with pytest.raises(ValueError,
+                       match="You can only use one kind of guided"):
+        GuidedDecodingParams(json=sample_json_schema, regex=sample_regex)
+    with pytest.raises(ValueError,
+                       match="You can only use one kind of guided"):
+        GuidedDecodingParams(json=sample_json_schema, json_object=True)
+    with pytest.raises(ValueError,
+                       match="You can only use one kind of guided"):
+        GuidedDecodingParams(json=sample_json_schema, choice=["a", "b"])
+    with pytest.raises(ValueError,
+                       match="You can only use one kind of guided"):
+        GuidedDecodingParams(json=sample_json_schema, grammar="test grammar")
--- a/tests/models/decoder_only/language/test_gguf.py
+++ b/tests/models/decoder_only/language/test_gguf.py
@@ -19,12 +19,12 @@ MAX_MODEL_LEN = 1024
 # FIXME: Move this to confest
 MODELS = [
-    ("TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    ("meta-llama/Llama-3.2-1B-Instruct",
-     hf_hub_download("TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
+     hf_hub_download("bartowski/Llama-3.2-1B-Instruct-GGUF",
-                     filename="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf")),
+                     filename="Llama-3.2-1B-Instruct-Q4_K_M.gguf")),
-    ("TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    ("meta-llama/Llama-3.2-1B-Instruct",
-     hf_hub_download("duyntnet/TinyLlama-1.1B-Chat-v1.0-imatrix-GGUF",
+     hf_hub_download("bartowski/Llama-3.2-1B-Instruct-GGUF",
-                     filename="TinyLlama-1.1B-Chat-v1.0-IQ4_XS.gguf")),
+                     filename="Llama-3.2-1B-Instruct-IQ4_XS.gguf")),
    ("Qwen/Qwen2-1.5B-Instruct",
     hf_hub_download("Qwen/Qwen2-1.5B-Instruct-GGUF",
                     filename="qwen2-1_5b-instruct-q4_k_m.gguf")),

--- a/tests/models/decoder_only/language/test_granite.py
+++ b/tests/models/decoder_only/language/test_granite.py
@@ -3,7 +3,6 @@
 Run `pytest tests/models/test_granite.py`.
 """
 import pytest
-import transformers
 from ...utils import check_logprobs_close
@@ -12,9 +11,6 @@ MODELS = [
 ]
-# GraniteForCausalLM will be in transformers >= 4.45
-@pytest.mark.skipif(transformers.__version__ < "4.45",
-                    reason="granite model test requires transformers >= 4.45")
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["bfloat16"])
 @pytest.mark.parametrize("max_tokens", [64])

--- a/tests/models/decoder_only/language/test_granitemoe.py
+++ b/tests/models/decoder_only/language/test_granitemoe.py
+"""Compare the outputs of HF and vLLM for Granite models using greedy sampling.
+Run `pytest tests/models/decoder_only/language/test_granitemoe.py`.
+"""
+import pytest
+from ...utils import check_logprobs_close
+MODELS = [
+    "ibm/PowerMoE-3b",
+]
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("max_tokens", [64])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_models(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy_logprobs_limit(
+            example_prompts, max_tokens, num_logprobs)
+    with vllm_runner(model, dtype=dtype) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs)
+    check_logprobs_close(
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )
--- a/tests/models/decoder_only/language/test_jamba.py
+++ b/tests/models/decoder_only/language/test_jamba.py
 import pytest
+from tests.utils import multi_gpu_test
+from vllm.sampling_params import SamplingParams
 from vllm.worker.model_runner import _get_graph_batch_size
 from ...utils import check_outputs_equal
-MODELS = ["ai21labs/Jamba-tiny-random"]
+MODELS = ["ai21labs/Jamba-tiny-dev"]
-# Fails due to usage of MoE as MLP(E=1_, which is different than the HF impl
-# TODO: Fix this with trained model
-@pytest.mark.skip()
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("dtype", ["float"])
-@pytest.mark.parametrize("max_tokens", [10])
+@pytest.mark.parametrize("max_tokens", [96])
 def test_models(
    hf_runner,
    vllm_runner,
@@ -22,7 +21,14 @@ def test_models(
    max_tokens: int,
 ) -> None:
-    with hf_runner(model, dtype=dtype) as hf_model:
+    with hf_runner(
+            model,
+            dtype=dtype,
+            model_kwargs={
+                "use_mamba_kernels":
+                False,  # mamba kernels are not installed so HF 
+                # must not try to use them
+            }) as hf_model:
        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
    with vllm_runner(model, dtype=dtype) as vllm_model:
@@ -38,8 +44,8 @@ def test_models(
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("dtype", ["float"])
-@pytest.mark.parametrize("max_tokens", [5])
+@pytest.mark.parametrize("max_tokens", [96])
 def test_batching(
    vllm_runner,
    example_prompts,
@@ -65,6 +71,107 @@ def test_batching(
    )
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["float16"])
+@pytest.mark.parametrize("max_tokens", [10])
+def test_mamba_prefill_chunking_with_parallel_sampling(
+        hf_runner, vllm_runner, example_prompts, model: str, dtype: str,
+        max_tokens: int) -> None:
+    # Tests prefill chunking in conjunction with n > 1. In this case the
+    # prefill is populated with decoding tokens, and we test that it
+    # doesn't fail. This test might fail if the cache is not allocated
+    # correctly for n > 1 decoding steps inside a chunked prefill
+    # forward pass (where we have both prefills and decoding together).
+    sampling_params = SamplingParams(n=3,
+                                     temperature=1,
+                                     seed=0,
+                                     max_tokens=max_tokens)
+    with vllm_runner(
+            model,
+            dtype=dtype,
+            enable_chunked_prefill=True,
+            max_num_batched_tokens=30,
+            max_num_seqs=10  # forces prefill chunks with decoding
+    ) as vllm_model:
+        vllm_model.generate(example_prompts, sampling_params)
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("max_tokens", [10])
+def test_mamba_prefill_chunking(hf_runner, vllm_runner, example_prompts,
+                                model: str, dtype: str,
+                                max_tokens: int) -> None:
+    # numeric error during prefill chunking produces different generation
+    # compared to w/o prefill chunking for those examples, so they are
+    # removed for now
+    example_prompts.pop(7)
+    example_prompts.pop(2)
+    example_prompts.pop(1)
+    with hf_runner(
+            model,
+            dtype=dtype,
+            model_kwargs={
+                "use_mamba_kernels":
+                False,  # mamba kernels are not installed so HF 
+                # must not try to use them
+            }) as hf_model:
+        non_chunked = hf_model.generate_greedy(example_prompts, max_tokens)
+    with vllm_runner(model,
+                     dtype=dtype,
+                     enable_chunked_prefill=True,
+                     max_num_batched_tokens=5,
+                     max_num_seqs=2) as vllm_model:
+        chunked = vllm_model.generate_greedy(example_prompts,
+                                             max_tokens=max_tokens)
+    check_outputs_equal(
+        outputs_0_lst=chunked,
+        outputs_1_lst=non_chunked,
+        name_0="chunked",
+        name_1="non_chunked",
+    )
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("max_tokens", [15])
+def test_parallel_sampling(
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+) -> None:
+    with vllm_runner(model, dtype=dtype) as vllm_model:
+        for_loop_outputs = []
+        for _ in range(10):
+            for_loop_outputs.append(
+                # using example_prompts index 1 instead of 0 since with 0 the
+                # logprobs get really close and the test doesn't pass
+                vllm_model.generate_greedy([example_prompts[1]], max_tokens)
+                [0])
+        sampling_params = SamplingParams(n=10,
+                                         temperature=0.001,
+                                         seed=0,
+                                         max_tokens=max_tokens)
+        n_lt_1_outputs = vllm_model.generate([example_prompts[1]],
+                                             sampling_params)
+    token_ids, texts = n_lt_1_outputs[0]
+    n_lt_1_outputs = [(token_id, text)
+                      for token_id, text in zip(token_ids, texts)]
+    check_outputs_equal(
+        outputs_0_lst=n_lt_1_outputs,
+        outputs_1_lst=for_loop_outputs,
+        name_0="vllm_n_lt_1_outputs",
+        name_1="vllm",
+    )
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["bfloat16"])
 @pytest.mark.parametrize("max_tokens", [20])
@@ -164,6 +271,30 @@ def test_state_cleanup(
                    "could be related to finished_requests_ids")
+@multi_gpu_test(num_gpus=2)
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("max_tokens", [64])
+def test_jamba_distributed_produces_identical_generation(
+        vllm_runner, model: str, dtype: str, max_tokens: int,
+        example_prompts) -> None:
+    """Check greedy outputs match for tensor_parallel_size 2 and 1."""
+    # The tensor-parallel (2-way) runner is created and torn down first,
+    # then the single-device runner.
+    with vllm_runner(model, dtype=dtype, tensor_parallel_size=2) as tp2_model:
+        tp2_outputs = tp2_model.generate_greedy(example_prompts, max_tokens)
+
+    with vllm_runner(model, dtype=dtype, tensor_parallel_size=1) as tp1_model:
+        tp1_outputs = tp1_model.generate_greedy(example_prompts, max_tokens)
+
+    check_outputs_equal(
+        outputs_0_lst=tp1_outputs,
+        outputs_1_lst=tp2_outputs,
+        name_0="vllm_tp_1",
+        name_1="vllm_tp_2",
+    )
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["float"])
 def test_model_print(

--- a/tests/models/decoder_only/language/test_mamba.py
+++ b/tests/models/decoder_only/language/test_mamba.py
+"""Compare the outputs of HF and vLLM when using greedy sampling for Mamba.
+Run `pytest tests/models/decoder_only/language/test_mamba.py`.
+"""
+import pytest
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from vllm.sampling_params import SamplingParams
+from vllm.worker.model_runner import _get_graph_batch_size
+from ...utils import check_outputs_equal
+MODELS = ["state-spaces/mamba-130m-hf"]
+# Use lower-level interfaces to create this greedy generator, as mamba will
+# choke on the model_kwarg 'attention_mask' if hf_model.generate_greedy is used.
+def generate_greedy(model_name, example_prompts, max_tokens):
+    """Generate a completion for each prompt with the HF model directly.
+
+    Args:
+        model_name: Name passed to ``from_pretrained`` for both the
+            tokenizer and the causal LM.
+        example_prompts: Iterable of prompt strings.
+        max_tokens: Forwarded to ``model.generate`` as ``max_new_tokens``.
+
+    Returns:
+        A list with one ``(token_ids, text)`` tuple per prompt, where
+        ``token_ids`` is the first sequence returned by ``model.generate``
+        as a plain list and ``text`` is that sequence decoded with special
+        tokens skipped.
+    """
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model = AutoModelForCausalLM.from_pretrained(model_name)
+
+    results = []
+    for prompt in example_prompts:
+        # Only input_ids are passed on; the rest of the tokenizer output
+        # (e.g. attention_mask) is deliberately not forwarded.
+        encoded = tokenizer(prompt, return_tensors="pt", truncation=True)
+        prompt_ids = encoded["input_ids"].to(model.device)
+
+        sequence = model.generate(prompt_ids, max_new_tokens=max_tokens)[0]
+        text = tokenizer.decode(sequence, skip_special_tokens=True)
+        results.append((sequence.tolist(), text))
+    return results
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("max_tokens", [96])
+def test_models(
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+) -> None:
+    """Assert vLLM greedy output equals the HF reference for every prompt."""
+    # The HF reference is produced first, before the vLLM runner is created.
+    hf_outputs = generate_greedy(model, example_prompts, max_tokens)
+
+    with vllm_runner(model, dtype=dtype) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+
+    # Index by prompt position so a missing output on either side raises
+    # instead of being silently skipped.
+    for i, _ in enumerate(example_prompts):
+        hf_output_ids, hf_output_str = hf_outputs[i]
+        vllm_output_ids, vllm_output_str = vllm_outputs[i]
+        # Text is compared first, then the token ids.
+        assert hf_output_str == vllm_output_str, (
+            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
+        assert hf_output_ids == vllm_output_ids, (
+            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("max_tokens", [96])
+def test_batching(
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+) -> None:
+    """Check one-prompt-at-a-time generation matches a single batched call."""
+    # To pass the small model tests, we need full precision.
+    with vllm_runner(model, dtype=dtype) as vllm_model:
+        # Each prompt is submitted on its own ...
+        per_prompt_outputs = [
+            vllm_model.generate_greedy([prompt], max_tokens)[0]
+            for prompt in example_prompts
+        ]
+        # ... and then all prompts together on the same runner.
+        batched_outputs = vllm_model.generate_greedy(example_prompts,
+                                                     max_tokens)
+
+    check_outputs_equal(
+        outputs_0_lst=per_prompt_outputs,
+        outputs_1_lst=batched_outputs,
+        name_0="for_loop_vllm",
+        name_1="batched_vllm",
+    )
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("max_tokens", [10])
+def test_chunked_prefill_with_parallel_sampling(vllm_runner, example_prompts,
+                                                model: str, dtype: str,
+                                                max_tokens: int) -> None:
+    """Smoke-test chunked prefill together with n > 1 sampling.
+
+    In this case, prefill is populated with decoding tokens and we test
+    that it doesn't fail. This test might fail if cache is not allocated
+    correctly for n > 1 decoding steps inside a chunked prefill forward
+    pass (where we have both prefill and decode together). No output is
+    compared; the generate call only has to complete.
+    """
+    sampling_params = SamplingParams(n=3,
+                                     temperature=1,
+                                     seed=0,
+                                     max_tokens=max_tokens)
+    runner_options = {
+        "dtype": dtype,
+        "enable_chunked_prefill": True,
+        "max_num_batched_tokens": 30,
+        # forces prefill chunks with decoding
+        "max_num_seqs": 10,
+    }
+    with vllm_runner(model, **runner_options) as vllm_model:
+        vllm_model.generate(example_prompts, sampling_params)
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("max_tokens", [32])
+@pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16])
+def test_chunked_prefill(vllm_runner, example_prompts, model: str, dtype: str,
+                         max_tokens: int,
+                         chunked_prefill_token_size: int) -> None:
+    """
+    Checks exact match decode between huggingface model and vllm runner with
+    chunked prefill.
+
+    ``chunked_prefill_token_size`` is used for both ``max_num_seqs`` and
+    ``max_num_batched_tokens`` of the vLLM runner.
+    """
+    # Reference output comes from the module-level HF greedy helper.
+    hf_reference = generate_greedy(model, example_prompts, max_tokens)
+
+    with vllm_runner(
+            model,
+            dtype=dtype,
+            enable_chunked_prefill=True,
+            max_num_batched_tokens=chunked_prefill_token_size,
+            max_num_seqs=chunked_prefill_token_size,
+    ) as vllm_model:
+        chunked_outputs = vllm_model.generate_greedy(example_prompts,
+                                                     max_tokens=max_tokens)
+
+    check_outputs_equal(
+        outputs_0_lst=chunked_outputs,
+        outputs_1_lst=hf_reference,
+        name_0="chunked",
+        name_1="non_chunked",
+    )
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("max_tokens", [15])
+def test_parallel_sampling(
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+) -> None:
+    """Compare ten single greedy runs with one request sampled with n=10.
+
+    The same prompt is generated greedily ten times in a loop, then once
+    more through a single ``SamplingParams(n=10, temperature=0.001)``
+    request; both result lists are handed to ``check_outputs_equal``.
+    """
+    with vllm_runner(model, dtype=dtype) as vllm_model:
+        # Prompt index 1 is used instead of 0: with prompt 0 the logprobs
+        # get really close and the test doesn't pass.
+        greedy_runs = [
+            vllm_model.generate_greedy([example_prompts[1]], max_tokens)[0]
+            for _ in range(10)
+        ]
+        parallel_params = SamplingParams(n=10,
+                                         temperature=0.001,
+                                         seed=0,
+                                         max_tokens=max_tokens)
+        parallel_result = vllm_model.generate([example_prompts[1]],
+                                              parallel_params)
+    # The single request yields parallel sequences of ids and texts; pair
+    # them up so they have the same (ids, text) shape as the greedy runs.
+    token_ids, texts = parallel_result[0]
+    parallel_outputs = list(zip(token_ids, texts))
+    check_outputs_equal(
+        outputs_0_lst=parallel_outputs,
+        outputs_1_lst=greedy_runs,
+        name_0="vllm_n_lt_1_outputs",
+        name_1="vllm",
+    )
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("max_tokens", [20])
+def test_mamba_cache_cg_padding(
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+) -> None:
+    """Run a batch whose size is not a CUDA-graph captured batch size.
+
+    This test is for verifying that mamba cache is padded to CG captured
+    batch size. If it's not, a torch RuntimeError will be raised because
+    tensor dimensions aren't compatible; that error is turned into an
+    explicit test failure.
+    """
+    # Pad a private copy so the list supplied by the fixture is never
+    # mutated by this test.
+    prompts = list(example_prompts)
+    # Keep adding prompts until the batch size differs from the size
+    # _get_graph_batch_size reports for it.
+    while len(prompts) == _get_graph_batch_size(len(prompts)):
+        prompts.append(prompts[0])
+    try:
+        with vllm_runner(model, dtype=dtype) as vllm_model:
+            vllm_model.generate_greedy(prompts, max_tokens)
+    except RuntimeError:
+        pytest.fail(
+            "Couldn't run batch size which is not equal to a Cuda Graph "
+            "captured batch size. "
+            "Could be related to mamba cache not padded correctly")
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("max_tokens", [20])
+def test_models_preemption_recompute(
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+) -> None:
+    """Tests that outputs are identical with and w/o preemptions (recompute).
+
+    The same runner generates twice: once with the scheduler's
+    ENABLE_ARTIFICIAL_PREEMPT flag switched on, once with it switched off.
+    """
+    assert dtype == "float"
+
+    with vllm_runner(model, dtype=dtype) as vllm_model:
+        scheduler = vllm_model.model.llm_engine.scheduler[0]
+
+        # First pass: artificial preemption enabled.
+        scheduler.ENABLE_ARTIFICIAL_PREEMPT = True
+        preempt_vllm_outputs = vllm_model.generate_greedy(
+            example_prompts, max_tokens)
+
+        # Second pass: artificial preemption disabled.
+        scheduler.ENABLE_ARTIFICIAL_PREEMPT = False
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+
+    check_outputs_equal(
+        outputs_0_lst=preempt_vllm_outputs,
+        outputs_1_lst=vllm_outputs,
+        name_0="vllm_preepmtions",
+        name_1="vllm",
+    )
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["float"])
+def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks(
+    vllm_runner,
+    model: str,
+    dtype: str,
+    example_prompts,
+) -> None:
+    """Submit 100 identical prompts to a runner limited to 10 sequences.
+
+    A ValueError raised during generation is reported as a test failure.
+    """
+    # This test is for verifying that the Mamba inner state management doesn't
+    # collapse in case where the number of incoming requests and
+    # finished_requests_ids is larger than the maximum Mamba block capacity.
+    # This could generally happen due to the fact that Mamba does support
+    # statelessness mechanism where it can cleanup new incoming requests in
+    # a single step.
+    try:
+        with vllm_runner(model, dtype=dtype, max_num_seqs=10) as vllm_model:
+            vllm_model.generate_greedy([example_prompts[0]] * 100, 10)
+    except ValueError:
+        # The two literals are concatenated, so the first one must end with
+        # a space to keep "between steps" from running together.
+        pytest.fail("Mamba inner state wasn't cleaned up properly between "
+                    "steps finished requests registered unnecessarily ")
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["float"])
+def test_state_cleanup(
+    vllm_runner,
+    model: str,
+    dtype: str,
+    example_prompts,
+) -> None:
+    """Generate the same 100-prompt batch ten times on one runner.
+
+    This test is for verifying that the Mamba state is cleaned up between
+    steps. If it's not cleaned, an error would be expected; a ValueError
+    is reported as a test failure.
+    """
+    failure_message = ("Mamba inner state wasn't cleaned up between states, "
+                       "could be related to finished_requests_ids")
+    try:
+        with vllm_runner(model, dtype=dtype) as vllm_model:
+            for _ in range(10):
+                repeated_prompts = [example_prompts[0]] * 100
+                vllm_model.generate_greedy(repeated_prompts, 1)
+    except ValueError:
+        pytest.fail(failure_message)
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["float"])
+def test_model_print(
+    vllm_runner,
+    model: str,
+    dtype: str,
+) -> None:
+    """Verify that the model's extra_repr can be printed correctly."""
+    with vllm_runner(model, dtype=dtype) as vllm_model:
+        # Walk from the engine down to the model object held by the driver
+        # worker's model runner, then print it.
+        executor = vllm_model.model.llm_engine.model_executor
+        loaded_model = executor.driver_worker.model_runner.model
+        print(loaded_model)
--- a/tests/models/decoder_only/language/test_phimoe.py
+++ b/tests/models/decoder_only/language/test_phimoe.py
@@ -7,6 +7,7 @@ import torch
 from vllm.utils import is_cpu
+from ....utils import large_gpu_test
 from ...utils import check_logprobs_close
 MODELS = [
@@ -69,20 +70,10 @@ def test_phimoe_routing_function():
        assert torch.equal(topk_ids, ground_truth[test_id]["topk_ids"])
-def get_gpu_memory():
-    try:
-        props = torch.cuda.get_device_properties(torch.cuda.current_device())
-        gpu_memory = props.total_memory / (1024**3)
-        return gpu_memory
-    except Exception:
-        return 0
 @pytest.mark.skipif(condition=is_cpu(),
                    reason="This test takes a lot time to run on CPU, "
                    "and vllm CI's disk space is not enough for this model.")
-@pytest.mark.skipif(condition=get_gpu_memory() < 100,
+@large_gpu_test(min_gb=80)
-                    reason="Skip this test if GPU memory is insufficient.")
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["bfloat16"])
 @pytest.mark.parametrize("max_tokens", [64])

--- a/tests/models/decoder_only/vision_language/test_fuyu.py
+++ b/tests/models/decoder_only/vision_language/test_fuyu.py
@@ -65,8 +65,8 @@ def run_test(
    # max_model_len should be greater than image_feature_size
    with vllm_runner(model,
-                     max_model_len=2560,
+                     max_model_len=2048,
-                     max_num_seqs=1,
+                     max_num_seqs=2,
                     dtype=dtype,
                     tensor_parallel_size=tensor_parallel_size,
                     distributed_executor_backend=distributed_executor_backend,
@@ -80,8 +80,6 @@ def run_test(
        ]
    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_model.model.get_output_embeddings = lambda: \
-            hf_model.model.language_model.get_output_embeddings()
        eos_token_id = hf_model.processor.tokenizer.eos_token_id
        hf_outputs_per_image = [
            hf_model.generate_greedy_logprobs_limit(prompts,

--- a/tests/models/decoder_only/vision_language/test_glm4.py
+++ b/tests/models/decoder_only/vision_language/test_glm4.py
+from typing import List, Optional, Tuple, Type
+import pytest
+from vllm.multimodal.utils import rescale_image_size
+from vllm.transformers_utils.tokenizer import patch_padding_side
+from ....conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner
+from ....utils import large_gpu_test
+from ...utils import check_logprobs_close
+HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
+    "stop_sign":
+    "What's the content of the image?",
+    "cherry_blossom":
+    "What is the season?",
+})
+models = ["THUDM/glm-4v-9b"]
+target_dtype = "bfloat16"
+def run_test(
+    hf_runner: Type[HfRunner],
+    vllm_runner: Type[VllmRunner],
+    inputs: List[Tuple[List[str], PromptImageInput]],
+    model: str,
+    *,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+    mm_limit: int,
+    tensor_parallel_size: int,
+    distributed_executor_backend: Optional[str] = None,
+):
+    """Compare greedy logprobs from vLLM and HF for each (prompts, images) pair.
+
+    The vLLM runner is created, used and exited first; the HF runner is
+    created afterwards. Results are compared pairwise with
+    ``check_logprobs_close``.
+
+    Args:
+        hf_runner: Factory/context manager for the HF reference runner.
+        vllm_runner: Factory/context manager for the vLLM runner.
+        inputs: One ``(prompts, images)`` tuple per test case.
+        model: Model name passed to both runners.
+        dtype: Passed to both runners.
+        max_tokens: Generation length passed to both runners.
+        num_logprobs: Number of logprobs requested from both runners.
+        mm_limit: Used as the ``"image"`` entry of ``limit_mm_per_prompt``.
+        tensor_parallel_size: Passed to the vLLM runner.
+        distributed_executor_backend: Passed to the vLLM runner.
+    """
+    # max_model_len should be greater than image_feature_size
+    with vllm_runner(model,
+                     max_model_len=2048,
+                     max_num_seqs=2,
+                     dtype=dtype,
+                     limit_mm_per_prompt={"image": mm_limit},
+                     tensor_parallel_size=tensor_parallel_size,
+                     distributed_executor_backend=distributed_executor_backend,
+                     enforce_eager=True) as vllm_model:
+        # NOTE(review): hard-coded token ids, presumably the model's
+        # end-of-text / turn-delimiter tokens -- confirm against the tokenizer.
+        stop_token_ids = [151329, 151336, 151338]
+        vllm_outputs_per_image = [
+            vllm_model.generate_greedy_logprobs(prompts,
+                                                max_tokens,
+                                                num_logprobs=num_logprobs,
+                                                images=images,
+                                                stop_token_ids=stop_token_ids)
+            for prompts, images in inputs
+        ]
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_processor = hf_model.processor
+        patch_padding_side(hf_processor)
+        # Wrapper installed in place of the HF processor: when images are
+        # given, the text and images are routed through apply_chat_template
+        # as a single user message; otherwise the original processor is
+        # called directly.
+        def processor(*args, text="", images=None, **kwargs):
+            if images is None:
+                # NOTE(review): `text` is not forwarded on this path --
+                # confirm that is intended.
+                return hf_processor(*args, **kwargs)
+            return hf_processor.apply_chat_template(
+                [{
+                    "role": "user",
+                    "image": images,
+                    "content": text
+                }],
+                add_generation_prompt=True,
+                tokenize=True,
+                return_dict=True,
+                **kwargs,
+            )
+        hf_model.processor = processor
+        # Point get_output_embeddings at the transformer's output layer;
+        # presumably required by generate_greedy_logprobs_limit -- TODO confirm.
+        hf_model.model.get_output_embeddings = lambda: \
+            hf_model.model.transformer.output_layer
+        hf_outputs_per_image = [
+            hf_model.generate_greedy_logprobs_limit(
+                prompts,
+                max_tokens,
+                num_logprobs=num_logprobs,
+                images=images,
+            ) for prompts, images in inputs
+        ]
+    # Compare the HF and vLLM results case by case, in input order.
+    for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
+                                        vllm_outputs_per_image):
+        check_logprobs_close(
+            outputs_0_lst=hf_outputs,
+            outputs_1_lst=vllm_outputs,
+            name_0="hf",
+            name_1="vllm",
+        )
+@large_gpu_test(min_gb=48)
+@pytest.mark.parametrize("model", models)
+@pytest.mark.parametrize(
+    "size_factors",
+    [
+        # No image
+        [],
+        # Single-scale
+        [1.0],
+        # Single-scale, batched
+        [1.0, 1.0, 1.0],
+        # Multi-scale
+        [0.25, 0.5, 1.0],
+    ],
+)
+@pytest.mark.parametrize("dtype", [target_dtype])
+@pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
+                dtype: str, max_tokens: int, num_logprobs: int) -> None:
+    """Build per-image inputs at each size factor and run the comparison.
+
+    For every (image, prompt) pair the prompt is repeated once per size
+    factor and the image is rescaled by each factor; the resulting cases
+    are passed to ``run_test`` with one image per prompt on a single device.
+    """
+    pil_images = [asset.pil_image for asset in image_assets]
+
+    inputs_per_image = []
+    for image, prompt in zip(pil_images, HF_IMAGE_PROMPTS):
+        repeated_prompts = [prompt for _ in size_factors]
+        rescaled_images = [
+            rescale_image_size(image, factor) for factor in size_factors
+        ]
+        inputs_per_image.append((repeated_prompts, rescaled_images))
+
+    run_test(
+        hf_runner,
+        vllm_runner,
+        inputs_per_image,
+        model,
+        dtype=dtype,
+        max_tokens=max_tokens,
+        num_logprobs=num_logprobs,
+        mm_limit=1,
+        tensor_parallel_size=1,
+    )
--- a/tests/models/decoder_only/vision_language/test_internvl.py
+++ b/tests/models/decoder_only/vision_language/test_internvl.py
@@ -97,7 +97,8 @@ def run_test(
            self.tokenizer = hf_runner.tokenizer
            self.dtype = hf_runner.model.dtype
-            self.config = AutoConfig.from_pretrained(hf_runner.model_name)
+            self.config = AutoConfig.from_pretrained(hf_runner.model_name,
+                                                     trust_remote_code=True)
            self.vision_config = self.config.vision_config
            self.use_thumbnail = self.config.use_thumbnail
            self.min_num = self.config.min_dynamic_patch

--- a/tests/models/decoder_only/vision_language/test_llava_next_video.py
+++ b/tests/models/decoder_only/vision_language/test_llava_next_video.py
 from typing import List, Optional, Tuple, Type, overload
 import pytest
-import transformers
 from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer
 from vllm.multimodal.utils import (rescale_video_size, resize_video,
@@ -158,8 +157,6 @@ def run_test(
        )
-@pytest.mark.skipif(transformers.__version__ < "4.45",
-                    reason="Waiting for next transformers release")
 @pytest.mark.parametrize("model", models)
 @pytest.mark.parametrize(
    "size_factors",
@@ -203,8 +200,6 @@ def test_models(hf_runner, vllm_runner, video_assets, model, size_factors,
    )
-@pytest.mark.skipif(transformers.__version__ < "4.45",
-                    reason="Waiting for next transformers release")
 @pytest.mark.parametrize("model", models)
 @pytest.mark.parametrize(
    "sizes",

--- a/tests/models/decoder_only/vision_language/test_llava_onevision.py
+++ b/tests/models/decoder_only/vision_language/test_llava_onevision.py
 from typing import List, Optional, Tuple, Type, overload
 import pytest
-import transformers
 from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer,
                          BatchEncoding)
@@ -12,13 +11,13 @@ from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
 from ....conftest import (VIDEO_ASSETS, HfRunner, PromptImageInput, VllmRunner,
                          _VideoAssets)
+from ....utils import large_gpu_test
 from ...utils import check_logprobs_close
 # Video test
 HF_VIDEO_PROMPTS = VIDEO_ASSETS.prompts({
    "sample_demo_1":
-    "<|im_start|>user <video>\nwhy is this video funny? \
+    "<|im_start|>user\n<video>\nwhy is this video funny?<|im_end|>\n<|im_start|>assistant\n"  # noqa: E501
-    <|im_end|><|im_start|>assistant\n"
 })
 models = ["llava-hf/llava-onevision-qwen2-7b-ov-hf"]
@@ -166,8 +165,7 @@ def run_video_test(
        )
-@pytest.mark.skipif(transformers.__version__ < "4.45",
+@large_gpu_test(min_gb=48)
-                    reason="Waiting for next transformers release")
 @pytest.mark.parametrize("model", models)
 @pytest.mark.parametrize(
    "size_factors",
@@ -211,8 +209,7 @@ def test_models(hf_runner, vllm_runner, video_assets, model, size_factors,
    )
-@pytest.mark.skipif(transformers.__version__ < "4.45",
+@large_gpu_test(min_gb=48)
-                    reason="Waiting for next transformers release")
 @pytest.mark.parametrize("model", models)
 @pytest.mark.parametrize(
    "sizes",
@@ -259,7 +256,8 @@ def run_image_test(
    # max_model_len should be greater than image_feature_size
    with vllm_runner(model,
                     dtype=dtype,
-                     max_model_len=32768,
+                     max_model_len=16384,
+                     max_num_seqs=2,
                     tensor_parallel_size=tensor_parallel_size,
                     distributed_executor_backend=distributed_executor_backend,
                     enforce_eager=True,
@@ -305,8 +303,7 @@ def run_image_test(
        )
-@pytest.mark.skipif(transformers.__version__ < "4.45",
+@large_gpu_test(min_gb=48)
-                    reason="Waiting for next transformers release")
 @pytest.mark.parametrize("model", models)
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])
@@ -319,14 +316,10 @@ def test_models_multiple_image_inputs(hf_runner, vllm_runner, image_assets,
    inputs = [(
        [
-            "<|im_start|>user <image><image>\nDescribe 2 images. \
+            "<|im_start|>user\n<image><image>\nDescribe 2 images.<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
-                <|im_end|><|im_start|>assistant\n",
+            "<|im_start|>user\n<image><image>\nDescribe 2 images.<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
-            "<|im_start|>user <image><image>\nDescribe 2 images. \
+            "<|im_start|>user\n<image><image><image><image>\nDescribe 4 images.<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
-                <|im_end|><|im_start|>assistant\n",
+            "<|im_start|>user\n<image>\nWhat is the season?<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
-            "<|im_start|>user <image><image><image><image>\nDescribe 4 images. \
-                <|im_end|><|im_start|>assistant\n",
-            "<|im_start|>user <image>\nWhat is the season? \
-                <|im_end|><|im_start|>assistant\n",
        ],
        [
            [stop_sign, cherry_blossom],

--- a/tests/models/decoder_only/vision_language/test_minicpmv.py
+++ b/tests/models/decoder_only/vision_language/test_minicpmv.py
@@ -79,7 +79,7 @@ def run_test(
    # max_model_len should be greater than image_feature_size
    with vllm_runner(model,
                     max_model_len=4096,
-                     max_num_seqs=1,
+                     max_num_seqs=2,
                     dtype=dtype,
                     limit_mm_per_prompt={"image": mm_limit},
                     tensor_parallel_size=tensor_parallel_size,

--- a/tests/models/decoder_only/vision_language/test_phi3v.py
+++ b/tests/models/decoder_only/vision_language/test_phi3v.py
 import os
 import re
-from typing import Callable, List, Optional, Tuple, Type
+from typing import List, Optional, Tuple, Type
 import pytest
 import torch
 from transformers import AutoImageProcessor, AutoTokenizer
-from vllm.inputs import InputContext, LLMInputs
+from vllm.inputs import InputContext, token_inputs
 from vllm.model_executor.models.phi3v import _IMAGE_TOKEN_ID
 from vllm.multimodal import MultiModalRegistry
 from vllm.multimodal.utils import rescale_image_size
@@ -90,7 +90,7 @@ def run_test(
    # max_model_len should be greater than image_feature_size
    with vllm_runner(model,
                     max_model_len=4096,
-                     max_num_seqs=1,
+                     max_num_seqs=2,
                     dtype=dtype,
                     limit_mm_per_prompt={"image": mm_limit},
                     tensor_parallel_size=tensor_parallel_size,
@@ -311,7 +311,7 @@ def test_input_mapper_override(model: str, image_assets: _ImageAssets,
    (4, 781),
    (16, 2653),
 ])
-def test_max_tokens_override(get_max_phi3v_image_tokens: Callable, model: str,
+def test_max_tokens_override(get_max_phi3v_image_tokens, model: str,
                             num_crops: int, expected_max_tokens: int):
    """Ensure get_max_phi3v_image_tokens handles num_crops properly."""
    # NOTE: mm_processor_kwargs on the context in this test is unused, since
@@ -343,8 +343,8 @@ def test_max_tokens_override(get_max_phi3v_image_tokens: Callable, model: str,
    (16, 2653, 1),
    (16, 2653, 2),
 ])
-def test_dummy_data_override(dummy_data_for_phi3v: Callable, model: str,
+def test_dummy_data_override(dummy_data_for_phi3v, model: str, num_crops: int,
-                             num_crops: int, toks_per_img: int, num_imgs: int):
+                             toks_per_img: int, num_imgs: int):
    """Ensure dummy_data_for_phi3v handles num_crops properly."""
    # Same as the previous test - don't initialize mm_processor_kwargs
    # in this test and assume that the kwargs will be correctly expanded by
@@ -374,7 +374,7 @@ def test_dummy_data_override(dummy_data_for_phi3v: Callable, model: str,
    (16, 1921, 1),
    (16, 1921, 2),
 ])
-def test_input_processor_override(input_processor_for_phi3v: Callable,
+def test_input_processor_override(input_processor_for_phi3v,
                                  image_assets: _ImageAssets, model: str,
                                  num_crops: int, expected_toks_per_img: int,
                                  num_imgs: int):
@@ -393,16 +393,14 @@ def test_input_processor_override(input_processor_for_phi3v: Callable,
    prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n"
    images = [image_assets[0].pil_image] * num_imgs
-    llm_inputs = LLMInputs(prompt_token_ids=tokenizer.encode(prompt),
+    inputs = token_inputs(prompt_token_ids=tokenizer.encode(prompt),
-                           prompt=prompt,
+                          prompt=prompt,
-                           multi_modal_data={"image": images})
+                          multi_modal_data={"image": images})
-    proc_llm_inputs = input_processor_for_phi3v(
+    processed_inputs = input_processor_for_phi3v(ctx,
-        ctx=ctx,
+                                                 inputs,
-        llm_inputs=llm_inputs,
+                                                 num_crops=num_crops)
-        num_crops=num_crops,
-    )
    # Ensure we have the right number of placeholders per num_crops size
-    img_tok_count = proc_llm_inputs["prompt_token_ids"].count(_IMAGE_TOKEN_ID)
+    img_tok_count = processed_inputs["prompt_token_ids"].count(_IMAGE_TOKEN_ID)
    assert img_tok_count == expected_toks_per_img * num_imgs
--- a/tests/models/decoder_only/vision_language/test_pixtral.py
+++ b/tests/models/decoder_only/vision_language/test_pixtral.py
@@ -17,7 +17,7 @@ from vllm import EngineArgs, LLMEngine, SamplingParams, TokensPrompt
 from vllm.multimodal import MultiModalDataBuiltins
 from vllm.sequence import Logprob, SampleLogprobs
-from ....utils import VLLM_PATH
+from ....utils import VLLM_PATH, large_gpu_test
 from ...utils import check_logprobs_close
 if TYPE_CHECKING:
@@ -121,10 +121,7 @@ def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
            for tokens, text, logprobs in json_data]
-@pytest.mark.skip(
+@large_gpu_test(min_gb=80)
-    reason=
-    "Model is too big, test passed on A100 locally but will OOM on CI machine."
-)
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_model_len", MAX_MODEL_LEN)
 @pytest.mark.parametrize("dtype", ["bfloat16"])
@@ -157,10 +154,7 @@ def test_chat(
                         name_1="output")
-@pytest.mark.skip(
+@large_gpu_test(min_gb=80)
-    reason=
-    "Model is too big, test passed on A100 locally but will OOM on CI machine."
-)
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["bfloat16"])
 def test_model_engine(vllm_runner, model: str, dtype: str) -> None: