Merge tag 'v0.10.0' into v0.10.0-dev

711aa9d5 · zhuwenwen · 751c492c · 6d8d0a24 · 711aa9d5 · 711aa9d5
Commit 711aa9d5 authored Jul 30, 2025 by zhuwenwen
20 changed files
--- a/tests/models/language/pooling/test_truncation_control.py
+++ b/tests/models/language/pooling/test_truncation_control.py
@@ -28,7 +28,7 @@ def test_smaller_truncation_size(vllm_runner,

    with vllm_runner(model_name, task="embed",
                     max_model_len=max_model_len) as vllm_model:
-        vllm_output = vllm_model.model.encode(
+        vllm_output = vllm_model.llm.encode(
            input_str, truncate_prompt_tokens=truncate_prompt_tokens)

    prompt_tokens = vllm_output[0].prompt_token_ids
@@ -43,7 +43,7 @@ def test_max_truncation_size(vllm_runner,

    with vllm_runner(model_name, task="embed",
                     max_model_len=max_model_len) as vllm_model:
-        vllm_output = vllm_model.model.encode(
+        vllm_output = vllm_model.llm.encode(
            input_str, truncate_prompt_tokens=truncate_prompt_tokens)

    prompt_tokens = vllm_output[0].prompt_token_ids
@@ -61,7 +61,7 @@ def test_bigger_truncation_size(vllm_runner,
            model_name, task="embed",
            max_model_len=max_model_len) as vllm_model:

-        llm_output = vllm_model.model.encode(
+        llm_output = vllm_model.llm.encode(
            input_str, truncate_prompt_tokens=truncate_prompt_tokens)

        assert llm_output == f"""truncate_prompt_tokens value 

--- a/tests/models/multimodal/generation/test_common.py
+++ b/tests/models/multimodal/generation/test_common.py
@@ -37,6 +37,8 @@ if current_platform.is_rocm():
 REQUIRES_V0_MODELS = [
    # V1 Test: not enough KV cache space in C1.
    "fuyu",
+    # V1 Test: Deadlock issue when processing mm_inputs
+    "llava-onevision-transformers",
 ]

 # yapf: disable
@@ -155,6 +157,7 @@ VLM_TEST_SETTINGS = {
        video_idx_to_prompt=lambda idx: "<|vision_bos|><|VIDEO|><|vision_eos|>", # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
+        num_logprobs= 6 if current_platform.is_cpu() else 5,
        auto_cls=AutoModelForTextToWaveform,
        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
        patch_hf_runner=model_utils.qwen2_5_omni_patch_hf_runner,
@@ -172,6 +175,71 @@ VLM_TEST_SETTINGS = {
        hf_output_post_proc=model_utils.ultravox_trunc_hf_output,
        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
    ),
+    #### Transformers fallback to test
+    ## To reduce test burden, we only test batching arbitrary image size
+    # Dynamic image length and number of patches
+    "llava-onevision-transformers": VLMTestInfo(
+        models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
+        test_type=VLMTestType.IMAGE,
+        prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n",   # noqa: E501
+        max_model_len=16384,
+        hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs("llava-hf/llava-onevision-qwen2-0.5b-ov-hf"),   # noqa: E501
+        auto_cls=AutoModelForImageTextToText,
+        vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
+        image_size_factors=[(0.25, 0.5, 1.0)],
+        vllm_runner_kwargs={
+            "model_impl": "transformers",
+        },
+        marks=[pytest.mark.core_model],
+    ),
+    # FIXME(Isotr0py): Enable this test after
+    # https://github.com/huggingface/transformers/pull/39470 released
+    # "idefics3-transformers": VLMTestInfo(
+    #     models=["HuggingFaceTB/SmolVLM-256M-Instruct"],
+    #     test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
+    #     prompt_formatter=lambda img_prompt:f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:",  # noqa: E501
+    #     img_idx_to_prompt=lambda idx: "<image>",
+    #     max_model_len=8192,
+    #     max_num_seqs=2,
+    #     auto_cls=AutoModelForImageTextToText,
+    #     hf_output_post_proc=model_utils.idefics3_trunc_hf_output,
+    #     image_size_factors=[(0.25, 0.5, 1.0)],
+    #     vllm_runner_kwargs={
+    #         "model_impl": "transformers",
+    #     },
+    #     marks=[pytest.mark.core_model],
+    # ),
+    # Pixel values from processor are not 4D or 5D arrays
+    "qwen2_5_vl-transformers": VLMTestInfo(
+        models=["Qwen/Qwen2.5-VL-3B-Instruct"],
+        test_type=VLMTestType.IMAGE,
+        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
+        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501
+        max_model_len=4096,
+        max_num_seqs=2,
+        auto_cls=AutoModelForImageTextToText,
+        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
+        image_size_factors=[(0.25, 0.2, 0.15)],
+        vllm_runner_kwargs={
+            "model_impl": "transformers",
+        },
+        marks=[large_gpu_mark(min_gb=32)],
+    ),
+    # Check "auto" with fallback to transformers
+    "internvl-transformers": VLMTestInfo(
+        models=["OpenGVLab/InternVL3-1B-hf"],
+        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
+        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
+        img_idx_to_prompt=lambda idx: "<IMG_CONTEXT>",
+        max_model_len=4096,
+        use_tokenizer_eos=True,
+        image_size_factors=[(0.25, 0.5, 1.0)],
+        vllm_runner_kwargs={
+            "model_impl": "auto",
+        },
+        auto_cls=AutoModelForImageTextToText,
+        marks=[pytest.mark.core_model],
+    ),
    #### Extended model tests
    "aria": VLMTestInfo(
        models=[os.path.join(models_path_prefix, "rhymes-ai/Aria")],
@@ -320,6 +388,7 @@ VLM_TEST_SETTINGS = {
        num_logprobs=10,
        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
        auto_cls=AutoModelForImageTextToText,
+        marks=[large_gpu_mark(min_gb=32)],
    ),
    "glm4_1v-video": VLMTestInfo(
        models=["THUDM/GLM-4.1V-9B-Thinking"],
@@ -333,8 +402,7 @@ VLM_TEST_SETTINGS = {
            inputs=custom_inputs.video_with_metadata_glm4_1v(),
            limit_mm_per_prompt={"video": 1},
        )],
-        # This is needed to run on machine with 24GB VRAM
-        vllm_runner_kwargs={"gpu_memory_utilization": 0.95},
+        marks=[large_gpu_mark(min_gb=32)],
    ),
    "h2ovl": VLMTestInfo(
        models = [

--- a/tests/models/multimodal/generation/test_maverick.py
+++ b/tests/models/multimodal/generation/test_maverick.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Create a reduced-layer version of the Maverick model for testing purposes.
+
+This script creates a new model with fewer layers by:
+1. Loading the original Maverick model configuration
+2. Creating a reduced configuration
+3. Generating compatible safetensors files with appropriate weights
+4. Creating the necessary index files for vLLM compatibility
+"""
+
+import json
+import shutil
+from pathlib import Path
+from typing import Any
+
+import pytest
+import torch
+from safetensors.torch import save_file
+from transformers import (AutoConfig, AutoProcessor, AutoTokenizer,
+                          GenerationConfig)
+
+from vllm import LLM, SamplingParams
+
+from ....utils import multi_gpu_test
+
+# Sample prompts for testing
+PROMPTS: list[str] = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+
+
+def run_maverick_serving(model: str):
+    """Generate completions for ``PROMPTS`` with the given Maverick model.
+
+    The engine is built with a fixed set of options (eager execution, 8-way
+    tensor parallelism with expert parallelism, FP8 KV cache) and every
+    prompt/completion pair is printed.  Any failure is reported on stdout
+    and then re-raised to the caller.
+
+    Args:
+        model: Model name or path handed to ``LLM``.
+    """
+
+    rule = "-" * 60
+    engine_options = dict(
+        model=model,
+        max_model_len=2048,
+        enforce_eager=True,
+        tensor_parallel_size=8,
+        enable_expert_parallel=True,
+        trust_remote_code=True,
+        gpu_memory_utilization=0.4,
+        kv_cache_dtype="fp8",
+    )
+
+    try:
+        params = SamplingParams(temperature=0.8, top_p=0.95)
+        engine = LLM(**engine_options)
+        results = engine.generate(PROMPTS, params)
+
+        print("\nGenerated Outputs:\n" + rule)
+        for result in results:
+            # Read both fields before printing either, so a malformed result
+            # fails before any partial output is emitted for it.
+            prompt_text = result.prompt
+            completion = result.outputs[0].text
+            print(f"Prompt:    {prompt_text!r}")
+            print(f"Output:    {completion!r}")
+            print(rule)
+
+    except Exception as exc:
+        print(f"Error initializing or running model: {exc}")
+        raise
+
+
+def create_reduced_maverick_model(
+    original_model_name: str = (
+        "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"),
+    output_dir: str = "/tmp/reduced_maverick",
+    text_layers: int = 4,
+    num_experts: int = 4,
+    vision_layers: int = 2,
+    force_recreate: bool = False,
+) -> str:
+    """
+    Write a reduced, synthetic-weight variant of the Maverick model to disk.
+
+    The original configuration is loaded, shrunk via
+    ``create_reduced_config`` and saved as ``config.json`` next to the
+    tokenizer files, generated safetensors weights, the preprocessor config
+    and (best effort) the generation config.
+
+    Args:
+        original_model_name: Name of the model whose config is the template
+        output_dir: Directory that receives the reduced model
+        text_layers: Number of text transformer layers to keep
+        num_experts: Number of experts per MoE layer
+        vision_layers: Number of vision transformer layers to keep
+        force_recreate: Delete and rebuild ``output_dir`` if it exists;
+            when False an existing directory is returned untouched
+
+    Returns:
+        Path to the reduced model directory, as a string
+
+    Raises:
+        Exception: Any error during creation is re-raised after the
+            partially written output directory has been removed.
+    """
+
+    print(
+        f"Creating reduced Maverick model with {text_layers} text layers and "
+        f"{vision_layers} vision layers...")
+
+    target_dir = Path(output_dir)
+    if target_dir.exists():
+        if not force_recreate:
+            # Reuse the directory as-is; its contents are not validated.
+            print(f"Output directory {output_dir} already exists. "
+                  "Use --force-recreate to overwrite.")
+            return str(target_dir)
+        shutil.rmtree(target_dir)
+
+    target_dir.mkdir(parents=True, exist_ok=True)
+
+    try:
+        print("Loading original model configuration...")
+        source_config = AutoConfig.from_pretrained(original_model_name,
+                                                   trust_remote_code=True)
+
+        print("Creating reduced configuration...")
+        small_config = create_reduced_config(source_config, text_layers,
+                                             num_experts, vision_layers)
+
+        config_file = target_dir / "config.json"
+        with config_file.open("w") as handle:
+            json.dump(small_config, handle, indent=2)
+        print(f"Saved reduced config to {config_file}")
+
+        print("Copying tokenizer files...")
+        copy_tokenizer_files(original_model_name, target_dir)
+
+        print("Creating reduced safetensors files...")
+        create_reduced_safetensors(source_config, small_config, target_dir)
+
+        print("Creating preprocessor config...")
+        create_preprocessor_config(source_config, target_dir)
+
+        # The generation config is optional: a failure here is only logged.
+        try:
+            generation = GenerationConfig.from_pretrained(original_model_name)
+            generation.save_pretrained(target_dir)
+            print("Copied generation config")
+        except Exception as gen_err:
+            print(f"Could not copy generation config: {gen_err}")
+
+        print(f"Successfully created reduced Maverick model at {target_dir}")
+        return str(target_dir)
+
+    except Exception as err:
+        print(f"Error creating reduced model: {err}")
+        # Do not leave a half-written model behind.
+        if target_dir.exists():
+            shutil.rmtree(target_dir)
+        raise
+
+
+def create_reduced_config(original_config: Any, text_layers: int,
+                          num_experts: int,
+                          vision_layers: int) -> dict[str, Any]:
+    """Return a shrunken dictionary copy of ``original_config``.
+
+    When a ``text_config`` section is present, its layer and expert counts
+    are overridden and ``hidden_size`` / ``head_dim`` are floor-divided by
+    four.  When a ``vision_config`` section is present, its layer count is
+    overridden.  ``_name_or_path`` is always rewritten to mark the result
+    as a reduced model.
+
+    Args:
+        original_config: Object exposing ``to_dict()``.
+        text_layers: New ``text_config.num_hidden_layers``.
+        num_experts: New ``text_config.num_local_experts``.
+        vision_layers: New ``vision_config.num_hidden_layers``.
+
+    Returns:
+        The modified configuration dictionary.
+    """
+
+    reduced = original_config.to_dict()
+
+    if "text_config" in reduced:
+        text_cfg = reduced["text_config"]
+
+        previous_layers = text_cfg["num_hidden_layers"]
+        text_cfg["num_hidden_layers"] = text_layers
+        print(
+            f"Reduced text layers from {previous_layers} to {text_layers}"
+        )
+
+        previous_experts = text_cfg["num_local_experts"]
+        text_cfg["num_local_experts"] = num_experts
+        print(
+            f"Reduced num experts from {previous_experts} to {num_experts}"
+        )
+
+        # Both width-like dimensions shrink by the same factor.
+        shrink_factor = 4
+
+        previous_hidden = text_cfg["hidden_size"]
+        smaller_hidden = previous_hidden // shrink_factor
+        text_cfg["hidden_size"] = smaller_hidden
+        print(f"Reduced hidden size from {previous_hidden} to "
+              f"{smaller_hidden}")
+
+        previous_head_dim = text_cfg["head_dim"]
+        smaller_head_dim = previous_head_dim // shrink_factor
+        text_cfg["head_dim"] = smaller_head_dim
+        print(f"Reduced head dim from {previous_head_dim} to "
+              f"{smaller_head_dim}")
+
+    if "vision_config" in reduced:
+        vision_cfg = reduced["vision_config"]
+        previous_vision_layers = vision_cfg["num_hidden_layers"]
+        vision_cfg["num_hidden_layers"] = vision_layers
+        print(f"Reduced vision layers from {previous_vision_layers} "
+              f"to {vision_layers}")
+
+    # Tag the config so the reduced model is recognisable by name.
+    reduced["_name_or_path"] = (
+        f"reduced_maverick_{text_layers}t_{vision_layers}v")
+
+    return reduced
+
+
+def copy_tokenizer_files(original_model_name: str, output_path: Path) -> None:
+    """Save the original model's tokenizer into ``output_path`` (best effort).
+
+    A failure to load or save the tokenizer is only reported as a warning
+    on stdout; it is not propagated.
+
+    Args:
+        original_model_name: Model whose tokenizer is loaded.
+        output_path: Directory the tokenizer is saved to.
+    """
+
+    try:
+        source_tokenizer = AutoTokenizer.from_pretrained(
+            original_model_name,
+            trust_remote_code=True,
+        )
+        source_tokenizer.save_pretrained(output_path)
+        print("Tokenizer files copied successfully")
+    except Exception as exc:
+        print(f"Warning: Could not copy tokenizer files: {exc}")
+
+
+def create_preprocessor_config(original_config: Any,
+                               output_path: Path) -> None:
+    """Save the original model's processor files into ``output_path``.
+
+    The processor is looked up by ``original_config._name_or_path``, falling
+    back to the default Maverick FP8 model name when that attribute is
+    empty.  Unlike the tokenizer copy, a failure here is reported and then
+    re-raised.
+
+    Args:
+        original_config: Configuration object of the original model.
+        output_path: Directory the processor is saved to.
+    """
+
+    try:
+        source_name = (original_config._name_or_path
+                       or "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8")
+        source_processor = AutoProcessor.from_pretrained(
+            source_name,
+            trust_remote_code=True,
+        )
+        source_processor.save_pretrained(output_path)
+        print("Copied original preprocessor config")
+    except Exception as exc:
+        print(f"Could not copy original preprocessor config: {exc}")
+        raise
+
+
+def create_reduced_safetensors(
+    original_config: Any,
+    reduced_config: dict[str, Any],
+    output_path: Path,
+) -> None:
+    """Generate synthetic weights for the reduced model and save them.
+
+    Text, vision and shared (projector) weights are created from the
+    ``text_config`` and ``vision_config`` sections of ``reduced_config``,
+    merged in that order and written below ``output_path``.
+
+    Args:
+        original_config: Configuration of the original model (not used by
+            this function).
+        reduced_config: Reduced configuration dictionary; must contain both
+            ``text_config`` and ``vision_config``.
+        output_path: Directory receiving the safetensors files.
+    """
+
+    print("Generating synthetic weights for reduced model...")
+
+    text_cfg = reduced_config["text_config"]
+    vision_cfg = reduced_config["vision_config"]
+
+    print("Creating text model weights...")
+    text_weights = create_text_model_weights(text_cfg)
+
+    print("Creating vision model weights...")
+    vision_weights = create_vision_model_weights(vision_cfg)
+
+    print("Creating shared model weights...")
+    shared_weights = create_shared_weights(text_cfg, vision_cfg)
+
+    # Later groups win on a name clash, matching successive dict.update calls.
+    all_weights = {**text_weights, **vision_weights, **shared_weights}
+
+    print("Saving weights to safetensors files...")
+    save_weights_to_safetensors(all_weights, output_path)
+
+
+def create_text_model_weights(
+        text_config: dict[str, Any]) -> dict[str, torch.Tensor]:
+    """Build random placeholder tensors for the text (language) model.
+
+    Every layer receives attention projections and two layer norms.  A layer
+    gets MoE feed-forward weights (router, per-expert projections with
+    all-ones ``weight_scale`` tensors, and a shared expert) when
+    ``layer_idx + 1`` is a multiple of ``interleave_moe_layer_step``
+    (default 1); otherwise it gets a dense feed-forward block sized by
+    ``intermediate_size_mlp``.
+
+    Args:
+        text_config: Text-model configuration dictionary.
+
+    Returns:
+        Mapping from tensor name to tensor, in creation order.
+    """
+
+    bf16 = torch.bfloat16
+
+    vocab_size = text_config["vocab_size"]
+    hidden = text_config["hidden_size"]
+    moe_width = text_config["intermediate_size"]
+    dense_width = text_config["intermediate_size_mlp"]
+    layer_count = text_config["num_hidden_layers"]
+    n_heads = text_config["num_attention_heads"]
+    n_kv_heads = text_config.get("num_key_value_heads", n_heads)
+
+    # MoE specific parameters
+    num_experts = text_config.get("num_local_experts")
+    assert (num_experts
+            is not None), "num_local_experts must be specified for MoE"
+
+    # Derived from the hidden size; the config's own "head_dim" is not read.
+    head_dim = hidden // n_heads
+    q_width = n_heads * head_dim
+    kv_width = n_kv_heads * head_dim
+
+    # With a step of N, layers N-1, 2N-1, ... (0-based) are MoE layers.
+    moe_step = text_config.get("interleave_moe_layer_step", 1)
+
+    # (projection name, weight shape) for each feed-forward flavour.
+    moe_shapes = (
+        ("gate_proj", (moe_width, hidden)),
+        ("up_proj", (moe_width, hidden)),
+        ("down_proj", (hidden, moe_width)),
+    )
+    dense_shapes = (
+        ("gate_proj", (dense_width, hidden)),
+        ("up_proj", (dense_width, hidden)),
+        ("down_proj", (hidden, dense_width)),
+    )
+    attn_shapes = (
+        ("q_proj", (hidden, q_width)),
+        ("k_proj", (hidden, kv_width)),
+        ("v_proj", (kv_width, hidden)),
+        ("o_proj", (hidden, q_width)),
+    )
+
+    weights: dict[str, torch.Tensor] = {
+        "language_model.model.embed_tokens.weight":
+        torch.randn(vocab_size, hidden, dtype=torch.float16),
+    }
+
+    for layer_idx in range(layer_count):
+        prefix = f"language_model.model.layers.{layer_idx}"
+        print(f"Creating weights for layer {prefix}...")
+
+        # Self-attention: separate q, k, v and o projections.
+        for proj, shape in attn_shapes:
+            weights[f"{prefix}.self_attn.{proj}.weight"] = torch.randn(
+                *shape, dtype=bf16)
+        print("Self-attention weights created.")
+
+        ffn = f"{prefix}.feed_forward"
+        if moe_step > 0 and (layer_idx + 1) % moe_step == 0:
+            # 1. Router
+            weights[f"{ffn}.router.weight"] = torch.randn(
+                num_experts, hidden, dtype=torch.float16)
+
+            # 2. Individual (unfused) experts: weights first, then scales.
+            for expert_idx in range(num_experts):
+                expert = f"{ffn}.experts.{expert_idx}"
+                for proj, shape in moe_shapes:
+                    weights[f"{expert}.{proj}.weight"] = torch.randn(
+                        *shape, dtype=bf16)
+                # One scale per output row of the matching weight.
+                for proj, (rows, _) in moe_shapes:
+                    weights[f"{expert}.{proj}.weight_scale"] = torch.ones(
+                        rows, 1, dtype=bf16)
+
+            # 3. Shared expert
+            for proj, shape in moe_shapes:
+                weights[f"{ffn}.shared_expert.{proj}.weight"] = torch.randn(
+                    *shape, dtype=bf16)
+            print(f"MoE feed-forward weights created for layer {layer_idx}.")
+        else:
+            for proj, shape in dense_shapes:
+                weights[f"{ffn}.{proj}.weight"] = torch.randn(*shape,
+                                                              dtype=bf16)
+            print(f"Dense feed-forward weights created for layer {layer_idx}.")
+
+        for norm in ("input_layernorm", "post_attention_layernorm"):
+            weights[f"{prefix}.{norm}.weight"] = torch.ones(hidden,
+                                                            dtype=bf16)
+        print("Layer norms created.")
+
+    # Final layer norm and output projection
+    weights["language_model.model.norm.weight"] = torch.ones(hidden,
+                                                             dtype=bf16)
+    weights["language_model.lm_head.weight"] = torch.randn(vocab_size,
+                                                           hidden,
+                                                           dtype=bf16)
+
+    return weights
+
+
+def create_vision_model_weights(
+        vision_config: dict[str, Any]) -> dict[str, torch.Tensor]:
+    """Build random placeholder tensors for the vision transformer.
+
+    Each layer gets four square attention projections, a two-layer MLP
+    (``fc1``/``fc2``) and two layer norms.  Weights of projections are
+    random, norm weights are ones and every bias is zero; all tensors are
+    bfloat16.
+
+    Args:
+        vision_config: Vision-model configuration dictionary providing
+            ``hidden_size``, ``intermediate_size`` and ``num_hidden_layers``.
+
+    Returns:
+        Mapping from tensor name to tensor, in creation order.
+    """
+
+    bf16 = torch.bfloat16
+
+    hidden = vision_config["hidden_size"]
+    mlp_width = vision_config["intermediate_size"]
+    layer_count = vision_config["num_hidden_layers"]
+
+    # (sub-module name, weight shape); the bias length is the first dim.
+    linear_shapes = (
+        ("self_attn.q_proj", (hidden, hidden)),
+        ("self_attn.k_proj", (hidden, hidden)),
+        ("self_attn.v_proj", (hidden, hidden)),
+        ("self_attn.o_proj", (hidden, hidden)),
+        ("mlp.fc1", (mlp_width, hidden)),
+        ("mlp.fc2", (hidden, mlp_width)),
+    )
+
+    weights: dict[str, torch.Tensor] = {}
+
+    for layer_idx in range(layer_count):
+        prefix = f"vision_model.model.layers.{layer_idx}"
+
+        for module, (out_dim, in_dim) in linear_shapes:
+            weights[f"{prefix}.{module}.weight"] = torch.randn(out_dim,
+                                                               in_dim,
+                                                               dtype=bf16)
+            weights[f"{prefix}.{module}.bias"] = torch.zeros(out_dim,
+                                                             dtype=bf16)
+
+        for norm in ("input_layernorm", "post_attention_layernorm"):
+            weights[f"{prefix}.{norm}.weight"] = torch.ones(hidden,
+                                                            dtype=bf16)
+            weights[f"{prefix}.{norm}.bias"] = torch.zeros(hidden,
+                                                           dtype=bf16)
+
+    return weights
+
+
+def create_shared_weights(
+        text_config: dict[str, Any],
+        vision_config: dict[str, Any]) -> dict[str, torch.Tensor]:
+    """Build the random weight of the vision-to-text projector.
+
+    Args:
+        text_config: Text configuration; supplies ``hidden_size`` (rows).
+        vision_config: Vision configuration; supplies
+            ``projector_input_dim`` (columns).
+
+    Returns:
+        A single-entry mapping holding
+        ``multi_modal_projector.linear_1.weight`` as a bfloat16 tensor.
+    """
+
+    out_dim = text_config["hidden_size"]
+    in_dim = vision_config["projector_input_dim"]
+
+    # Projects vision features into the text model's hidden space.
+    projector = torch.randn(out_dim, in_dim, dtype=torch.bfloat16)
+    return {"multi_modal_projector.linear_1.weight": projector}
+
+
+def save_weights_to_safetensors(weights: dict[str, torch.Tensor],
+                                output_path: Path) -> None:
+    """Write ``weights`` as safetensors shard(s) plus a JSON index.
+
+    Tensors are packed greedily, in dictionary order, into shards of at most
+    5 GiB (a single tensor larger than that gets a shard of its own).  One
+    shard is saved as ``model.safetensors``; several are saved as
+    ``model-XXXXX-of-YYYYY.safetensors``.  ``model.safetensors.index.json``
+    is written in both cases.
+
+    Args:
+        weights: Mapping from tensor name to tensor.
+        output_path: Directory receiving the files.
+    """
+
+    def byte_size(tensor: torch.Tensor) -> int:
+        return tensor.numel() * tensor.element_size()
+
+    shard_limit = 5 * 1024 * 1024 * 1024  # 5GB per shard
+
+    # Greedy packing: start a new shard once the next tensor would overflow.
+    shards: list[dict[str, torch.Tensor]] = []
+    pending: dict[str, torch.Tensor] = {}
+    pending_bytes = 0
+
+    for name, tensor in weights.items():
+        size = byte_size(tensor)
+
+        if pending and pending_bytes + size > shard_limit:
+            shards.append(pending)
+            pending, pending_bytes = {}, 0
+
+        pending[name] = tensor
+        pending_bytes += size
+
+    if pending:
+        shards.append(pending)
+
+    weight_map: dict[str, str] = {}
+    shard_count = len(shards)
+
+    if shard_count == 1:
+        filename = "model.safetensors"
+        save_file(shards[0], output_path / filename)
+        weight_map = dict.fromkeys(shards[0], filename)
+        print(f"Saved weights to single file: {filename}")
+    else:
+        for number, shard in enumerate(shards, start=1):
+            filename = (
+                f"model-{number:05d}-of-{shard_count:05d}.safetensors")
+            save_file(shard, output_path / filename)
+            weight_map.update(dict.fromkeys(shard, filename))
+            print(f"Saved shard {number}/{shard_count}: {filename}")
+
+    total_bytes = sum(byte_size(tensor) for tensor in weights.values())
+    index_data = {
+        "metadata": {
+            "total_size": total_bytes
+        },
+        "weight_map": weight_map,
+    }
+
+    index_path = output_path / "model.safetensors.index.json"
+    with index_path.open("w") as handle:
+        json.dump(index_data, handle, indent=2)
+
+    print(f"Created index file: {index_path}")
+    print(f"Total model size: "
+          f"{index_data['metadata']['total_size'] / (1024**3):.2f} GB")
+
+
+def run_reduced_model(model_path: str,
+                      should_profile: bool = False,
+                      **kwargs) -> None:
+    """Load the reduced model with vLLM and generate for ``PROMPTS``.
+
+    Args:
+        model_path: Directory of the reduced model.
+        should_profile: Wrap the generation call in
+            ``start_profile()`` / ``stop_profile()``.
+        **kwargs: Extra keyword arguments forwarded to ``LLM``.
+    """
+
+    print(f"\nTesting reduced model at {model_path}...")
+
+    engine = LLM(
+        model=model_path,
+        trust_remote_code=True,
+        max_model_len=512,  # Small context for testing
+        gpu_memory_utilization=0.3,  # Conservative memory usage
+        **kwargs,
+    )
+    params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=50)
+
+    if should_profile:
+        engine.start_profile()
+    results = engine.generate(PROMPTS, params)
+    if should_profile:
+        engine.stop_profile()
+
+    print("Test generation successful!")
+    divider = "-" * 40
+    for result in results:
+        print(f"Prompt: {result.prompt}")
+        print(f"Output: "
+              f"{result.outputs[0].text}")
+        print(divider)
+
+
+@multi_gpu_test(num_gpus=2)
+@pytest.mark.parametrize(
+    "original_model_name,text_layers,num_experts,vision_layers,",
+    [("meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", 4, 4, 2)])
+@pytest.mark.parametrize("enforce_eager", [True, False])
+@pytest.mark.parametrize("tp,ep", [(2, True)])
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+def test_dummy_maverick(
+    original_model_name: str,
+    text_layers: int,
+    num_experts: int,
+    vision_layers: int,
+    enforce_eager: bool,
+    tp: int,
+    ep: bool,
+    output_dir: str = "/tmp/reduced_maverick",
+    force_recreate: bool = True,
+    profile: bool = False,
+) -> None:
+    """Create a reduced Maverick model and run generation on it with vLLM.
+
+    The non-parametrized arguments (``output_dir``, ``force_recreate``,
+    ``profile``) keep their defaults under pytest and are supplied
+    explicitly when this function is called from ``main``.
+    """
+    reduced_model_dir = create_reduced_maverick_model(
+        original_model_name=original_model_name,
+        output_dir=output_dir,
+        text_layers=text_layers,
+        num_experts=num_experts,
+        vision_layers=vision_layers,
+        force_recreate=force_recreate,
+    )
+
+    print(f"\nReduced model created successfully at: {reduced_model_dir}")
+
+    # Engine options are forwarded untouched to the ``LLM`` constructor.
+    engine_options = {
+        "enforce_eager": enforce_eager,
+        "tensor_parallel_size": tp,
+        "enable_expert_parallel": ep,
+    }
+    run_reduced_model(model_path=reduced_model_dir,
+                      should_profile=profile,
+                      **engine_options)
+
+
+def main():
+    """Command-line entry point: parse options, then run the chosen tests.
+
+    ``--test`` builds the reduced model and runs it (TP=2, expert parallel,
+    eager mode); ``--test-original`` runs the original model.  With neither
+    flag given, nothing is executed.
+    """
+
+    import argparse
+
+    parser = argparse.ArgumentParser(
+        description="Create a reduced-layer Maverick model")
+
+    parser.add_argument("--output-dir",
+                        default="/tmp/reduced_maverick",
+                        help="Output directory for the reduced model")
+
+    # Integer size options: (flag, default, help text).
+    size_options = (
+        ("--text-layers", 4, "Number of text transformer layers"),
+        ("--num-experts", 4, "Number of experts"),
+        ("--vision-layers", 2, "Number of vision transformer layers"),
+    )
+    for flag, default, help_text in size_options:
+        parser.add_argument(flag, type=int, default=default, help=help_text)
+
+    # Boolean switches: (flag, help text).
+    switches = (
+        ("--force-recreate", "Force recreation if output directory exists"),
+        ("--test", "Test the created model with vLLM"),
+        ("--profile", "Profile the created model with vLLM"),
+        ("--test-original", "Test the original model with vLLM"),
+    )
+    for flag, help_text in switches:
+        parser.add_argument(flag, action="store_true", help=help_text)
+
+    parser.add_argument(
+        "--original-model",
+        default="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
+        help="Original model name to base the reduction on")
+
+    options = parser.parse_args()
+
+    if options.test:
+        test_dummy_maverick(original_model_name=options.original_model,
+                            output_dir=options.output_dir,
+                            text_layers=options.text_layers,
+                            num_experts=options.num_experts,
+                            vision_layers=options.vision_layers,
+                            force_recreate=options.force_recreate,
+                            tp=2,
+                            ep=True,
+                            enforce_eager=True,
+                            profile=options.profile)
+
+    if options.test_original:
+        run_maverick_serving(options.original_model)
+
+
+if __name__ == "__main__":
+    # ``exit`` is a helper the ``site`` module installs for interactive use;
+    # it is absent under ``python -S`` or in frozen builds.  Raising
+    # SystemExit gives the same exit status without relying on it.
+    raise SystemExit(main())
--- a/tests/models/multimodal/generation/test_pixtral.py
+++ b/tests/models/multimodal/generation/test_pixtral.py
@@ -182,8 +182,7 @@ def test_chat(
    ) as vllm_model:
        outputs = []
        for msg in MSGS:
-            output = vllm_model.model.chat(msg,
-                                           sampling_params=SAMPLING_PARAMS)
+            output = vllm_model.llm.chat(msg, sampling_params=SAMPLING_PARAMS)

            outputs.extend(output)

@@ -219,7 +218,7 @@ def test_multi_modal_placeholders(vllm_runner, prompt,
            max_model_len=8192,
            limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
    ) as vllm_model:
-        outputs = vllm_model.model.generate(prompt)
+        outputs = vllm_model.llm.generate(prompt)

        assert len(outputs) == 1, f"{len(outputs)=}"
        output: RequestOutput = outputs[0]

--- a/tests/models/multimodal/generation/test_voxtral.py
+++ b/tests/models/multimodal/generation/test_voxtral.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import json
+
+import pytest
+import pytest_asyncio
+from mistral_common.audio import Audio
+from mistral_common.protocol.instruct.messages import (AudioChunk, RawAudio,
+                                                       TextChunk, UserMessage)
+
+from vllm.transformers_utils.tokenizer import MistralTokenizer
+
+from ....conftest import AudioTestAssets
+from ....utils import RemoteOpenAIServer
+from .test_ultravox import MULTI_AUDIO_PROMPT, run_multi_audio_test
+
+# Model identifier shared by the server fixture, the prompt builder and both
+# tests in this file.
+MODEL_NAME = "mistralai/Voxtral-Mini-3B-2507"
+# Extra server CLI flags (appended to the args built in the `server` fixture)
+# that set the tokenizer mode, config format and load format to "mistral".
+MISTRAL_FORMAT_ARGS = [
+    "--tokenizer_mode", "mistral", "--config_format", "mistral",
+    "--load_format", "mistral"
+]
+
+
+@pytest.fixture()
+def server(request, audio_assets: AudioTestAssets):
+    """Yield a ``RemoteOpenAIServer`` context for ``MODEL_NAME``.
+
+    The server args are ``--enforce-eager``, ``--limit-mm-per-prompt`` with a
+    JSON value of ``{"audio": <number of test assets>}``, and the flags in
+    ``MISTRAL_FORMAT_ARGS``. ``env_dict`` sets ``VLLM_AUDIO_FETCH_TIMEOUT`` to
+    ``"30"``. ``request`` is accepted but not used in the body.
+    """
+    args = [
+        "--enforce-eager",
+        "--limit-mm-per-prompt",
+        json.dumps({"audio": len(audio_assets)}),
+    ] + MISTRAL_FORMAT_ARGS
+
+    with RemoteOpenAIServer(MODEL_NAME,
+                            args,
+                            env_dict={"VLLM_AUDIO_FETCH_TIMEOUT":
+                                      "30"}) as remote_server:
+        yield remote_server
+
+
+@pytest_asyncio.fixture
+async def client(server):
+    """Yield the async client obtained from ``server.get_async_client()``."""
+    async with server.get_async_client() as async_client:
+        yield async_client
+
+
+def _get_prompt(audio_assets, question):
+    """Build the prompt for ``question`` preceded by every audio asset.
+
+    Each asset's local file is loaded with ``Audio.from_file(...,
+    strict=False)`` and wrapped in an ``AudioChunk``. The audio chunks plus a
+    ``TextChunk`` holding ``question`` form a single user message (converted
+    with ``to_openai()``) that is passed to the Mistral tokenizer's
+    ``apply_chat_template``.
+
+    Returns whatever ``apply_chat_template`` returns.
+    """
+    tokenizer = MistralTokenizer.from_pretrained(MODEL_NAME)
+
+    # Iterate the assets directly instead of indexing via range(len(...)).
+    audios = [
+        Audio.from_file(str(asset.get_local_path()), strict=False)
+        for asset in audio_assets
+    ]
+    audio_chunks = [
+        AudioChunk(input_audio=RawAudio.from_audio(audio)) for audio in audios
+    ]
+
+    text_chunk = TextChunk(text=question)
+    messages = [UserMessage(content=[*audio_chunks, text_chunk]).to_openai()]
+
+    return tokenizer.apply_chat_template(messages=messages)
+
+
+@pytest.mark.core_model
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_models_with_multiple_audios(vllm_runner,
+                                     audio_assets: AudioTestAssets, dtype: str,
+                                     max_tokens: int,
+                                     num_logprobs: int) -> None:
+    """Run the multi-audio check imported from ``test_ultravox`` on Voxtral.
+
+    A single prompt built from ``MULTI_AUDIO_PROMPT`` and all audio assets is
+    passed, together with each asset's ``audio_and_sample_rate``, to
+    ``run_multi_audio_test`` with ``tokenizer_mode="mistral"``.
+    """
+    vllm_prompt = _get_prompt(audio_assets, MULTI_AUDIO_PROMPT)
+    run_multi_audio_test(
+        vllm_runner,
+        [(vllm_prompt, [audio.audio_and_sample_rate
+                        for audio in audio_assets])],
+        MODEL_NAME,
+        dtype=dtype,
+        max_tokens=max_tokens,
+        num_logprobs=num_logprobs,
+        tokenizer_mode="mistral",
+    )
+
+
+@pytest.mark.asyncio
+async def test_online_serving(client, audio_assets: AudioTestAssets):
+    """Send one multi-audio chat request and check it stops at ``max_tokens``.
+
+    The request carries every audio asset plus a text question and asks for
+    at most 10 tokens; the single returned choice must have
+    ``finish_reason == "length"``.
+    """
+
+    def asset_to_chunk(asset):
+        # Load the asset's local file and convert it to an OpenAI-style
+        # audio content part.
+        audio = Audio.from_file(str(asset.get_local_path()), strict=False)
+        # NOTE(review): the format is forced to "wav" before conversion,
+        # presumably because the conversion/payload requires it; confirm.
+        audio.format = "wav"
+        audio_dict = AudioChunk.from_audio(audio).to_openai()
+        return audio_dict
+
+    audio_chunks = [asset_to_chunk(asset) for asset in audio_assets]
+    messages = [{
+        "role":
+        "user",
+        "content": [
+            *audio_chunks,
+            {
+                "type":
+                "text",
+                "text":
+                f"What's happening in these {len(audio_assets)} audio clips?"
+            },
+        ],
+    }]
+
+    chat_completion = await client.chat.completions.create(model=MODEL_NAME,
+                                                           messages=messages,
+                                                           max_tokens=10)
+
+    assert len(chat_completion.choices) == 1
+    choice = chat_completion.choices[0]
+    # With max_tokens=10 the generation is expected to be cut off by length.
+    assert choice.finish_reason == "length"
--- a/tests/models/multimodal/generation/test_whisper.py
+++ b/tests/models/multimodal/generation/test_whisper.py
@@ -107,7 +107,7 @@ def run_test(
            tensor_parallel_size=tensor_parallel_size,
            distributed_executor_backend=distributed_executor_backend,
    ) as vllm_model:
-        llm = vllm_model.model
+        llm = vllm_model.llm

        sampling_params = SamplingParams(
            temperature=0,

--- a/tests/models/multimodal/generation/vlm_utils/core.py
+++ b/tests/models/multimodal/generation/vlm_utils/core.py
@@ -85,7 +85,7 @@ def run_test(
                     enforce_eager=enforce_eager,
                     task=task,
                     **vllm_runner_kwargs_) as vllm_model:
-        tokenizer = vllm_model.model.get_tokenizer()
+        tokenizer = vllm_model.llm.get_tokenizer()

        vllm_kwargs: dict[str, Any] = {}
        if get_stop_token_ids is not None:

--- a/tests/models/multimodal/pooling/test_dse_qwen2_vl.py
+++ b/tests/models/multimodal/pooling/test_dse_qwen2_vl.py
@@ -97,7 +97,7 @@ def _run_test(
                     dtype=dtype,
                     enforce_eager=True,
                     max_model_len=8192) as vllm_model:
-        tokenizer = vllm_model.model.get_tokenizer()
+        tokenizer = vllm_model.llm.get_tokenizer()
        texts = [
            # this is necessary because vllm_model.embed will not apply any
            # templating to the prompt, and therefore lacks an image_pad

--- a/tests/models/multimodal/pooling/test_jinavl_reranker.py
+++ b/tests/models/multimodal/pooling/test_jinavl_reranker.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Union
+
+import pytest
+from transformers import AutoModel
+
+from vllm.entrypoints.chat_utils import ChatCompletionContentPartImageParam
+from vllm.entrypoints.score_utils import ScoreMultiModalParam
+
+from ....conftest import HfRunner, VllmRunner
+
+# Model under test; also the only value of the `model_name` parametrization
+# used by the tests below.
+model_name = "jinaai/jina-reranker-m0"
+
+# Passed as `mm_processor_kwargs` to the vLLM runner in `vllm_reranker`.
+mm_processor_kwargs = {
+    "min_pixels": 3136,
+    "max_pixels": 602112,
+}
+
+# Passed as `limit_mm_per_prompt` to the vLLM runner in `vllm_reranker`.
+limit_mm_per_prompt = {"image": 2}
+
+
+def vllm_reranker(
+    vllm_runner: type[VllmRunner],
+    model_name: str,
+    dtype: str,
+    query_strs: list[str],
+    document_strs: list[str],
+    query_type: str = "text",
+    doc_type: str = "text",
+):
+    """Score ``query_strs`` against ``document_strs`` with vLLM.
+
+    ``query_type`` / ``doc_type`` select how the corresponding strings are
+    handed to ``llm.score``: ``"text"`` passes the list of strings unchanged,
+    ``"image"`` treats each string as a URL and wraps them in a
+    ``ScoreMultiModalParam`` made of ``image_url`` content parts.
+
+    Returns the list of ``output.outputs.score`` values.
+
+    Raises:
+        ValueError: if ``query_type`` or ``doc_type`` is neither ``"text"``
+            nor ``"image"``.
+    """
+
+    def create_image_param(url: str) -> ChatCompletionContentPartImageParam:
+        return {"type": "image_url", "image_url": {"url": f"{url}"}}
+
+    query: Union[list[str], ScoreMultiModalParam]
+    if query_type == "text":
+        query = query_strs
+    elif query_type == "image":
+        query = ScoreMultiModalParam(
+            content=[create_image_param(url) for url in query_strs])
+    else:
+        # Fail before the model is loaded instead of hitting an
+        # UnboundLocalError at the score() call below.
+        raise ValueError(f"Unsupported query_type: {query_type!r}")
+
+    documents: Union[list[str], ScoreMultiModalParam]
+    if doc_type == "text":
+        documents = document_strs
+    elif doc_type == "image":
+        documents = ScoreMultiModalParam(
+            content=[create_image_param(url) for url in document_strs])
+    else:
+        raise ValueError(f"Unsupported doc_type: {doc_type!r}")
+
+    with vllm_runner(
+            model_name,
+            task="score",
+            dtype=dtype,
+            max_num_seqs=2,
+            max_model_len=2048,
+            mm_processor_kwargs=mm_processor_kwargs,
+            limit_mm_per_prompt=limit_mm_per_prompt,
+    ) as vllm_model:
+        outputs = vllm_model.llm.score(query, documents)
+
+    return [output.outputs.score for output in outputs]
+
+
+def hf_reranker(
+    hf_runner: type[HfRunner],
+    model_name: str,
+    dtype: str,
+    query_strs: list[str],
+    document_strs: list[str],
+    query_type: str = "text",
+    doc_type: str = "text",
+):
+    """Score the first query against every document with the HF model.
+
+    Only ``query_strs[0]`` is used; it is paired with each entry of
+    ``document_strs``. The model is loaded through ``hf_runner`` with
+    ``AutoModel`` and ``trust_remote_code=True``, and the pairs are passed to
+    the model's ``compute_score`` with ``max_length=2048`` and the given
+    ``query_type`` / ``doc_type``.
+
+    Returns whatever ``compute_score`` returns.
+    """
+    # Prefix renames handed to the HF loader as ``key_mapping``.
+    # NOTE(review): presumably maps checkpoint weight names onto the module
+    # names of the loaded HF model; confirm.
+    checkpoint_to_hf_mapper = {
+        "visual.": "model.visual.",
+        "model.": "model.language_model.",
+    }
+
+    data_pairs = [[query_strs[0], d] for d in document_strs]
+
+    with hf_runner(
+            model_name,
+            dtype=dtype,
+            trust_remote_code=True,
+            auto_cls=AutoModel,
+            model_kwargs={"key_mapping": checkpoint_to_hf_mapper},
+    ) as hf_model:
+        return hf_model.model.compute_score(data_pairs,
+                                            max_length=2048,
+                                            query_type=query_type,
+                                            doc_type=doc_type)
+
+
+# Visual Documents Reranking
+@pytest.mark.parametrize("model_name", [model_name])
+@pytest.mark.parametrize("dtype", ["half"])
+def test_model_text_image(hf_runner, vllm_runner, model_name, dtype):
+    """Compare HF and vLLM scores for a text query and two image documents.
+
+    Both scores must agree within a relative tolerance of 0.02.
+    """
+    query = ["slm markdown"]
+    documents = [
+        "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png",
+        "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png",
+    ]
+
+    hf_outputs = hf_reranker(hf_runner, model_name, dtype, query, documents,
+                             "text", "image")
+    vllm_outputs = vllm_reranker(vllm_runner, model_name, dtype, query,
+                                 documents, "text", "image")
+
+    assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02)
+    assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02)
+
+
+# Textual Documents Reranking
+@pytest.mark.parametrize("model_name", [model_name])
+@pytest.mark.parametrize("dtype", ["half"])
+def test_model_text_text(hf_runner, vllm_runner, model_name, dtype):
+    """Compare HF and vLLM scores for a text query and two text documents.
+
+    Both scores must agree within a relative tolerance of 0.02.
+    """
+    query = ["slm markdown"]
+    documents = [
+        """We present ReaderLM-v2, a compact 1.5 billion parameter language model designed for efficient 
+        web content extraction. Our model processes documents up to 512K tokens, transforming messy HTML 
+        into clean Markdown or JSON formats with high accuracy -- making it an ideal tool for grounding 
+        large language models. The models effectiveness results from two key innovations: (1) a three-stage 
+        data synthesis pipeline that generates high quality, diverse training data by iteratively drafting, 
+        refining, and critiquing web content extraction; and (2) a unified training framework combining 
+        continuous pre-training with multi-objective optimization. Intensive evaluation demonstrates that 
+        ReaderLM-v2 outperforms GPT-4o-2024-08-06 and other larger models by 15-20% on carefully curated 
+        benchmarks, particularly excelling at documents exceeding 100K tokens, while maintaining significantly 
+        lower computational requirements.""",  # noqa: E501
+        "数据提取么？为什么不用正则啊，你用正则不就全解决了么？",
+    ]
+    hf_outputs = hf_reranker(hf_runner, model_name, dtype, query, documents,
+                             "text", "text")
+    vllm_outputs = vllm_reranker(vllm_runner, model_name, dtype, query,
+                                 documents, "text", "text")
+
+    assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02)
+    assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02)
+
+
+# Image Querying for Textual Documents
+@pytest.mark.parametrize("model_name", [model_name])
+@pytest.mark.parametrize("dtype", ["half"])
+def test_model_image_text(hf_runner, vllm_runner, model_name, dtype):
+    """Compare HF and vLLM scores for an image query and two text documents.
+
+    Both scores must agree within a relative tolerance of 0.02.
+    """
+    query = [
+        "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
+    ]
+    documents = [
+        """We present ReaderLM-v2, a compact 1.5 billion parameter language model designed for efficient
+        web content extraction. Our model processes documents up to 512K tokens, transforming messy HTML
+        into clean Markdown or JSON formats with high accuracy -- making it an ideal tool for grounding
+        large language models. The models effectiveness results from two key innovations: (1) a three-stage
+        data synthesis pipeline that generates high quality, diverse training data by iteratively drafting,
+        refining, and critiquing web content extraction; and (2) a unified training framework combining
+        continuous pre-training with multi-objective optimization. Intensive evaluation demonstrates that
+        ReaderLM-v2 outperforms GPT-4o-2024-08-06 and other larger models by 15-20% on carefully curated
+        benchmarks, particularly excelling at documents exceeding 100K tokens, while maintaining significantly
+        lower computational requirements.""",  # noqa: E501
+        "数据提取么？为什么不用正则啊，你用正则不就全解决了么？",
+    ]
+
+    hf_outputs = hf_reranker(hf_runner, model_name, dtype, query, documents,
+                             "image", "text")
+    vllm_outputs = vllm_reranker(vllm_runner, model_name, dtype, query,
+                                 documents, "image", "text")
+
+    assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02)
+    assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02)
+
+
+# Image Querying for Image Documents
+@pytest.mark.parametrize("model_name", [model_name])
+@pytest.mark.parametrize("dtype", ["half"])
+def test_model_image_image(hf_runner, vllm_runner, model_name, dtype):
+    """Compare HF and vLLM scores for an image query and two image documents.
+
+    Both scores must agree within a relative tolerance of 0.02.
+    """
+    query = [
+        "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
+    ]
+    documents = [
+        "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png",
+        "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png",
+    ]
+
+    hf_outputs = hf_reranker(hf_runner, model_name, dtype, query, documents,
+                             "image", "image")
+    vllm_outputs = vllm_reranker(vllm_runner, model_name, dtype, query,
+                                 documents, "image", "image")
+
+    assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02)
+    assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02)
--- a/tests/models/multimodal/pooling/test_prithvi_mae.py
+++ b/tests/models/multimodal/pooling/test_prithvi_mae.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+import torch
+
+from vllm.utils import set_default_torch_num_threads
+
+from ....conftest import VllmRunner
+
+
+def generate_test_mm_data():
+    """Return constant multimodal inputs for the encode smoke test.
+
+    The dict holds ``pixel_values``, a float16 tensor of shape (6, 512, 512),
+    and ``location_coords``, a float16 tensor of shape (1, 2); both are
+    filled with 1.0.
+    """
+    mm_data = {
+        "pixel_values": torch.full((6, 512, 512), 1.0, dtype=torch.float16),
+        "location_coords": torch.full((1, 2), 1.0, dtype=torch.float16),
+    }
+    return mm_data
+
+
+def _run_test(
+    vllm_runner: type[VllmRunner],
+    model: str,
+) -> None:
+    """Encode ten identical multimodal prompts with ``model`` via vLLM.
+
+    Each prompt uses the token ids ``[1]`` plus the tensors from
+    ``generate_test_mm_data()``. The result of ``encode`` is not inspected,
+    so the test only checks that the call completes.
+    """
+
+    prompt = [
+        {
+            # This model deals with no text input
+            "prompt_token_ids": [1],
+            "multi_modal_data": generate_test_mm_data(),
+        } for _ in range(10)
+    ]
+
+    # Both context managers are entered together: the torch thread setting
+    # wraps the lifetime of the vLLM runner.
+    with (
+            set_default_torch_num_threads(1),
+            vllm_runner(
+                model,
+                task="embed",
+                dtype=torch.float16,
+                enforce_eager=True,
+                skip_tokenizer_init=True,
+                # Limit the maximum number of sequences to avoid the
+                # test going OOM during the warmup run
+                max_num_seqs=32,
+            ) as vllm_model,
+    ):
+        vllm_model.encode(prompt)
+
+
+# Model identifiers used to parametrize `test_models_image`.
+MODELS = ["christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM"]
+
+
+@pytest.mark.core_model
+@pytest.mark.parametrize("model", MODELS)
+def test_models_image(
+    hf_runner,
+    vllm_runner,
+    image_assets,
+    model: str,
+) -> None:
+    """Run ``_run_test`` once for each entry in ``MODELS``.
+
+    The ``hf_runner`` and ``image_assets`` fixtures are requested but not
+    used in the body.
+    """
+    _run_test(
+        vllm_runner,
+        model,
+    )
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -161,6 +161,7 @@ def _test_processing_correctness(
 _ADD_SPECIAL_TOKENS_OVERRIDES = {
    "mllama": False,
    "ovis": False,
+    "paligemma": False,
    "ultravox": False,
    "whisper": False,
 }
@@ -291,7 +292,8 @@ def _test_processing_correctness_one(
    os.path.join(models_path_prefix, "MiniMaxAI/MiniMax-VL-01"),
    os.path.join(models_path_prefix, "allenai/Molmo-7B-D-0924"),
    os.path.join(models_path_prefix, "allenai/Molmo-7B-O-0924"),
-    os.path.join(models_path_prefix, "nvidia/NVLM-D-72B"),
+    os.path.join(models_path_prefix,  "nvidia/NVLM-D-72B"),
+    os.path.join(models_path_prefix, "nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"),
    os.path.join(models_path_prefix, "AIDC-AI/Ovis1.6-Gemma2-9B"),
    os.path.join(models_path_prefix, "AIDC-AI/Ovis1.6-Llama3.2-3B"),
    os.path.join(models_path_prefix, "AIDC-AI/Ovis2-1B"),
@@ -302,7 +304,7 @@ def _test_processing_correctness_one(
    os.path.join(models_path_prefix, "mistralai/Pixtral-12B-2409"),
    os.path.join(models_path_prefix, "mistral-community/pixtral-12b"),
    os.path.join(models_path_prefix, "Qwen/Qwen-VL-Chat"),
-    os.path.join(models_path_prefix, "Qwen/Qwen2-VL-2B-Instruct"),
+    os.path.join(models_path_prefix,  "Qwen/Qwen2-VL-2B-Instruct"),
    os.path.join(models_path_prefix, "Qwen/Qwen2.5-VL-3B-Instruct"),
    os.path.join(models_path_prefix, "Qwen/Qwen2-Audio-7B-Instruct"),
    os.path.join(models_path_prefix, "Qwen/Qwen2.5-Omni-3B"),

--- a/tests/models/multimodal/processing/test_nemotron_vl.py
+++ b/tests/models/multimodal/processing/test_nemotron_vl.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Tests for Nemotron-Nano-VL's multimodal preprocessing kwargs."""
+from collections.abc import Mapping
+from typing import Optional
+
+import pytest
+from PIL import Image
+from transformers import PretrainedConfig
+
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.image import rescale_image_size
+from vllm.multimodal.processing import BaseMultiModalProcessor
+
+from ....conftest import ImageTestAssets
+from ...utils import build_model_context
+
+
+def _get_expected_num_patches(
+    config: PretrainedConfig,
+    image: Image.Image,
+    num_imgs: int,
+    min_num: int,
+    max_num: int,
+):
+    """Return the number of patches expected for a single ``image``.
+
+    The block count comes from ``calculate_internvl_targets`` using the
+    image's width/height, the ratios from
+    ``get_internvl_target_ratios(min_num, max_num)`` and
+    ``config.force_image_size``. One extra patch is added when
+    ``config.use_thumbnail`` is set and there is more than one block.
+
+    ``num_imgs`` is accepted but not used in the body.
+    """
+    # Imported lazily inside the function rather than at module import time.
+    from vllm.model_executor.models.internvl import (
+        calculate_internvl_targets, get_internvl_target_ratios)
+
+    width, height = image.size
+
+    # use_thumbnail=False here: the thumbnail patch is accounted for
+    # explicitly below.
+    blocks, _, _ = calculate_internvl_targets(
+        orig_width=width,
+        orig_height=height,
+        target_ratios=get_internvl_target_ratios(
+            min_num,
+            max_num,
+        ),
+        image_size=config.force_image_size,
+        use_thumbnail=False,
+    )
+    expected_num_patches = blocks
+
+    if config.use_thumbnail and expected_num_patches > 1:
+        expected_num_patches += 1
+
+    return expected_num_patches
+
+
+def _run_check(
+    processor: BaseMultiModalProcessor,
+    images: list[Image.Image],
+    min_num: int,
+    max_num: int,
+    mm_processor_kwargs: Mapping[str, object],
+):
+    """Apply ``processor`` to ``images`` and check token and patch counts.
+
+    The prompt is one ``<image>`` placeholder per image. After
+    ``processor.apply``, the number of ``<image>`` token ids in the prompt
+    must equal 256 times the summed expected patch count, and the first
+    dimension of ``pixel_values_flat`` must equal that patch count.
+    """
+    tokenizer = processor.info.get_tokenizer()
+    config = processor.info.get_hf_config()
+    image_processor = processor.info.get_image_processor()
+
+    # Copy the image processor's thumbnail setting onto the config, which is
+    # what _get_expected_num_patches reads.
+    config.use_thumbnail = image_processor.use_thumbnail
+    prompt = "<image>" * len(images)
+    mm_data = {"image": images}
+
+    total_expected_num_patches = sum(
+        _get_expected_num_patches(config, image, len(images), min_num, max_num)
+        for image in images)
+    print(total_expected_num_patches)
+    processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
+
+    # Ensure the number of image placeholder tokens and the number of pixel
+    # patches both match the expected patch count.
+    # NOTE(review): 256 is presumably the number of image tokens per patch
+    # for this model; confirm.
+    image_token_id = tokenizer.convert_tokens_to_ids("<image>")
+    img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
+    pixel_shape = processed_inputs["mm_kwargs"]["pixel_values_flat"].shape
+    print("Image token count:", img_tok_count, "Pixel shape:", pixel_shape)
+    assert img_tok_count == 256 * total_expected_num_patches
+    assert pixel_shape[0] == total_expected_num_patches
+
+
+@pytest.mark.parametrize("model_id",
+                         ["nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"])
+@pytest.mark.parametrize(
+    "size_factors",
+    [
+        # Single-scale
+        [1.0],
+        # Single-scale, batched
+        [1.0, 1.0, 1.0],
+        # Multi-scale
+        [0.25, 0.5, 1.0],
+        [4.0, 2.0, 1.0],
+    ],
+)
+@pytest.mark.parametrize(
+    ("min_dynamic_patch", "max_dynamic_patch"),
+    [(1, 1), (1, 2), (1, 4), (1, 8), (2, 4), (4, 8)],
+)
+@pytest.mark.parametrize("dynamic_image_size", [True, False])
+@pytest.mark.parametrize("kwargs_on_init", [True, False])
+def test_processor_override(
+    model_id: str,
+    image_assets: ImageTestAssets,
+    size_factors: list[float],
+    min_dynamic_patch: int,
+    max_dynamic_patch: int,
+    dynamic_image_size: Optional[bool],
+    kwargs_on_init: bool,
+):
+    """Check token/patch counts when the patch-count kwargs are overridden.
+
+    The kwargs ``min_dynamic_patch``, ``max_dynamic_patch`` and
+    ``dynamic_image_size`` are supplied either when the model context is
+    built (``kwargs_on_init=True``) or when the processor is applied
+    (``kwargs_on_init=False``). The first image asset, rescaled by each of
+    ``size_factors``, is then passed through ``_run_check``.
+    """
+    mm_processor_kwargs = {
+        "min_dynamic_patch": min_dynamic_patch,
+        "max_dynamic_patch": max_dynamic_patch,
+        "dynamic_image_size": dynamic_image_size,
+    }
+
+    ctx = build_model_context(
+        model_id,
+        mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
+        limit_mm_per_prompt={"image": len(size_factors)},
+    )
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    # When the kwargs were already given at init, nothing is passed at
+    # apply time, and vice versa.
+    hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
+
+    # With dynamic sizing off, the expectation is computed with min/max
+    # both fixed to 1.
+    min_num = min_dynamic_patch if dynamic_image_size else 1
+    max_num = max_dynamic_patch if dynamic_image_size else 1
+
+    _run_check(
+        processor,
+        [
+            rescale_image_size(image_assets[0].pil_image, f)
+            for f in size_factors
+        ],
+        min_num,
+        max_num,
+        hf_processor_mm_kwargs,
+    )
--- a/tests/models/multimodal/processing/test_transformers.py
+++ b/tests/models/multimodal/processing/test_transformers.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+
+from vllm.assets.image import ImageAsset
+from vllm.config import ModelConfig
+from vllm.multimodal import MULTIMODAL_REGISTRY
+
+
+# yapf: disable
+@pytest.mark.parametrize("model_id",
+                         ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
+def test_multimodal_processor(model_id):
+    """Check string and token-id prompts give the same processed prompt.
+
+    The processor is created for ``model_id`` with
+    ``model_impl="transformers"`` and applied twice to the same image: once
+    with a string prompt and once with a list of token ids. The ``"prompt"``
+    entries of the two results must be equal.
+    """
+    model_config = ModelConfig(
+        model=model_id,
+        model_impl="transformers",
+    )
+
+    mm_processor = MULTIMODAL_REGISTRY.create_processor(model_config, )
+
+    image_pil = ImageAsset('cherry_blossom').pil_image
+    mm_data = {"image": image_pil}
+    str_prompt = "<|im_start|>user <image>\nWhat is the content of this image?<|im_end|><|im_start|>assistant\n" # noqa: E501
+    str_processed_inputs = mm_processor.apply(
+        prompt=str_prompt,
+        mm_data=mm_data,
+        hf_processor_mm_kwargs={},
+    )
+
+    # NOTE(review): presumably the tokenization of str_prompt above; confirm
+    # against the model's tokenizer.
+    ids_prompt = [
+        151644, 872, 220, 151646, 198, 3838, 374, 279, 2213, 315, 419, 2168,
+        30, 151645, 151644, 77091, 198
+    ]
+    ids_processed_inputs = mm_processor.apply(
+        prompt=ids_prompt,
+        mm_data=mm_data,
+        hf_processor_mm_kwargs={},
+    )
+
+    assert str_processed_inputs["prompt"] == ids_processed_inputs["prompt"]
--- a/tests/quantization/test_bitsandbytes.py
+++ b/tests/quantization/test_bitsandbytes.py
@@ -17,8 +17,8 @@ from ..utils import models_path_prefix
 from vllm.platforms import current_platform


-from ..models.utils import check_embeddings_close
-from ..utils import compare_two_settings, create_new_process_for_each_test
+from ...utils import compare_two_settings, multi_gpu_test
+from ..utils import check_embeddings_close, check_logprobs_close

 models_4bit_to_test = [
    (os.path.join(models_path_prefix, "facebook/opt-125m"), "quantize opt model inflight"),
@@ -30,6 +30,10 @@ models_4bit_to_embedding_test = [
    ("intfloat/e5-mistral-7b-instruct", "quantize embedding model inflight"),
 ]

+models_4bit_to_moe_test = [
+    ("allenai/OLMoE-1B-7B-0125-Instruct", "quantize moe model inflight"),
+]
+
 models_pre_qaunt_4bit_to_test = [
    (os.path.join(models_path_prefix, 'PrunaAI/Einstein-v6.1-Llama3-8B-bnb-4bit-smashed'),
     'read pre-quantized 4-bit FP4 model'),
@@ -46,7 +50,6 @@ models_pre_quant_8bit_to_test = [
 @pytest.mark.skipif(not is_quant_method_supported("bitsandbytes") or current_platform.is_rocm(),
                    reason='bitsandbytes is not supported on this GPU type.')
 @pytest.mark.parametrize("model_name, description", models_4bit_to_test)
-@create_new_process_for_each_test()
 def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                             model_name, description) -> None:

@@ -60,7 +63,6 @@ def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                    reason='bitsandbytes is not supported on this GPU type.')
 @pytest.mark.parametrize("model_name, description",
                         models_pre_qaunt_4bit_to_test)
-@create_new_process_for_each_test()
 def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                                       model_name, description) -> None:

@@ -72,7 +74,6 @@ def test_load_pre_quant_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                    reason='bitsandbytes is not supported on this GPU type.')
 @pytest.mark.parametrize("model_name, description",
                         models_pre_quant_8bit_to_test)
-@create_new_process_for_each_test()
 def test_load_8bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                             model_name, description) -> None:

@@ -80,12 +81,11 @@ def test_load_8bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                             model_name, True)


-@pytest.mark.skipif(torch.cuda.device_count() < 2,
-                    reason='Test requires at least 2 GPUs.')
+
 @pytest.mark.skipif(not is_quant_method_supported("bitsandbytes") or current_platform.is_rocm(),
                    reason='bitsandbytes is not supported on this GPU type.')
 @pytest.mark.parametrize("model_name, description", models_4bit_to_test)
-@create_new_process_for_each_test()
+@multi_gpu_test(num_gpus=2)
 def test_load_tp_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                                model_name, description) -> None:

@@ -100,12 +100,10 @@ def test_load_tp_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                             vllm_tp_size=2)


-@pytest.mark.skipif(torch.cuda.device_count() < 2,
-                    reason='Test requires at least 2 GPUs.')
 @pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
                    reason='bitsandbytes is not supported on this GPU type.')
 @pytest.mark.parametrize("model_name, description", models_4bit_to_test)
-@create_new_process_for_each_test()
+@multi_gpu_test(num_gpus=2)
 def test_load_pp_4bit_bnb_model(model_name, description) -> None:
    common_args = [
        "--disable-log-stats",
@@ -126,12 +124,40 @@ def test_load_pp_4bit_bnb_model(model_name, description) -> None:
    compare_two_settings(model_name, common_args, pp_args)


+@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
+                    reason='bitsandbytes is not supported on this GPU type.')
+@pytest.mark.parametrize("model_name, description", models_4bit_to_moe_test)
+def test_4bit_bnb_moe_model(hf_runner, vllm_runner, example_prompts,
+                            model_name, description) -> None:
+
+    hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(  # HF reference config: 4-bit NF4 with double quantization
+        load_in_4bit=True,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_use_double_quant=True,
+    ))
+    with vllm_runner(model_name,  # vLLM run (done first) with bitsandbytes quantization
+                     quantization='bitsandbytes',
+                     enforce_eager=False) as llm:
+        vllm_outputs = llm.generate_greedy_logprobs(example_prompts,
+                                                    max_tokens=32,
+                                                    num_logprobs=5)
+
+    with hf_runner(model_name, model_kwargs=hf_model_kwargs) as llm:  # HF run; the name `llm` is reused for the HF runner
+        transformers_outputs = llm.generate_greedy_logprobs_limit(
+            example_prompts, max_tokens=32, num_logprobs=5)
+    check_logprobs_close(  # compare the HF (outputs_0) and vLLM (outputs_1) greedy logprobs
+        outputs_0_lst=transformers_outputs,
+        outputs_1_lst=vllm_outputs,
+        name_0="transformers",
+        name_1="vllm",
+    )
+
+
 @pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
                    reason='bitsandbytes is not supported on this GPU type.')
 @pytest.mark.parametrize("model_name, description",
                         models_4bit_to_embedding_test)
 @pytest.mark.parametrize("dtype", ["half"])
-@create_new_process_for_each_test()
 def test_4bit_bnb_embedding_model(
    model_name,
    description,
@@ -150,6 +176,13 @@ def test_4bit_bnb_embedding_model(
    example_prompts = [str(s).strip() for s in example_prompts]

    # Inflight 4bit quantization
+    with vllm_runner(model_name,
+                     task="embed",
+                     dtype=dtype,
+                     gpu_memory_utilization=0.5,
+                     quantization="bitsandbytes") as vllm_model:
+        vllm_outputs = vllm_model.embed(example_prompts)
+
    hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(
        load_in_4bit=True))
    with hf_runner(
@@ -160,12 +193,6 @@ def test_4bit_bnb_embedding_model(
    ) as hf_model:
        hf_outputs = hf_model.encode(example_prompts)

-    with vllm_runner(model_name,
-                     task="embed",
-                     dtype=dtype,
-                     gpu_memory_utilization=0.5,
-                     quantization="bitsandbytes") as vllm_model:
-        vllm_outputs = vllm_model.embed(example_prompts)
    check_embeddings_close(
        embeddings_0_lst=hf_outputs,
        embeddings_1_lst=vllm_outputs,
@@ -193,7 +220,8 @@ def validate_generated_texts(hf_runner,
                             model_name,
                             pre_quant=False,
                             hf_model_kwargs=None,
-                             vllm_tp_size=1):
+                             vllm_tp_size=1,
+                             max_tokens=8):

    # NOTE: run vLLM first, as it requires a clean process
    # when using distributed inference
@@ -201,7 +229,8 @@ def validate_generated_texts(hf_runner,
                     quantization=None if pre_quant else 'bitsandbytes',
                     tensor_parallel_size=vllm_tp_size,
                     enforce_eager=False) as llm:
-        vllm_outputs = llm.generate_greedy(prompts, 8)
+
+        vllm_outputs = llm.generate_greedy(prompts, max_tokens)
        vllm_logs = log_generated_texts(prompts, vllm_outputs, "VllmRunner")

    # Clean up the GPU memory for the next test
@@ -213,19 +242,17 @@ def validate_generated_texts(hf_runner,

    # Run with HF runner
    with hf_runner(model_name, model_kwargs=hf_model_kwargs) as llm:
-        hf_outputs = llm.generate_greedy(prompts, 8)
+        hf_outputs = llm.generate_greedy(prompts, max_tokens)
        hf_logs = log_generated_texts(prompts, hf_outputs, "HfRunner")

    # Clean up the GPU memory for the next test
    gc.collect()
    torch.cuda.empty_cache()
-
    # Compare the generated strings
    for hf_log, vllm_log in zip(hf_logs, vllm_logs):
        hf_str = hf_log["generated_text"]
        vllm_str = vllm_log["generated_text"]
        prompt = hf_log["prompt"]
-
        assert hf_str == vllm_str, (f"Model: {model_name}"
                                    f"Mismatch between HF and vLLM outputs:\n"
                                    f"Prompt: {prompt}\n"

--- a/tests/models/quantization/test_modelopt.py
+++ b/tests/models/quantization/test_modelopt.py
@@ -46,7 +46,7 @@ EXPECTED_STRS_MAP = {
                    reason="fp8 is not supported on this GPU type.")
 @pytest.mark.parametrize("model_name", MODELS)
 def test_models(example_prompts, model_name) -> None:
-    model = LLM(
+    llm = LLM(
        model=model_name,
        max_model_len=MAX_MODEL_LEN,
        trust_remote_code=True,
@@ -69,9 +69,9 @@ def test_models(example_prompts, model_name) -> None:
    # Note: these need to be run 1 at a time due to numerical precision,
    # since the expected strs were generated this way.
    for prompt in formatted_prompts:
-        outputs = model.generate(prompt, params)
+        outputs = llm.generate(prompt, params)
        generations.append(outputs[0].outputs[0].text)
-    del model
+    del llm

    print(model_name, generations)
    expected_strs = EXPECTED_STRS_MAP[model_name]

--- a/tests/models/quantization/test_nvfp4.py
+++ b/tests/models/quantization/test_nvfp4.py
@@ -46,7 +46,7 @@ EXPECTED_STRS_MAP = {
                    reason="modelopt_fp4 is not supported on this GPU type.")
 @pytest.mark.parametrize("model_name", MODELS)
 def test_models(example_prompts, model_name) -> None:
-    model = LLM(
+    llm = LLM(
        model=model_name,
        max_model_len=MAX_MODEL_LEN,
        trust_remote_code=True,
@@ -69,9 +69,9 @@ def test_models(example_prompts, model_name) -> None:
    # Note: these need to be run 1 at a time due to numerical precision,
    # since the expected strs were generated this way.
    for prompt in formatted_prompts:
-        outputs = model.generate(prompt, params)
+        outputs = llm.generate(prompt, params)
        generations.append(outputs[0].outputs[0].text)
-    del model
+    del llm

    print(model_name, generations)
    expected_strs = EXPECTED_STRS_MAP[model_name]

--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -139,16 +139,20 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
                                   trust_remote_code=True),
    "AquilaForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "BAAI/AquilaChat2-7B"),
                                         trust_remote_code=True),
+    "ArceeForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "arcee-ai/AFM-4.5B-Base"),
+                                        is_available_online=False),
    "ArcticForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "Snowflake/snowflake-arctic-instruct"),
                                         trust_remote_code=True),
    "BaiChuanForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "baichuan-inc/Baichuan-7B"),
                                         trust_remote_code=True),
    "BaichuanForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "baichuan-inc/Baichuan2-7B-chat"),
                                         trust_remote_code=True),
-    "BambaForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"ibm-ai-platform/Bamba-9B"),
-                                        extras={"tiny": os.path.join(models_path_prefix,"hmellor/tiny-random-BambaForCausalLM")}),  # noqa: E501
-    "BloomForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"bigscience/bloom-560m"),
-                                        {"1b": os.path.join(models_path_prefix,"bigscience/bloomz-1b1")}),
+    "BailingMoeForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "inclusionAI/Ling-lite-1.5"),
+                                         trust_remote_code=True),
+    "BambaForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "ibm-ai-platform/Bamba-9B"),
+                                        extras={"tiny": os.path.join(models_path_prefix, "hmellor/tiny-random-BambaForCausalLM")}),  # noqa: E501
+    "BloomForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "bigscience/bloom-560m"),
+                                        {"1b": os.path.join(models_path_prefix, "bigscience/bloomz-1b1")}),
    "ChatGLMModel": _HfExamplesInfo(os.path.join(models_path_prefix, "THUDM/chatglm3-6b"),
                                    trust_remote_code=True,
                                    max_transformers_version="4.48"),
@@ -166,14 +170,15 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
                                         trust_remote_code=True),
    "DeepseekV3ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "deepseek-ai/DeepSeek-V3"),  # noqa: E501
                                         trust_remote_code=True),
-    "Ernie4_5_ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"baidu/ERNIE-4.5-0.3B-PT"),
-                                        trust_remote_code=True),
-    "Ernie4_5_MoeForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"baidu/ERNIE-4.5-21B-A3B-PT"),
-                                        trust_remote_code=True),
-    "ExaoneForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct")),  # noqa: E501
-    "Fairseq2LlamaForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"mgleize/fairseq2-dummy-Llama-3.2-1B")),  # noqa: E501
-    "FalconForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"tiiuae/falcon-7b")),
-    "FalconH1ForCausalLM":_HfExamplesInfo(os.path.join(models_path_prefix,"tiiuae/Falcon-H1-0.5B-Base"),
+    "Ernie4_5_ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "baidu/ERNIE-4.5-0.3B-PT"),
+                                            min_transformers_version="4.54"),
+    "Ernie4_5_MoeForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "baidu/ERNIE-4.5-21B-A3B-PT"),
+                                               min_transformers_version="4.54"),
+    "ExaoneForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct")),  # noqa: E501
+    "Exaone4ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "LGAI-EXAONE/EXAONE-4.0-32B")),  # noqa: E501
+    "Fairseq2LlamaForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "mgleize/fairseq2-dummy-Llama-3.2-1B")),  # noqa: E501
+    "FalconForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "tiiuae/falcon-7b")),
+    "FalconH1ForCausalLM":_HfExamplesInfo(os.path.join(models_path_prefix, "tiiuae/Falcon-H1-0.5B-Base"),
                                          min_transformers_version="4.53"),
    "GemmaForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"google/gemma-1.1-2b-it")),
    "Gemma2ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"google/gemma-2-9b")),
@@ -198,7 +203,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
                                             trust_remote_code=True),
    "HunYuanMoEV1ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"tencent/Hunyuan-A13B-Instruct"),
                                               trust_remote_code=True),
-    "InternLMForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"internlm/internlm-chat-7b"),
+    "HunYuanDenseV1ForCausalLM":_HfExamplesInfo(os.path.join(models_path_prefix, "tencent/Hunyuan-7B-Instruct-0124"),
+                                               trust_remote_code=True),
+    "InternLMForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "internlm/internlm-chat-7b"),
                                           trust_remote_code=True),
    "InternLM2ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "internlm/internlm2-chat-7b"),
                                            trust_remote_code=True),
@@ -222,6 +229,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
                                         trust_remote_code=True),
    "MiniCPM3ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "openbmb/MiniCPM3-4B"),
                                         trust_remote_code=True),
+    "MiniMaxForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "MiniMaxAI/MiniMax-Text-01-hf"),
+                                          min_transformers_version="4.53"),
    "MiniMaxText01ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "MiniMaxAI/MiniMax-Text-01"),
                                                trust_remote_code=True,
                                                revision="a59aa9cbc53b9fb8742ca4e9e1531b9802b6fdc3"),  # noqa: E501
@@ -243,14 +252,14 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
                                      {"1b": os.path.join(models_path_prefix, "facebook/opt-iml-max-1.3b")}),
    "OrionForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "OrionStarAI/Orion-14B-Chat"),
                                        trust_remote_code=True),
-    "PersimmonForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"adept/persimmon-8b-chat")),
-    "PhiForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"microsoft/phi-2")),
-    "Phi3ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"microsoft/Phi-3-mini-4k-instruct")),
-    # Blocksparse attention not supported in V1 yet
-    "Phi3SmallForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"microsoft/Phi-3-small-8k-instruct"),
-                                            trust_remote_code=True,
-                                            v0_only=True),
-    "PhiMoEForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"microsoft/Phi-3.5-MoE-instruct"),
+    "PersimmonForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "adept/persimmon-8b-chat")),
+    "PhiForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "microsoft/phi-2")),
+    "Phi3ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "microsoft/Phi-3-mini-4k-instruct")),
+    "Phi4FlashForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "microsoft/Phi-4-mini-flash-reasoning"), # noqa: E501
+                                        trust_remote_code=True,
+                                        v0_only=True,
+                                        max_model_len=10240),
+    "PhiMoEForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "microsoft/Phi-3.5-MoE-instruct"),
                                         trust_remote_code=True),
    "Plamo2ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "pfnet/plamo-2-1b"),
                                        trust_remote_code=True),
@@ -258,16 +267,15 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
                                       trust_remote_code=True),
    "Qwen2ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"Qwen/Qwen2-0.5B-Instruct"),
                                        extras={"2.5": "Qwen/Qwen2.5-0.5B-Instruct"}), # noqa: E501
-    "Qwen2MoeForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"Qwen/Qwen1.5-MoE-A2.7B-Chat")),
-    "Qwen3ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"Qwen/Qwen3-8B")),
-    "Qwen3MoeForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"Qwen/Qwen3-30B-A3B")),
-    "Qwen3ForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix,"tomaarsen/Qwen3-Reranker-0.6B-seq-cls")),  # noqa: E501
-    "RWForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"tiiuae/falcon-40b")),
-    "StableLMEpochForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"stabilityai/stablelm-zephyr-3b")),  # noqa: E501
-    "StableLmForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"stabilityai/stablelm-3b-4e1t")),
-    "Starcoder2ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"bigcode/starcoder2-3b")),
-    "SolarForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"upstage/solar-pro-preview-instruct")),
-    "TeleChat2ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"Tele-AI/TeleChat2-3B"),
+    "Qwen2MoeForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "Qwen/Qwen1.5-MoE-A2.7B-Chat")),
+    "Qwen3ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "Qwen/Qwen3-8B")),
+    "Qwen3MoeForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "Qwen/Qwen3-30B-A3B")),
+    "RWForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "tiiuae/falcon-40b")),
+    "StableLMEpochForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "stabilityai/stablelm-zephyr-3b")),  # noqa: E501
+    "StableLmForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "stabilityai/stablelm-3b-4e1t")),
+    "Starcoder2ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "bigcode/starcoder2-3b")),
+    "SolarForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "upstage/solar-pro-preview-instruct")),
+    "TeleChat2ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "Tele-AI/TeleChat2-3B"),
                                            trust_remote_code=True),
    "TeleFLMForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "CofeAI/FLM-2-52B-Instruct-2407"),
                                            trust_remote_code=True),
@@ -290,28 +298,27 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {

 _EMBEDDING_EXAMPLE_MODELS = {
    # [Text-only]
-    "BertModel": _HfExamplesInfo(os.path.join(models_path_prefix,"BAAI/bge-base-en-v1.5"), v0_only=True),
-    "Gemma2Model": _HfExamplesInfo(os.path.join(models_path_prefix,"BAAI/bge-multilingual-gemma2"), v0_only=True),  # noqa: E501
-    "GPT2ForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix,"nie3e/sentiment-polish-gpt2-small")),  # noqa: E501
-    "GritLM": _HfExamplesInfo(os.path.join(models_path_prefix,"parasail-ai/GritLM-7B-vllm")),
-    "GteModel": _HfExamplesInfo(os.path.join(models_path_prefix,"Snowflake/snowflake-arctic-embed-m-v2.0"),
+    "BertModel": _HfExamplesInfo(os.path.join(models_path_prefix, "BAAI/bge-base-en-v1.5"), v0_only=True),
+    "Gemma2Model": _HfExamplesInfo(os.path.join(models_path_prefix, "BAAI/bge-multilingual-gemma2"), v0_only=True),  # noqa: E501
+    "GritLM": _HfExamplesInfo(os.path.join(models_path_prefix, "parasail-ai/GritLM-7B-vllm")),
+    "GteModel": _HfExamplesInfo(os.path.join(models_path_prefix, "Snowflake/snowflake-arctic-embed-m-v2.0"),
                                               trust_remote_code=True),
    "GteNewModel": _HfExamplesInfo(os.path.join(models_path_prefix, "Alibaba-NLP/gte-base-en-v1.5"),
                                   trust_remote_code=True,
                                   hf_overrides={"architectures": ["GteNewModel"]}),  # noqa: E501
    "InternLM2ForRewardModel": _HfExamplesInfo(os.path.join(models_path_prefix, "internlm/internlm2-1_8b-reward"),
                                               trust_remote_code=True),
-    "JambaForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix,"ai21labs/Jamba-tiny-reward-dev")),  # noqa: E501
-    "LlamaModel": _HfExamplesInfo(os.path.join(models_path_prefix,"llama"), is_available_online=False),
-    "MistralModel": _HfExamplesInfo(os.path.join(models_path_prefix,"intfloat/e5-mistral-7b-instruct")),
-    "ModernBertModel": _HfExamplesInfo(os.path.join(models_path_prefix,"Alibaba-NLP/gte-modernbert-base"),
+    "JambaForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix, "ai21labs/Jamba-tiny-reward-dev")),  # noqa: E501
+    "LlamaModel": _HfExamplesInfo(os.path.join(models_path_prefix, "llama"), is_available_online=False),
+    "MistralModel": _HfExamplesInfo(os.path.join(models_path_prefix, "intfloat/e5-mistral-7b-instruct")),
+    "ModernBertModel": _HfExamplesInfo(os.path.join(models_path_prefix, "Alibaba-NLP/gte-modernbert-base"),
                                trust_remote_code=True, v0_only=True),
-    "NomicBertModel": _HfExamplesInfo(os.path.join(models_path_prefix,"nomic-ai/nomic-embed-text-v2-moe"),
+    "NomicBertModel": _HfExamplesInfo(os.path.join(models_path_prefix, "nomic-ai/nomic-embed-text-v2-moe"),
                                               trust_remote_code=True, v0_only=True),  # noqa: E501
+
    "Qwen2Model": _HfExamplesInfo(os.path.join(models_path_prefix,"ssmits/Qwen2-7B-Instruct-embed-base")),
    "Qwen2ForRewardModel": _HfExamplesInfo(os.path.join(models_path_prefix,"Qwen/Qwen2.5-Math-RM-72B")),
    "Qwen2ForProcessRewardModel": _HfExamplesInfo(os.path.join(models_path_prefix,"Qwen/Qwen2.5-Math-PRM-7B")),
-    "Qwen2ForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix,"jason9693/Qwen2.5-1.5B-apeach")),  # noqa: E501
    "RobertaModel": _HfExamplesInfo(os.path.join(models_path_prefix,"sentence-transformers/stsb-roberta-base-v2"), v0_only=True),  # noqa: E501
    "RobertaForMaskedLM": _HfExamplesInfo(os.path.join(models_path_prefix,"sentence-transformers/all-roberta-large-v1"), v0_only=True),  # noqa: E501
    "XLMRobertaModel": _HfExamplesInfo(os.path.join(models_path_prefix,"intfloat/multilingual-e5-small"), v0_only=True),  # noqa: E501
@@ -324,12 +331,27 @@ _EMBEDDING_EXAMPLE_MODELS = {
                                            is_available_online=False),  # noqa: E501
 }

-_CROSS_ENCODER_EXAMPLE_MODELS = {
-    # [Text-only]
+_SEQUENCE_CLASSIFICATION_EXAMPLE_MODELS = {
+    # [Decoder-only]
+    "GPT2ForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix, "nie3e/sentiment-polish-gpt2-small")),  # noqa: E501
+
+    # [Cross-encoder]
    "BertForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix, "cross-encoder/ms-marco-MiniLM-L-6-v2"), v0_only=True),  # noqa: E501
+    "ModernBertForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix, "Alibaba-NLP/gte-reranker-modernbert-base"), v0_only=True), # noqa: E501
    "RobertaForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix, "cross-encoder/quora-roberta-base"), v0_only=True),  # noqa: E501
    "XLMRobertaForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix, "BAAI/bge-reranker-v2-m3"), v0_only=True),  # noqa: E501
-    "ModernBertForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix, "Alibaba-NLP/gte-reranker-modernbert-base"), v0_only=True),  # noqa: E501
+}
+
+_AUTOMATIC_CONVERTED_MODELS = {
+    # Use as_seq_cls_model for automatic conversion
+    "GemmaForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix, "BAAI/bge-reranker-v2-gemma"),  # noqa: E501
+                                                      v0_only=True,
+                                                      hf_overrides={"architectures": ["GemmaForSequenceClassification"], # noqa: E501
+                                                                    "classifier_from_token": ["Yes"],  # noqa: E501
+                                                                    "method": "no_post_processing"}),  # noqa: E501
+    "LlamaForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix, "Skywork/Skywork-Reward-V2-Llama-3.2-1B")),  # noqa: E501
+    "Qwen2ForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix, "jason9693/Qwen2.5-1.5B-apeach")),  # noqa: E501
+    "Qwen3ForSequenceClassification": _HfExamplesInfo(os.path.join(models_path_prefix, "tomaarsen/Qwen3-Reranker-0.6B-seq-cls")),  # noqa: E501
 }

 _MULTIMODAL_EXAMPLE_MODELS = {
@@ -350,12 +372,12 @@ _MULTIMODAL_EXAMPLE_MODELS = {
    "GLM4VForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"THUDM/glm-4v-9b"),
                                        trust_remote_code=True,
                                        hf_overrides={"architectures": ["GLM4VForCausalLM"]}),  # noqa: E501
-    "Glm4vForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix,"THUDM/GLM-4.1V-9B-Thinking"), min_transformers_version="4.53"),  # noqa: E501
-    "Glm4MoeForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"THUDM/GLM-4.5"),
+    "Glm4vForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "THUDM/GLM-4.1V-9B-Thinking"), min_transformers_version="4.53"),  # noqa: E501
+    "Glm4MoeForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "THUDM/GLM-4.5"),
                                          min_transformers_version="4.54",
                                          is_available_online=False),   # noqa: E501
-    "H2OVLChatModel": _HfExamplesInfo(os.path.join(models_path_prefix,"h2oai/h2ovl-mississippi-800m"),
-                                      extras={"2b": os.path.join(models_path_prefix,"h2oai/h2ovl-mississippi-2b")},  # noqa: E501
+    "H2OVLChatModel": _HfExamplesInfo(os.path.join(models_path_prefix, "h2oai/h2ovl-mississippi-800m"),
+                                      extras={"2b": os.path.join(models_path_prefix, "h2oai/h2ovl-mississippi-2b")},  # noqa: E501
                                      max_transformers_version="4.48",  # noqa: E501
                                      transformers_version_reason="HF model is not compatible."),  # noqa: E501
    "InternVLChatModel": _HfExamplesInfo(os.path.join(models_path_prefix, "OpenGVLab/InternVL2-1B"),
@@ -364,12 +386,12 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                                         trust_remote_code=True),
    "Idefics3ForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "HuggingFaceM4/Idefics3-8B-Llama3"),  # noqa: E501
                                                        {"tiny": os.path.join(models_path_prefix, "HuggingFaceTB/SmolVLM-256M-Instruct")}),  # noqa: E501
-    "KeyeForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "Kwai-Keye/Keye-VL-8B-Preview", # noqa: E501
+    "KeyeForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "Kwai-Keye/Keye-VL-8B-Preview"), # noqa: E501
                                                    trust_remote_code=True),
    "KimiVLForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "moonshotai/Kimi-VL-A3B-Instruct"),  # noqa: E501
                                                      extras={"thinking": os.path.join(models_path_prefix, "moonshotai/Kimi-VL-A3B-Thinking")},  # noqa: E501
                                                      trust_remote_code=True),
-    "Llama4ForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "meta-llama/Llama-4-Scout-17B-16E-Instruct",   # noqa: E501
+    "Llama4ForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "meta-llama/Llama-4-Scout-17B-16E-Instruct"),   # noqa: E501
                                                      max_model_len=10240),
    "LlavaForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "llava-hf/llava-1.5-7b-hf"),
                                                     extras={"mistral": os.path.join(models_path_prefix, "mistral-community/pixtral-12b"), # noqa: E501
@@ -398,9 +420,12 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                                        trust_remote_code=True),
    "NVLM_D": _HfExamplesInfo(os.path.join(models_path_prefix, "nvidia/NVLM-D-72B"),
                              trust_remote_code=True),
+
+    "Llama_Nemotron_Nano_VL" : _HfExamplesInfo(os.path.join(models_path_prefix, "nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"), # noqa: E501
+                                                     trust_remote_code=True),
    "PaliGemmaForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "google/paligemma-3b-mix-224"),  # noqa: E501
                                                         extras={"v2": os.path.join(models_path_prefix, "google/paligemma2-3b-ft-docci-448")}),  # noqa: E501
-    "Phi3VForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "microsoft/Phi-3-vision-128k-instruct"),
+    "Phi3VForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "microsoft/Phi-3-vision-128k-instruct"),
                                        trust_remote_code=True,
                                        max_transformers_version="4.48",
                                        transformers_version_reason="Use of deprecated imports which have been removed.",  # noqa: E501
@@ -418,7 +443,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                                                      hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]}),  # noqa: E501
    "Qwen2AudioForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "Qwen/Qwen2-Audio-7B-Instruct")),  # noqa: E501
    "Qwen2VLForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "Qwen/Qwen2-VL-2B-Instruct")),  # noqa: E501
-    "Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "Qwen/Qwen2.5-VL-3B-Instruct")),  # noqa: E501
+    "Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "Qwen/Qwen2.5-VL-3B-Instruct"), # noqa: E501
+                                                          max_model_len=4096),
    "Qwen2_5OmniModel": _HfExamplesInfo(os.path.join(models_path_prefix, "Qwen/Qwen2.5-Omni-3B")),
    "Qwen2_5OmniForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "Qwen/Qwen2.5-Omni-7B-AWQ")),  # noqa: E501
    "SkyworkR1VChatModel": _HfExamplesInfo(os.path.join(models_path_prefix, "Skywork/Skywork-R1V-38B")),
@@ -429,6 +455,12 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                                                        hf_overrides={"architectures": ["TarsierForConditionalGeneration"]}),  # noqa: E501
    "Tarsier2ForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier2-Recap-7b",  # noqa: E501
                                                        hf_overrides={"architectures": ["Tarsier2ForConditionalGeneration"]}),  # noqa: E501
+    "VoxtralForConditionalGeneration": _HfExamplesInfo(
+        "mistralai/Voxtral-Mini-3B-2507",
+        min_transformers_version="4.54",
+        # disable this temporarily until we support HF format
+        is_available_online=False,
+    ),
    # [Encoder-decoder]
    # Florence-2 uses BartFastTokenizer which can't be loaded from AutoTokenizer
    # Therefore, we borrow the BartTokenizer from the original Bart model
@@ -436,17 +468,19 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                                                         tokenizer=os.path.join(models_path_prefix,"Isotr0py/Florence-2-tokenizer"),  # noqa: E501
                                                         trust_remote_code=True),  # noqa: E501
    "MllamaForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "meta-llama/Llama-3.2-11B-Vision-Instruct")),  # noqa: E501
-    "Llama4ForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "meta-llama/Llama-4-Scout-17B-16E-Instruct")),  # noqa: E501
    "WhisperForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix, "openai/whisper-large-v3")),  # noqa: E501
+    # [Cross-encoder]
+    "JinaVLForRanking": _HfExamplesInfo(os.path.join(models_path_prefix, "jinaai/jina-reranker-m0")),   # noqa: E501
 }

+
 _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
-    "EAGLEModel": _HfExamplesInfo(os.path.join(models_path_prefix, "JackFram/llama-68m"),
-                                  speculative_model=os.path.join(models_path_prefix, "abhigoyal/vllm-eagle-llama-68m-random")),  # noqa: E501
    "MedusaModel": _HfExamplesInfo(os.path.join(models_path_prefix, "JackFram/llama-68m"),
                                   speculative_model=os.path.join(models_path_prefix, "abhigoyal/vllm-medusa-llama-68m-random")),  # noqa: E501
-    "MLPSpeculatorPreTrainedModel": _HfExamplesInfo(os.path.join(models_path_prefix, "JackFram/llama-160m"),
-                                                    speculative_model=os.path.join(models_path_prefix, "ibm-ai-platform/llama-160m-accelerator")),  # noqa: E501
+    # Temporarily disabled.
+    # TODO(woosuk): Re-enable this once the MLP Speculator is supported in V1.
+    # "MLPSpeculatorPreTrainedModel": _HfExamplesInfo("JackFram/llama-160m",
+    #                                                 speculative_model="ibm-ai-platform/llama-160m-accelerator"),  # noqa: E501
    "DeepSeekMTPModel": _HfExamplesInfo(os.path.join(models_path_prefix, "luccafong/deepseek_mtp_main_random"),
                                        speculative_model=os.path.join(models_path_prefix, "luccafong/deepseek_mtp_draft_random"),  # noqa: E501
                                        trust_remote_code=True),
@@ -454,32 +488,39 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
                                             trust_remote_code=True,
                                             speculative_model=os.path.join(models_path_prefix, "yuhuili/EAGLE-LLaMA3-Instruct-8B"),
                                             tokenizer=os.path.join(models_path_prefix, "meta-llama/Meta-Llama-3-8B-Instruct")),  # noqa: E501
-    "Eagle3LlamaForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"),  # noqa: E501
+    "Eagle3LlamaForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"),  # noqa: E501
                                            trust_remote_code=True,
-                                            speculative_model=os.path.join(models_path_prefix,"yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"),
-                                            tokenizer=os.path.join(models_path_prefix,"meta-llama/Llama-3.1-8B-Instruct")),
-    "EagleMiniCPMForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"openbmb/MiniCPM-1B-sft-bf16"),
+
+                                            speculative_model=os.path.join(models_path_prefix, "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"),
+                                            tokenizer=os.path.join(models_path_prefix, "meta-llama/Llama-3.1-8B-Instruct")),
+    "EagleLlama4ForCausalLM": _HfExamplesInfo(
+        os.path.join(models_path_prefix, "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct"),
+        trust_remote_code=True,
+        speculative_model=os.path.join(models_path_prefix, "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct"),
+        tokenizer=os.path.join(models_path_prefix, "meta-llama/Llama-4-Scout-17B-16E-Instruct")),  # noqa: E501
+    "EagleMiniCPMForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "openbmb/MiniCPM-1B-sft-bf16"),
                                            trust_remote_code=True,
                                            is_available_online=False,
-                                            speculative_model=os.path.join(models_path_prefix,"openbmb/MiniCPM-2B-sft-bf16"),
-                                            tokenizer=os.path.join(models_path_prefix,"openbmb/MiniCPM-2B-sft-bf16")),
-    "Glm4MoeMTPModel": _HfExamplesInfo(os.path.join(models_path_prefix,"THUDM/GLM-4.5"),
-                                        speculative_model=os.path.join(models_path_prefix,"THUDM/GLM-4.5"),
+                                            speculative_model=os.path.join(models_path_prefix, "openbmb/MiniCPM-2B-sft-bf16"),
+                                            tokenizer=os.path.join(models_path_prefix, "openbmb/MiniCPM-2B-sft-bf16")),
+    "Glm4MoeMTPModel": _HfExamplesInfo(os.path.join(models_path_prefix, "THUDM/GLM-4.5"),
+                                        speculative_model=os.path.join(models_path_prefix, "THUDM/GLM-4.5"),
                                        min_transformers_version="4.54",
                                        is_available_online=False),
-    "MiMoMTPModel": _HfExamplesInfo(os.path.join(models_path_prefix,"XiaomiMiMo/MiMo-7B-RL"),
+    "MiMoMTPModel": _HfExamplesInfo(os.path.join(models_path_prefix, "XiaomiMiMo/MiMo-7B-RL"),
                                    trust_remote_code=True,
                                    speculative_model=os.path.join(models_path_prefix,"XiaomiMiMo/MiMo-7B-RL"))
 }

 _TRANSFORMERS_MODELS = {
-    "TransformersForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "ArthurZ/Ilama-3.2-1B"), trust_remote_code=True),  # noqa: E501
+    "TransformersForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "hmellor/Ilama-3.2-1B"), trust_remote_code=True),  # noqa: E501
+    "TransformersForMultimodalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "OpenGVLab/InternVL3-1B-hf")),
 }

 _EXAMPLE_MODELS = {
    **_TEXT_GENERATION_EXAMPLE_MODELS,
    **_EMBEDDING_EXAMPLE_MODELS,
-    **_CROSS_ENCODER_EXAMPLE_MODELS,
+    **_SEQUENCE_CLASSIFICATION_EXAMPLE_MODELS,
    **_MULTIMODAL_EXAMPLE_MODELS,
    **_SPECULATIVE_DECODING_EXAMPLE_MODELS,
    **_TRANSFORMERS_MODELS,
@@ -511,4 +552,5 @@ class HfExampleModels:
        raise ValueError(f"No example model defined for {model_id}")


-HF_EXAMPLE_MODELS = HfExampleModels(_EXAMPLE_MODELS)
\ No newline at end of file
+HF_EXAMPLE_MODELS = HfExampleModels(_EXAMPLE_MODELS)
+AUTO_EXAMPLE_MODELS = HfExampleModels(_AUTOMATIC_CONVERTED_MODELS)
--- a/tests/models/test_initialization.py
+++ b/tests/models/test_initialization.py
@@ -12,20 +12,36 @@ from vllm.utils import GiB_bytes
 from vllm.v1.core.kv_cache_utils import get_kv_cache_config
 from vllm.v1.engine.core import EngineCore as V1EngineCore

-from .registry import HF_EXAMPLE_MODELS
-
-
-@pytest.mark.parametrize("model_arch", HF_EXAMPLE_MODELS.get_supported_archs())
-def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
-    model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
+from ..utils import create_new_process_for_each_test
+from .registry import AUTO_EXAMPLE_MODELS, HF_EXAMPLE_MODELS, HfExampleModels
+
+
+@create_new_process_for_each_test()
+def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
+                   EXAMPLE_MODELS: HfExampleModels):
+    """The reason for using create_new_process_for_each_test is to avoid
+    the WARNING:
+        "We must use the 'spawn' multiprocessing start method. Overriding
+        VLLM_WORKER_MULTIPROC_METHOD to 'spawn'."
+    With the 'spawn' start method, the _initialize_kv_caches_v1 function
+    below no longer takes effect.
+    """
+
+    model_info = EXAMPLE_MODELS.get_hf_info(model_arch)
    model_info.check_available_online(on_fail="skip")
    model_info.check_transformers_version(on_fail="skip")

    # FIXME: Possible memory leak in the previous tests?
-    if model_arch in ("GraniteSpeechForConditionalGeneration",
+    if model_arch in ("Glm4vForConditionalGeneration",
+                      "GraniteSpeechForConditionalGeneration",
                      "KimiVLForConditionalGeneration"):
        pytest.skip("Avoid OOM")

+    if model_arch in ("Llama4ForCausalLM", "EagleLlama4ForCausalLM"):
+        from vllm.model_executor.models.llama4 import Llama4ForCausalLM
+        from vllm.model_executor.models.registry import ModelRegistry
+        ModelRegistry.register_model("Llama4ForCausalLM", Llama4ForCausalLM)
+
    # Avoid OOM and reduce initialization time by only using 1 layer
    def hf_overrides(hf_config: PretrainedConfig) -> PretrainedConfig:
        hf_config.update(model_info.hf_overrides)
@@ -33,13 +49,18 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
        text_config = hf_config.get_text_config()

        # Ensure at least 2 expert per group
-        # Since `grouped_topk` assums top-2
+        # Since `grouped_topk` assumes top-2
        n_group = getattr(text_config, 'n_group', None)
        num_experts = n_group * 2 if n_group is not None else 2

+        # Use three layers for Gemma-3n so that both a normal layer
+        # and a kv_shared_layer are checked.
+        num_hidden_layers = (3 if model_arch
+                             == "Gemma3nForConditionalGeneration" else 1)
+
        text_config.update({
            "num_layers": 1,
-            "num_hidden_layers": 1,
+            "num_hidden_layers": num_hidden_layers,
            "num_experts": num_experts,
            "num_experts_per_tok": 2,
            "num_local_experts": num_experts,
@@ -47,6 +68,8 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
            "first_k_dense_replace": 0,
            # To avoid OOM on DeepSeek-V3
            "n_routed_experts": num_experts,
+            # For Gemma-3n
+            "num_kv_shared_layers": 1,
        })

        if hasattr(hf_config, "vision_config"):
@@ -86,6 +109,9 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
                       _initialize_kv_caches_v1), monkeypatch.context() as m):
        if model_info.v0_only:
            m.setenv("VLLM_USE_V1", "0")
+        if model_arch == "Phi4FlashForCausalLM":
+            # Phi4FlashForCausalLM only supports DIFFERENTIAL_FLASH_ATTN backend
+            m.setenv("VLLM_ATTENTION_BACKEND", "DIFFERENTIAL_FLASH_ATTN")
        LLM(
            model_info.default,
            tokenizer=model_info.tokenizer,
@@ -102,3 +128,15 @@ def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
            load_format="dummy",
            hf_overrides=hf_overrides,
        )
+
+
+@pytest.mark.parametrize("model_arch", HF_EXAMPLE_MODELS.get_supported_archs())
+def test_can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch):
+    can_initialize(model_arch, monkeypatch, HF_EXAMPLE_MODELS)
+
+
+@pytest.mark.parametrize("model_arch",
+                         AUTO_EXAMPLE_MODELS.get_supported_archs())
+def test_implicit_converted_models(model_arch: str,
+                                   monkeypatch: pytest.MonkeyPatch):
+    can_initialize(model_arch, monkeypatch, AUTO_EXAMPLE_MODELS)
--- a/tests/models/test_registry.py
+++ b/tests/models/test_registry.py
@@ -80,11 +80,15 @@ def test_registry_model_property(model_arch, is_mm, init_cuda, is_ce):


 @create_new_process_for_each_test()
-@pytest.mark.parametrize("model_arch,is_pp,init_cuda", [
-    (os.path.join(models_path_prefix, "MLPSpeculatorPreTrainedModel"), False, False),
-    (os.path.join(models_path_prefix, "DeepseekV2ForCausalLM"), True, False),
-    (os.path.join(models_path_prefix, "Qwen2VLForConditionalGeneration"), True, True),
-])
+@pytest.mark.parametrize(
+    "model_arch,is_pp,init_cuda",
+    [
+        # TODO(woosuk): Re-enable this once the MLP Speculator is supported
+        # in V1.
+        # ("MLPSpeculatorPreTrainedModel", False, False),
+        (os.path.join(models_path_prefix, "DeepseekV2ForCausalLM"), True, False),
+        (os.path.join(models_path_prefix, "Qwen2VLForConditionalGeneration"), True, True),
+    ])
 def test_registry_is_pp(model_arch, is_pp, init_cuda):
    assert ModelRegistry.is_pp_supported_model(model_arch) is is_pp


--- a/tests/models/test_transformers.py
+++ b/tests/models/test_transformers.py
@@ -57,8 +57,8 @@ def check_implementation(
 @pytest.mark.parametrize(
    "model,model_impl",
    [
-        (os.path.join(models_path_prefix,"meta-llama/Llama-3.2-1B-Instruct"), "transformers"),
-        (os.path.join(models_path_prefix,"ArthurZ/Ilama-3.2-1B", "auto")),  # CUSTOM CODE
+        (os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"), "transformers"),
+        (os.path.join(models_path_prefix, "hmellor/Ilama-3.2-1B"), "auto"),  # CUSTOM CODE
    ])  # trust_remote_code=True by default
 def test_models(
    hf_runner: type[HfRunner],
@@ -105,7 +105,7 @@ def test_distributed(
    reason="bitsandbytes quantization is currently not supported in rocm.")
 @pytest.mark.parametrize("model, quantization_kwargs", [
    (
-        os.path.join(models_path_prefix,"meta-llama/Llama-3.2-1B-Instruct"),
+        os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct"),
        {
            "quantization": "bitsandbytes",
        },
@@ -139,4 +139,39 @@ def test_quantization(
        outputs_1_lst=vllm_outputs,
        name_0="transformers",
        name_1="vllm",
-    )
\ No newline at end of file
+    )
+
+
+@pytest.mark.parametrize(
+    "model",
+    [os.path.join(models_path_prefix, "jason9693/Qwen2.5-1.5B-apeach")],
+)
+@pytest.mark.parametrize("dtype", ["float"])
+def test_classify(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    monkeypatch,
+) -> None:
+    import torch
+    from transformers import AutoModelForSequenceClassification
+
+    with vllm_runner(model,
+                     max_model_len=512,
+                     dtype=dtype,
+                     model_impl="transformers") as vllm_model:
+        vllm_outputs = vllm_model.classify(example_prompts)
+
+    with hf_runner(model,
+                   dtype=dtype,
+                   auto_cls=AutoModelForSequenceClassification) as hf_model:
+        hf_outputs = hf_model.classify(example_prompts)
+
+    for hf_output, vllm_output in zip(hf_outputs, vllm_outputs):
+        hf_output = torch.tensor(hf_output)
+        vllm_output = torch.tensor(vllm_output)
+
+        assert torch.allclose(hf_output, vllm_output,
+                              1e-3 if dtype == "float" else 1e-2)