Merge tag 'v0.10.2rc2' into v0.10.2rc2-ori

38d80967 · zhuwenwen · 33650733 · 880c741b · 38d80967 · 38d80967
Commit 38d80967 authored Sep 12, 2025 by zhuwenwen
20 changed files
--- a/tests/models/multimodal/generation/vlm_utils/builders.py
+++ b/tests/models/multimodal/generation/vlm_utils/builders.py
@@ -250,7 +250,7 @@ def build_video_inputs_from_test_info(

 def apply_image_size_scaling(image, size: Union[float, tuple[int, int]],
                             size_type: SizeType):
-    """Applies a size scaler to one image; this can be a an image size factor,
+    """Applies a size scaler to one image; this can be an image size factor,
    which scales the image while maintaining the aspect ratio"""
    # Special case for embeddings; if it's a tensor, it's only valid if we
    # are considering size factors at constant scale, i.e., we just clone

--- a/tests/models/multimodal/generation/vlm_utils/case_filtering.py
+++ b/tests/models/multimodal/generation/vlm_utils/case_filtering.py
@@ -42,7 +42,7 @@ def get_filtered_test_settings(
            else:
                assert test_info.prompt_formatter is not None

-            # Everything looks okay; keep if this is has correct proc handling
+            # Everything looks okay; keep if this has correct proc handling
            if (test_info.distributed_executor_backend
                    is not None) == new_proc_per_test:
                matching_tests[test_name] = test_info

--- a/tests/models/multimodal/generation/vlm_utils/core.py
+++ b/tests/models/multimodal/generation/vlm_utils/core.py
@@ -42,7 +42,7 @@ def run_test(
    tensor_parallel_size: int = 1,
    vllm_embeddings: Optional[torch.Tensor] = None,
 ):
-    """Modality agnostic test test executor for comparing HF/vLLM outputs."""
+    """Modality agnostic test executor for comparing HF/vLLM outputs."""
    # In the case of embeddings, vLLM takes separate input tensors
    vllm_inputs = vllm_embeddings if vllm_embeddings is not None else inputs

@@ -69,6 +69,9 @@ def run_test(
        vllm_runner_kwargs_["tokenizer_mode"] = model_info.tokenizer_mode
    if model_info.hf_overrides:
        vllm_runner_kwargs_["hf_overrides"] = model_info.hf_overrides
+    if model_info.skip_tokenizer_init:
+        vllm_runner_kwargs_[
+            "skip_tokenizer_init"] = model_info.skip_tokenizer_init

    if vllm_runner_kwargs:
        vllm_runner_kwargs_.update(vllm_runner_kwargs)

--- a/tests/models/multimodal/pooling/test_prithvi_mae.py
+++ b/tests/models/multimodal/pooling/test_prithvi_mae.py
@@ -46,7 +46,7 @@ def _run_test(
        vllm_model.encode(prompt)


-MODELS = ["christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM"]
+MODELS = ["mgazz/Prithvi-EO-2.0-300M-TL-Sen1Floods11"]


 @pytest.mark.core_model

--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -66,7 +66,9 @@ def _test_processing_correctness(
        hf_overrides=model_info.hf_overrides,
        # Ensure that the cache can fit all of the data
        mm_processor_cache_gb=2048,
-    )
+        skip_tokenizer_init=model_info.skip_tokenizer_init,
+        enforce_eager=model_info.enforce_eager,
+        dtype=model_info.dtype)

    model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
    factories = MULTIMODAL_REGISTRY._processor_factories[model_cls]
@@ -293,6 +295,7 @@ def _test_processing_correctness_one(
    "OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview",
    "OpenGVLab/InternVL3_5-30B-A3B",
    "Kwai-Keye/Keye-VL-8B-Preview",
+    "Kwai-Keye/Keye-VL-1_5-8B",
    "moonshotai/Kimi-VL-A3B-Instruct",
    "meta-llama/Llama-4-Scout-17B-16E-Instruct",
    "llava-hf/llava-1.5-7b-hf",
@@ -301,6 +304,7 @@ def _test_processing_correctness_one(
    "llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
    "meta-llama/Llama-3.2-11B-Vision-Instruct",
    "TIGER-Lab/Mantis-8B-siglip-llama3",
+    "mispeech/midashenglm-7b",
    "openbmb/MiniCPM-Llama3-V-2_5",
    "openbmb/MiniCPM-o-2_6",
    "openbmb/MiniCPM-V-2_6",

--- a/tests/models/multimodal/processing/test_glm4_1v.py
+++ b/tests/models/multimodal/processing/test_glm4_1v.py
@@ -5,6 +5,7 @@ import pytest

 from vllm.assets.video import VideoAsset
 from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.video import OpenCVDynamicVideoBackend, OpenCVVideoBackend

 from ...utils import build_model_context

@@ -50,3 +51,49 @@ def test_processor_override(

    assert grid_t == expected_grid_t
    assert video_tok_count == expected_toks_per_frame * grid_t
+
+
+@pytest.mark.parametrize("model_id", ["zai-org/GLM-4.1V-9B-Thinking"])
+@pytest.mark.parametrize("fps", [2])
+def test_video_loader_consistency(
+    model_id: str,
+    fps: int,
+):
+    """
+    Ensure dynamic video loader (pre-sampled by loader) and normal video 
+    loader (post-sampled by processor) produce same video processing outputs.
+    """
+    ctx = build_model_context(
+        model_id,
+        mm_processor_kwargs=None,
+        limit_mm_per_prompt={"video": 1},
+    )
+    processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
+    # The same fps is handed to the processor for both inputs and to the
+    # dynamic backend below (as requested_fps).
+    hf_processor_mm_kwargs = {"fps": fps}
+
+    # Prompt containing a single video placeholder
+    prompt = "<|begin_of_video|><|video|><|end_of_video|>"
+
+    # Read the encoded video once so both backends decode identical bytes
+    video_path = VideoAsset(name="baby_reading", num_frames=-1).video_path
+    with open(video_path, "rb") as f:
+        video_bytes = f.read()
+
+    # "static": loaded without a requested fps;
+    # "dynamic": loaded with requested_fps=fps
+    static_video, static_metadata = OpenCVVideoBackend.load_bytes(video_bytes)
+    dynamic_video, dynamic_metadata = OpenCVDynamicVideoBackend.load_bytes(
+        video_bytes, requested_fps=fps)
+
+    # pre-sampled loader shouldn't read all frames
+    assert len(dynamic_video) < len(static_video)
+
+    static_mm_data = {"video": [(static_video, static_metadata)]}
+    dynamic_mm_data = {"video": [(dynamic_video, dynamic_metadata)]}
+
+    static_outputs = processor.apply(prompt, static_mm_data,
+                                     hf_processor_mm_kwargs)
+    dynamic_outputs = processor.apply(prompt, dynamic_mm_data,
+                                      hf_processor_mm_kwargs)
+
+    # Both loading paths must yield the same prompt token ids and the same
+    # multimodal kwargs data
+    assert static_outputs["prompt_token_ids"] == dynamic_outputs[
+        "prompt_token_ids"]
+    assert static_outputs["mm_kwargs"].get_data(
+    ) == dynamic_outputs["mm_kwargs"].get_data()
--- a/tests/models/multimodal/processing/test_mllama4.py
+++ b/tests/models/multimodal/processing/test_mllama4.py
@@ -52,7 +52,7 @@ def test_profiling(model_id: str, max_model_len: int):
    chunks_per_image = prod(mm_data["patches_per_image"])
    total_num_patches = chunks_per_image * tokens_per_patch
    num_tiles = mm_data["aspect_ratios"][0][0] * mm_data["aspect_ratios"][0][
-        1]  # x-y seperator tokens
+        1]  # x-y separator tokens
    total_tokens = total_num_patches.item() + num_tiles.item(
    ) + 3  # image start, image, image end


--- a/tests/models/multimodal/processing/test_tensor_schema.py
+++ b/tests/models/multimodal/processing/test_tensor_schema.py
@@ -31,6 +31,7 @@ from ...utils import dummy_hf_overrides

 ARCH_TO_SKIP = {
    "MolmoForCausalLM": "incompatible requirements",
+    "Florence2ForConditionalGeneration": "not supported in V1",
 }
 ARCH_NEEDS_EXTRAS = [
    "InternVLChatModel",
@@ -41,9 +42,6 @@ ARCH_NEEDS_EXTRAS = [
 ]
 REPO_ID_TO_SKIP = {
    "nm-testing/pixtral-12b-FP8-dynamic": "duplicated test",
-    # FIXME(Isotr0py): enable GPT-OSS based InternVL3.5 model
-    # after support PP for GPT-OSS
-    "OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview": "Broken model",
 }

 ImageInput = list[Image.Image]
@@ -199,7 +197,9 @@ def test_model_tensor_schema(model_arch: str, model_id: str):
        revision=model_info.revision,
        trust_remote_code=model_info.trust_remote_code,
        hf_overrides=hf_overrides_fn,
-    )
+        skip_tokenizer_init=model_info.skip_tokenizer_init,
+        enforce_eager=model_info.enforce_eager,
+        dtype=model_info.dtype)
    model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
    factories = MULTIMODAL_REGISTRY._processor_factories[model_cls]


--- a/tests/models/multimodal/test_mapping.py
+++ b/tests/models/multimodal/test_mapping.py
@@ -59,7 +59,9 @@ def test_hf_model_weights_mapper(model_arch: str):
        revision=model_info.revision,
        trust_remote_code=model_info.trust_remote_code,
        hf_overrides=model_info.hf_overrides,
-    )
+        skip_tokenizer_init=model_info.skip_tokenizer_init,
+        enforce_eager=model_info.enforce_eager,
+        dtype=model_info.dtype)
    model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)

    original_weights = create_repo_dummy_weights(model_id)

--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -6,10 +6,11 @@ from dataclasses import dataclass, field
 from typing import Any, Literal, Optional

 import pytest
+import torch
 from packaging.version import Version
 from transformers import __version__ as TRANSFORMERS_VERSION

-from vllm.config import TokenizerMode
+from vllm.config import ModelDType, TokenizerMode


 @dataclass(frozen=True)
@@ -47,6 +48,23 @@ class _HfExamplesInfo:
    The reason for the minimum/maximum version requirement.
    """

+    skip_tokenizer_init: bool = False
+    """
+    If true, skip initialization of tokenizer and detokenizer. 
+    """
+
+    dtype: ModelDType = "auto"
+    """
+    The data type for the model weights and activations.
+    """
+
+    enforce_eager: bool = False
+    """
+    Whether to enforce eager execution. If True, we will
+    disable CUDA graph and always execute the model in eager mode.
+    If False, we will use CUDA graph and eager execution in hybrid.
+    """
+
    is_available_online: bool = True
    """
    Set this to ``False`` if the name of this architecture no longer exists on
@@ -76,6 +94,15 @@ class _HfExamplesInfo:
    If not specified, the default revision will be used.
    """

+    max_num_seqs: Optional[int] = None
+    """Maximum number of sequences to be processed in a single iteration."""
+
+    use_original_num_layers: bool = False
+    """
+    If True, use the original number of layers from the model config 
+    instead of minimal layers for testing.
+    """
+
    def check_transformers_version(
        self,
        *,
@@ -137,7 +164,7 @@ class _HfExamplesInfo:
 # yapf: disable
 _TEXT_GENERATION_EXAMPLE_MODELS = {
    # [Decoder-only]
-    "ApertusForCausalLM": _HfExamplesInfo("swiss-ai/Apertus-8B",
+    "ApertusForCausalLM": _HfExamplesInfo("swiss-ai/Apertus-8B-2509",
                                          min_transformers_version="4.56.0",
                                          trust_remote_code=True),
    "AquilaModel": _HfExamplesInfo("BAAI/AquilaChat-7B",
@@ -154,7 +181,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
    "BailingMoeForCausalLM": _HfExamplesInfo("inclusionAI/Ling-lite-1.5",
                                         trust_remote_code=True),
    "BambaForCausalLM": _HfExamplesInfo("ibm-ai-platform/Bamba-9B-v1",
-                                        min_transformers_version="4.56.0",
+                                        min_transformers_version="4.55.3",
                                        extras={"tiny": "hmellor/tiny-random-BambaForCausalLM"}),  # noqa: E501
    "BloomForCausalLM": _HfExamplesInfo("bigscience/bloom-560m",
                                        {"1b": "bigscience/bloomz-1b1"}),
@@ -208,7 +235,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
    "GptOssForCausalLM": _HfExamplesInfo("lmsys/gpt-oss-20b-bf16"),
    "GraniteForCausalLM": _HfExamplesInfo("ibm/PowerLM-3b"),
    "GraniteMoeForCausalLM": _HfExamplesInfo("ibm/PowerMoE-3b"),
-    "GraniteMoeHybridForCausalLM": _HfExamplesInfo("ibm-granite/granite-4.0-tiny-preview"),  # noqa: E501
+    "GraniteMoeHybridForCausalLM": _HfExamplesInfo("ibm-granite/granite-4.0-tiny-preview", # noqa: E501
+                                                   min_transformers_version="4.55.3"),
    "GraniteMoeSharedForCausalLM": _HfExamplesInfo("ibm-research/moe-7b-1b-active-shared-experts"),  # noqa: E501
    "Grok1ModelForCausalLM": _HfExamplesInfo("hpcai-tech/grok-1",
                                             trust_remote_code=True),
@@ -228,7 +256,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
                                            trust_remote_code=True),
    "JAISLMHeadModel": _HfExamplesInfo("inceptionai/jais-13b-chat"),
    "JambaForCausalLM": _HfExamplesInfo("ai21labs/AI21-Jamba-1.5-Mini",
-                                        min_transformers_version="4.56.0",
+                                        min_transformers_version="4.55.3",
                                        extras={
                                            "tiny": "ai21labs/Jamba-tiny-dev",
                                            "random": "ai21labs/Jamba-tiny-random",  # noqa: E501
@@ -244,7 +272,11 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
    "Llama4ForCausalLM": _HfExamplesInfo("meta-llama/Llama-4-Scout-17B-16E-Instruct", # noqa: E501
                                         is_available_online=False),
    "MambaForCausalLM": _HfExamplesInfo("state-spaces/mamba-130m-hf"),
-    "Mamba2ForCausalLM": _HfExamplesInfo("mistralai/Mamba-Codestral-7B-v0.1"),
+    "Mamba2ForCausalLM": _HfExamplesInfo("mistralai/Mamba-Codestral-7B-v0.1",
+                                         min_transformers_version="4.55.3",
+                                         extras={
+                                            "random": "yujiepan/mamba2-codestral-v0.1-tiny-random", # noqa: E501
+                                         }),
    "FalconMambaForCausalLM": _HfExamplesInfo("tiiuae/falcon-mamba-7b-instruct"),  # noqa: E501
    "MiniCPMForCausalLM": _HfExamplesInfo("openbmb/MiniCPM-2B-sft-bf16",
                                         trust_remote_code=True),
@@ -259,7 +291,9 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
    "MistralForCausalLM": _HfExamplesInfo("mistralai/Mistral-7B-Instruct-v0.1"),
    "MixtralForCausalLM": _HfExamplesInfo("mistralai/Mixtral-8x7B-Instruct-v0.1",  # noqa: E501
                                          {"tiny": "TitanML/tiny-mixtral"}),  # noqa: E501
-    "QuantMixtralForCausalLM": _HfExamplesInfo("mistral-community/Mixtral-8x22B-v0.1-AWQ"),  # noqa: E501
+    "MotifForCausalLM": _HfExamplesInfo("Motif-Technologies/Motif-2.6B",
+                                        trust_remote_code=True,
+                                        v0_only=True),
    "MptForCausalLM": _HfExamplesInfo("mpt", is_available_online=False),
    "MPTForCausalLM": _HfExamplesInfo("mosaicml/mpt-7b"),
    "NemotronForCausalLM": _HfExamplesInfo("nvidia/Minitron-8B-Base"),
@@ -282,8 +316,6 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
    "PhiMoEForCausalLM": _HfExamplesInfo("microsoft/Phi-3.5-MoE-instruct",
                                         trust_remote_code=True),
    "Plamo2ForCausalLM": _HfExamplesInfo("pfnet/plamo-2-1b",
-                                         max_transformers_version="4.53",
-                                         transformers_version_reason="vLLM impl inherits PreTrainedModel and clashes with get_input_embeddings",  # noqa: E501
                                        trust_remote_code=True),
    "QWenLMHeadModel": _HfExamplesInfo("Qwen/Qwen-7B-Chat",
                                       max_transformers_version="4.53",
@@ -294,6 +326,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
    "Qwen2MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen1.5-MoE-A2.7B-Chat"),
    "Qwen3ForCausalLM": _HfExamplesInfo("Qwen/Qwen3-8B"),
    "Qwen3MoeForCausalLM": _HfExamplesInfo("Qwen/Qwen3-30B-A3B"),
+    "Qwen3NextForCausalLM": _HfExamplesInfo("Qwen/Qwen3-Next-80B-A3B-Instruct",
+                                            min_transformers_version="4.56.2"),
    "RWForCausalLM": _HfExamplesInfo("tiiuae/falcon-40b"),
    "SeedOssForCausalLM": _HfExamplesInfo("ByteDance-Seed/Seed-OSS-36B-Instruct", # noqa: E501
                                          trust_remote_code=True,
@@ -328,6 +362,7 @@ _EMBEDDING_EXAMPLE_MODELS = {
    # [Text-only]
    "BertModel": _HfExamplesInfo("BAAI/bge-base-en-v1.5"),
    "Gemma2Model": _HfExamplesInfo("BAAI/bge-multilingual-gemma2"),  # noqa: E501
+    "Gemma3TextModel": _HfExamplesInfo("google/embeddinggemma-300m"),
    "GritLM": _HfExamplesInfo("parasail-ai/GritLM-7B-vllm"),
    "GteModel": _HfExamplesInfo("Snowflake/snowflake-arctic-embed-m-v2.0",
                                               trust_remote_code=True),
@@ -359,7 +394,20 @@ _EMBEDDING_EXAMPLE_MODELS = {
                                         trust_remote_code=True),
    "Qwen2VLForConditionalGeneration": _HfExamplesInfo("MrLight/dse-qwen2-2b-mrl-v1"), # noqa: E501
    "PrithviGeoSpatialMAE": _HfExamplesInfo("ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11", # noqa: E501
-                                            is_available_online=False),  # noqa: E501
+                                            dtype=torch.float16,
+                                            enforce_eager=True,
+                                            skip_tokenizer_init=True,
+                                            # This is to avoid the model
+                                            # going OOM in CI
+                                            max_num_seqs=32,
+                                            ),
+    "Terratorch": _HfExamplesInfo("ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11", # noqa: E501
+                                  dtype=torch.float16,
+                                  enforce_eager=True,
+                                  skip_tokenizer_init=True,
+                                  # This is to avoid the model going OOM in CI
+                                  max_num_seqs=32,
+                                  ),
 }

 _SEQUENCE_CLASSIFICATION_EXAMPLE_MODELS = {
@@ -438,6 +486,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
    "InternVLForConditionalGeneration": _HfExamplesInfo("OpenGVLab/InternVL3-1B-hf"),    # noqa: E501
    "KeyeForConditionalGeneration": _HfExamplesInfo("Kwai-Keye/Keye-VL-8B-Preview", # noqa: E501
                                                    trust_remote_code=True),
+    "KeyeVL1_5ForConditionalGeneration": _HfExamplesInfo("Kwai-Keye/Keye-VL-1_5-8B", # noqa: E501
+                                                         trust_remote_code=True),
    "KimiVLForConditionalGeneration": _HfExamplesInfo("moonshotai/Kimi-VL-A3B-Instruct",  # noqa: E501
                                                      extras={"thinking": "moonshotai/Kimi-VL-A3B-Thinking"},  # noqa: E501
                                                      trust_remote_code=True),
@@ -455,6 +505,8 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                                                      max_transformers_version="4.48",  # noqa: E501
                                                      transformers_version_reason="HF model is not compatible.",  # noqa: E501
                                                      hf_overrides={"architectures": ["MantisForConditionalGeneration"]}),  # noqa: E501
+    "MiDashengLMModel": _HfExamplesInfo("mispeech/midashenglm-7b",
+                            trust_remote_code=True),
    "MiniCPMO": _HfExamplesInfo("openbmb/MiniCPM-o-2_6",
                                trust_remote_code=True),
    "MiniCPMV": _HfExamplesInfo("openbmb/MiniCPM-Llama3-V-2_5",
@@ -474,6 +526,9 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                              trust_remote_code=True),
    "Llama_Nemotron_Nano_VL" : _HfExamplesInfo("nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1", # noqa: E501
                                                     trust_remote_code=True),
+    "NemotronH_Nano_VL": _HfExamplesInfo("nano_vl_dummy",
+                                          is_available_online=False,
+                                          trust_remote_code=True),
    "Ovis": _HfExamplesInfo("AIDC-AI/Ovis2-1B", trust_remote_code=True,
                            max_transformers_version="4.53",
                            transformers_version_reason="HF model is not compatible",  # noqa: E501
@@ -554,19 +609,21 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
    "EagleDeepSeekMTPModel": _HfExamplesInfo("eagle618/deepseek-v3-random",
                                        speculative_model="eagle618/eagle-deepseek-v3-random",  # noqa: E501
                                        trust_remote_code=True),
-    "EagleLlamaForCausalLM": _HfExamplesInfo("yuhuili/EAGLE-LLaMA3-Instruct-8B",
+    "EagleLlamaForCausalLM": _HfExamplesInfo("meta-llama/Meta-Llama-3-8B-Instruct", # noqa: E501
                                             trust_remote_code=True,
                                             speculative_model="yuhuili/EAGLE-LLaMA3-Instruct-8B",
-                                             tokenizer="meta-llama/Meta-Llama-3-8B-Instruct"),  # noqa: E501
-    "Eagle3LlamaForCausalLM": _HfExamplesInfo("yuhuili/EAGLE3-LLaMA3.1-Instruct-8B",  # noqa: E501
+                                             tokenizer="meta-llama/Meta-Llama-3-8B-Instruct"), # noqa: E501
+    "Eagle3LlamaForCausalLM": _HfExamplesInfo("meta-llama/Llama-3.1-8B-Instruct",  # noqa: E501
+                                            trust_remote_code=True,
+                                            speculative_model="yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", # noqa: E501
+                                            tokenizer="meta-llama/Llama-3.1-8B-Instruct",
+                                            use_original_num_layers=True,
+                                            max_model_len=10240),
+    "LlamaForCausalLMEagle3": _HfExamplesInfo("Qwen/Qwen3-8B",  # noqa: E501
                                            trust_remote_code=True,
-                                            speculative_model="yuhuili/EAGLE3-LLaMA3.1-Instruct-8B",
-                                            tokenizer="meta-llama/Llama-3.1-8B-Instruct"),
-    # TODO: Re-enable this once tests/models/test_initialization.py is fixed, see PR #22333 #22611   # noqa: E501
-    # "LlamaForCausalLMEagle3": _HfExamplesInfo("AngelSlim/Qwen3-8B_eagle3",  # noqa: E501
-    #                                         trust_remote_code=True,
-    #                                         speculative_model="AngelSlim/Qwen3-8B_eagle3",   # noqa: E501
-    #                                         tokenizer="Qwen/Qwen3-8B"),
+                                            speculative_model="AngelSlim/Qwen3-8B_eagle3",   # noqa: E501
+                                            tokenizer="Qwen/Qwen3-8B",
+                                            use_original_num_layers=True),
    "EagleLlama4ForCausalLM": _HfExamplesInfo(
        "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct",
        trust_remote_code=True,
@@ -586,7 +643,9 @@ _SPECULATIVE_DECODING_EXAMPLE_MODELS = {
                                        is_available_online=False),
    "MiMoMTPModel": _HfExamplesInfo("XiaomiMiMo/MiMo-7B-RL",
                                    trust_remote_code=True,
-                                    speculative_model="XiaomiMiMo/MiMo-7B-RL")
+                                    speculative_model="XiaomiMiMo/MiMo-7B-RL"),
+    "Qwen3NextMTP": _HfExamplesInfo("Qwen/Qwen3-Next-80B-A3B-Instruct",
+                                     min_transformers_version="4.56.2"),
 }

 _TRANSFORMERS_BACKEND_MODELS = {

--- a/tests/models/test_initialization.py
+++ b/tests/models/test_initialization.py
@@ -36,7 +36,10 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,

    hf_overrides_fn = partial(dummy_hf_overrides,
                              model_arch=model_arch,
-                              exist_overrides=model_info.hf_overrides)
+                              exist_overrides=model_info.hf_overrides,
+                              use_original_num_layers=getattr(
+                                  model_info, 'use_original_num_layers',
+                                  False))

    # Avoid calling model.forward()
    def _initialize_kv_caches_v0(self) -> None:
@@ -60,19 +63,29 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
                       _initialize_kv_caches_v1), monkeypatch.context() as m):
        if model_info.v0_only:
            m.setenv("VLLM_USE_V1", "0")
-        if model_arch == "Phi4FlashForCausalLM":
-            # Phi4FlashForCausalLM only supports DIFFERENTIAL_FLASH_ATTN backend
+        if model_arch in ("Phi4FlashForCausalLM", "MotifForCausalLM"):
+            # Phi4FlashForCausalLM and MotifForCausalLM
+            # only support the DIFFERENTIAL_FLASH_ATTN backend
            m.setenv("VLLM_ATTENTION_BACKEND", "DIFFERENTIAL_FLASH_ATTN")
        if model_arch == "GptOssForCausalLM":
            # FIXME: A hack to bypass FA3 assertion because our CI's L4 GPU
            # has cc==8.9 which hasn't supported FA3 yet. Remove this hack when
            # L4 supports FA3.
            m.setenv("VLLM_ATTENTION_BACKEND", "TRITON_ATTN_VLLM_V1")
+        if model_arch == "Florence2ForConditionalGeneration":
+            # An encoder-decoder model that's V0-only. Just skip it
+            # since V0 is about to be removed.
+            pytest.skip("Skipping Florence2ForConditionalGeneration")
+        if model_arch == "WhisperForConditionalGeneration":
+            m.setenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn")
        LLM(
            model_info.default,
            tokenizer=model_info.tokenizer,
            tokenizer_mode=model_info.tokenizer_mode,
            revision=model_info.revision,
+            enforce_eager=model_info.enforce_eager,
+            skip_tokenizer_init=model_info.skip_tokenizer_init,
+            dtype=model_info.dtype,
            speculative_config={
                "model": model_info.speculative_model,
                "num_speculative_tokens": 1,
@@ -85,7 +98,7 @@ def can_initialize(model_arch: str, monkeypatch: pytest.MonkeyPatch,
            model_impl=ModelImpl.TRANSFORMERS
            if model_arch in _TRANSFORMERS_BACKEND_MODELS else ModelImpl.VLLM,
            hf_overrides=hf_overrides_fn,
-        )
+            max_num_seqs=model_info.max_num_seqs)


 @pytest.mark.parametrize("model_arch", HF_EXAMPLE_MODELS.get_supported_archs())

--- a/tests/models/test_terratorch.py
+++ b/tests/models/test_terratorch.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+import torch
+
+from tests.conftest import VllmRunner
+from vllm.utils import set_default_torch_num_threads
+
+
+@pytest.mark.parametrize(
+    "model",
+    [
+        "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11",
+        "mgazz/Prithvi_v2_eo_300_tl_unet_agb",
+    ],
+)
+def test_inference(
+    vllm_runner: type[VllmRunner],
+    model: str,
+) -> None:
+    """Smoke-test pooling inference: the encoded output must contain no NaNs.
+
+    Args:
+        vllm_runner: Runner fixture, used as a context manager around the
+            model under test.
+        model: Identifier of the model to load.
+    """
+    # Constant-valued dummy inputs; only the absence of NaNs in the output
+    # is checked, not the output values themselves.
+    pixel_values = torch.full((6, 512, 512), 1.0, dtype=torch.float16)
+    location_coords = torch.full((1, 2), 1.0, dtype=torch.float16)
+    prompt = {
+        "prompt_token_ids": [1],
+        "multi_modal_data": {
+            "pixel_values": pixel_values,
+            "location_coords": location_coords,
+        },
+    }
+    with (
+            set_default_torch_num_threads(1),
+            vllm_runner(
+                model,
+                runner="pooling",
+                dtype=torch.float16,
+                enforce_eager=True,
+                skip_tokenizer_init=True,
+                # Limit the maximum number of sequences to avoid the
+                # test going OOM during the warmup run
+                max_num_seqs=32,
+            ) as vllm_model,
+    ):
+
+        vllm_output = vllm_model.llm.encode(prompt)
+        output_data = vllm_output[0].outputs.data
+        # Truth-test the reduction directly instead of comparing it against
+        # a freshly built torch.tensor(False) via torch.equal.
+        assert not torch.isnan(output_data).any(), (
+            f"{model} produced NaN values in its output")
--- a/tests/models/utils.py
+++ b/tests/models/utils.py
@@ -294,6 +294,8 @@ def build_model_context(
        limit_mm_per_prompt=limit_mm_per_prompt,
        mm_processor_cache_gb=mm_processor_cache_gb,
        hf_overrides=model_info.hf_overrides,
+        skip_tokenizer_init=model_info.skip_tokenizer_init,
+        enforce_eager=model_info.enforce_eager,
        **model_config_kwargs,
    )
    return InputContext(model_config)
@@ -345,6 +347,7 @@ class ModelInfo:
    name: str
    architecture: str = ""
    dtype: str = "auto"
+    hf_dtype: str = "float32"
    hf_overrides: Optional[dict[str, Any]] = None
    default_pooling_type: str = ""
    enable_test: bool = True
@@ -352,6 +355,7 @@ class ModelInfo:

 @dataclass
 class EmbedModelInfo(ModelInfo):
+    mteb_score: Optional[float] = None
    is_matryoshka: bool = False
    matryoshka_dimensions: Optional[list[int]] = None

@@ -368,7 +372,7 @@ class LASTPoolingEmbedModelInfo(EmbedModelInfo):

 @dataclass
 class RerankModelInfo(ModelInfo):
-    pass
+    mteb_score: Optional[float] = None


 @dataclass
@@ -381,11 +385,18 @@ class LASTPoolingRerankModelInfo(RerankModelInfo):
    default_pooling_type: str = "LAST"


+# ModelInfo subclass with a different hf_dtype default and an extra
+# hf_ppl field.
+@dataclass
+class GenerateModelInfo(ModelInfo):
+    # Overrides the "float32" default declared on ModelInfo.
+    hf_dtype: str = "auto"
+    # NOTE(review): presumably a reference perplexity obtained from the HF
+    # implementation; confirm against the tests that consume it.
+    hf_ppl: Optional[float] = None
+
+
 def dummy_hf_overrides(
    hf_config: PretrainedConfig,
    *,
    model_arch: str = "",
    exist_overrides: Optional[dict[str, Any]] = None,
+    use_original_num_layers: bool = False,
 ) -> PretrainedConfig:
    """
    Dummy HF overrides function used to create dummy model
@@ -402,10 +413,18 @@ def dummy_hf_overrides(

    # we use three layers for Gemma-3n to check
    # both normal layer and kv_shared_layer
-    num_hidden_layers = (3 if model_arch == "Gemma3nForConditionalGeneration"
-                         else 1)
+    if use_original_num_layers:
+        # Use the original number of layers from the config
+        num_layers = getattr(text_config, 'num_layers', 1)
+        num_hidden_layers = getattr(text_config, 'num_hidden_layers', 1)
+    else:
+        # Use minimal layers for testing
+        num_layers = 1
+        num_hidden_layers = (3 if model_arch
+                             == "Gemma3nForConditionalGeneration" else 1)
+
    text_config.update({
-        "num_layers": 1,
+        "num_layers": num_layers,
        "num_hidden_layers": num_hidden_layers,
        "num_experts": num_experts,
        "num_experts_per_tok": 2,

--- a/tests/multimodal/test_utils.py
+++ b/tests/multimodal/test_utils.py
@@ -31,11 +31,11 @@ if TYPE_CHECKING:
    from vllm.multimodal.inputs import MultiModalPlaceholderDict

 # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
-TEST_IMAGE_URLS = [
-    "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
-    "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
-    "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
-    "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
+TEST_IMAGE_ASSETS = [
+    "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",  # "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
+    "Grayscale_8bits_palette_sample_image.png",  # "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
+    "1280px-Venn_diagram_rgb.svg.png",  # "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
+    "RGBA_comp.png",  # "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
 ]

 TEST_VIDEO_URLS = [
@@ -45,12 +45,11 @@ TEST_VIDEO_URLS = [


 @pytest.fixture(scope="module")
-def url_images() -> dict[str, Image.Image]:
-    connector = MediaConnector()
+def url_images(local_asset_server) -> dict[str, Image.Image]:

    return {
-        image_url: connector.fetch_image(image_url)
-        for image_url in TEST_IMAGE_URLS
+        image_url: local_asset_server.get_image_asset(image_url)
+        for image_url in TEST_IMAGE_ASSETS
    }


@@ -69,7 +68,7 @@ def _image_equals(a: Image.Image, b: Image.Image) -> bool:


 @pytest.mark.asyncio
-@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
+@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
 async def test_fetch_image_http(image_url: str):
    connector = MediaConnector()

@@ -79,12 +78,12 @@ async def test_fetch_image_http(image_url: str):


 @pytest.mark.asyncio
-@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
+@pytest.mark.parametrize("raw_image_url", TEST_IMAGE_ASSETS)
 @pytest.mark.parametrize("suffix", get_supported_suffixes())
 async def test_fetch_image_base64(url_images: dict[str, Image.Image],
-                                  image_url: str, suffix: str):
+                                  raw_image_url: str, suffix: str):
    connector = MediaConnector()
-    url_image = url_images[image_url]
+    url_image = url_images[raw_image_url]

    try:
        mime_type = Image.MIME[Image.registered_extensions()[suffix]]
@@ -117,7 +116,7 @@ async def test_fetch_image_base64(url_images: dict[str, Image.Image],


 @pytest.mark.asyncio
-@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
+@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
 async def test_fetch_image_local_files(image_url: str):
    connector = MediaConnector()

@@ -152,8 +151,8 @@ async def test_fetch_image_local_files(image_url: str):


 @pytest.mark.asyncio
-async def test_fetch_image_local_files_with_space_in_name():
-    image_url = TEST_IMAGE_URLS[0]
+@pytest.mark.parametrize("image_url", [TEST_IMAGE_ASSETS[0]], indirect=True)
+async def test_fetch_image_local_files_with_space_in_name(image_url: str):
    connector = MediaConnector()

    with TemporaryDirectory() as temp_dir:
@@ -205,6 +204,32 @@ async def test_fetch_video_http(video_url: str, num_frames: int):
    assert metadata_sync == metadata_async


+@pytest.mark.asyncio
+@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
+@pytest.mark.parametrize("max_duration", [1, 60, 1800])
+@pytest.mark.parametrize("requested_fps", [2, 24])
+async def test_fetch_video_http_with_dynamic_loader(
+        video_url: str, max_duration: int, requested_fps: int,
+        monkeypatch: pytest.MonkeyPatch):
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv_dynamic")
+        connector = MediaConnector(
+            media_io_kwargs={
+                "video": {
+                    "max_duration": max_duration,
+                    "requested_fps": requested_fps,
+                }
+            })
+
+        video_sync, metadata_sync = connector.fetch_video(video_url)
+        video_async, metadata_async = await connector.fetch_video_async(
+            video_url)
+
+        assert np.array_equal(video_sync, video_async)
+        assert metadata_sync == metadata_async
+        assert metadata_sync["video_backend"] == "opencv_dynamic"
+
+
 # Used for `test_argsort_mm_positions`.
 class TestCase(NamedTuple):
    mm_positions: "MultiModalPlaceholderDict"
@@ -458,7 +483,7 @@ def run_dp_sharded_vision_model_vs_direct(local_rank: int, world_size: int,
    with torch.inference_mode():
        sharded_output = run_dp_sharded_vision_model(image_input, vision_model)

-    # Check that the world size is setup correctly
+    # Check that the world size is set up correctly
    assert get_tensor_model_parallel_world_size() == world_size

    # Check that the outputs have the same shape
@@ -636,11 +661,13 @@ def run_dp_sharded_mrope_vision_model_vs_direct(local_rank: int,

    # Run the model through the sharded function
    with torch.inference_mode():
-        sharded_output = run_dp_sharded_mrope_vision_model(
-            vision_model, pixel_values, grid_thw_list)
+        sharded_output = run_dp_sharded_mrope_vision_model(vision_model,
+                                                           pixel_values,
+                                                           grid_thw_list,
+                                                           rope_type="rope_3d")
        sharded_output = torch.cat(sharded_output, dim=0)

-    # Check that the world size is setup correctly
+    # Check that the world size is set up correctly
    assert get_tensor_model_parallel_world_size() == world_size

    # Compare outputs (only on rank 0)
@@ -691,8 +718,10 @@ def run_dp_sharded_mrope_vision_model_empty_input_worker(

    # Should handle empty input gracefully
    with torch.inference_mode():
-        output = run_dp_sharded_mrope_vision_model(vision_model, pixel_values,
-                                                   grid_thw_list)
+        output = run_dp_sharded_mrope_vision_model(vision_model,
+                                                   pixel_values,
+                                                   grid_thw_list,
+                                                   rope_type="rope_3d")

    assert len(output) == 0

@@ -745,8 +774,10 @@ def run_dp_sharded_mrope_vision_model_uneven_load_worker(

    # Should handle uneven distribution without errors
    with torch.inference_mode():
-        output_tuple = run_dp_sharded_mrope_vision_model(
-            vision_model, pixel_values, grid_thw_list)
+        output_tuple = run_dp_sharded_mrope_vision_model(vision_model,
+                                                         pixel_values,
+                                                         grid_thw_list,
+                                                         rope_type="rope_3d")

    # Verify output shape is reasonable
    merge_factor = vision_model.spatial_merge_size**2

--- a/tests/neuron/1_core/test_activation.py
+++ b/tests/neuron/1_core/test_activation.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import pytest
-import torch
-import torch.nn.functional as F
-
-from vllm.model_executor.layers.activation import FastGELU, SiluAndMul
-from vllm.platforms import current_platform
-
-
-@pytest.mark.parametrize("activation", ["silu_and_mul", "gelu_fast"])
-@pytest.mark.parametrize("num_tokens,d,dtype", [
-    (7, 512, torch.half),
-    (7, 512, torch.float),
-    (83, 512, torch.half),
-])
-@torch.inference_mode()
-def test_act_and_mul(
-    activation: str,
-    num_tokens: int,
-    d: int,
-    dtype: torch.dtype,
-) -> None:
-    import torch_xla.core.xla_model as xm
-
-    device = xm.xla_device()
-    current_platform.seed_everything(0)
-    torch.set_default_device("cpu")
-    x = torch.randn(num_tokens, 2 * d, dtype=dtype).to(device=device)
-    if activation == "silu_and_mul":
-        layer = SiluAndMul()
-        fn = layer.forward_native
-    elif activation == "gelu_fast":
-        layer = FastGELU()
-        fn = F.gelu
-    else:
-        raise NotImplementedError(
-            f"activation {activation} is not implemented.")
-    assert x.is_xla, "input tensor under testing is expected to be XLA tensor."
-    out = layer.to(device=device).forward_neuron(x)
-    ref_out = fn(x.cpu())
-    torch.testing.assert_close(out.cpu(), ref_out, atol=0.01, rtol=0.0)
--- a/tests/neuron/1_core/test_block_table.py
+++ b/tests/neuron/1_core/test_block_table.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import neuronxcc.nki.language as nl
-import pytest
-import torch
-import torch.nn.functional as F
-from neuronxcc import nki
-
-from vllm.attention.ops.nki_flash_attn import (
-    load_block_tables, transform_block_tables_for_indirect_load)
-
-
-def is_power_of_2(n):
-    return n > 0 and (n & (n - 1) == 0)
-
-
-def nki_load_and_transform_block_tables(
-    block_tables,
-    num_tiles,
-    num_blocks_per_tile,
-    num_head,
-    head_id,
-    block_size_tiling_factor,
-):
-    assert is_power_of_2(
-        num_blocks_per_tile), f"{num_blocks_per_tile=} must be power of 2"
-    block_tables_sbuf = load_block_tables(block_tables, num_tiles,
-                                          num_blocks_per_tile)
-
-    # we need to pass an Index as head_id
-    head_id = nl.arange(1)[None, :] + head_id
-
-    block_tables_transposed = transform_block_tables_for_indirect_load(
-        block_tables_sbuf, block_size_tiling_factor, num_head, head_id)
-    B_P_SIZE = 128
-    assert block_tables_transposed.shape[1] == B_P_SIZE
-
-    out = nl.ndarray(
-        block_tables_transposed.shape,
-        dtype=nl.int32,
-        buffer=nl.shared_hbm,
-    )
-    for i in nl.affine_range(block_tables_transposed.shape[0]):
-        nl.store(dst=out[i], value=block_tables_transposed[i])
-    return out
-
-
-def ref_block_tables_transform(
-    block_tables,
-    num_tiles,
-    num_blocks_per_tile,
-    num_head,
-    head_id,
-    block_size_tiling_factor,
-):
-    assert block_tables.numel() == num_tiles * num_blocks_per_tile
-    block_tables = block_tables.view(num_tiles, num_blocks_per_tile)
-    B_F_SIZE = 128
-    num_tiles_padded = (num_tiles + B_F_SIZE - 1) // B_F_SIZE * B_F_SIZE
-    block_tables = F.pad(
-        block_tables,
-        (0, 0, 0, num_tiles_padded - num_tiles),
-        "constant",
-        0,
-    )
-
-    block_tables = block_tables * num_head + head_id
-    block_tables = block_tables.view(num_tiles_padded, num_blocks_per_tile, 1)
-    offset = torch.arange(0, block_size_tiling_factor).view(1, 1, -1)
-    block_tables = block_tables * block_size_tiling_factor + offset
-    block_tables_transposed = block_tables.view(num_tiles_padded, -1).t()
-
-    num_blocks_per_tile = block_tables_transposed.shape[0]
-    assert num_blocks_per_tile % B_F_SIZE == 0
-    return block_tables_transposed.view(num_blocks_per_tile // B_F_SIZE,
-                                        B_F_SIZE, num_tiles_padded)
-
-
-@pytest.mark.parametrize(
-    "q_head_per_kv_head,head_id",
-    [
-        (1, 0),
-        (3, 1),
-    ],
-)
-@pytest.mark.parametrize(
-    "num_tiles,num_blocks_per_tile",
-    [
-        (1, 1),
-        (13, 16),
-        (17, 128),
-        (35, 512),
-        (128, 128),
-        (130, 64),
-        (280, 256),
-        (315, 1),
-    ],
-)
-@torch.inference_mode()
-def test_load_and_transform_block_tables(
-    monkeypatch: pytest.MonkeyPatch,
-    num_tiles,
-    num_blocks_per_tile,
-    q_head_per_kv_head,
-    head_id,
-) -> None:
-    import torch_xla.core.xla_model as xm
-
-    device = xm.xla_device()
-
-    compiler_flags_str = " ".join([
-        "-O1",
-        "--retry_failed_compilation",
-    ])
-    with monkeypatch.context() as m:
-        m.setenv("NEURON_CC_FLAGS", compiler_flags_str)
-
-        torch.manual_seed(10000)
-        torch.set_printoptions(sci_mode=False)
-
-        # On Neuron, we need B_P_SIZE = 128 blocks to make DMA efficient
-        B_P_SIZE = 128
-        if num_blocks_per_tile < B_P_SIZE:
-            assert B_P_SIZE % num_blocks_per_tile == 0
-            block_size_tiling_factor = B_P_SIZE // num_blocks_per_tile
-        else:
-            block_size_tiling_factor = 1
-        max_num_blocks = 100000
-        block_tables = torch.randint(
-            0,
-            max_num_blocks,
-            (num_tiles * num_blocks_per_tile, ),
-            dtype=torch.int32,
-        )
-        nki_out = nki.jit(nki_load_and_transform_block_tables)[1, 1](
-            block_tables.to(device=device),
-            num_tiles,
-            num_blocks_per_tile,
-            q_head_per_kv_head,
-            head_id,
-            block_size_tiling_factor,
-        ).cpu()
-        ref_out = ref_block_tables_transform(
-            block_tables,
-            num_tiles,
-            num_blocks_per_tile,
-            q_head_per_kv_head,
-            head_id,
-            block_size_tiling_factor,
-        )
-        assert (nki_out.shape == ref_out.shape
-                ), f"{nki_out.shape=} != {ref_out.shape=}"
-        assert torch.all(nki_out == ref_out)
--- a/tests/neuron/1_core/test_cache.py
+++ b/tests/neuron/1_core/test_cache.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import pytest
-import torch
-
-from vllm.attention.ops.nki_flash_attn import reshape_and_cache
-
-
-@pytest.mark.parametrize(
-    "num_tokens, n_kv_head, d_head, num_blocks, block_size",
-    [
-        # Small model configuration (e.g., GPT-2 small)
-        (32, 12, 64, 4, 128),  # Typical sequence processing
-        (1, 12, 64, 4, 128),  # Single token update
-        (128, 12, 64, 4, 128),  # Longer sequence
-
-        # Medium model configuration (e.g., GPT-2 medium)
-        (64, 16, 96, 8, 256),  # Standard batch
-        (256, 16, 96, 8, 256),  # Large batch
-
-        # Large model configuration (e.g., GPT-3 style)
-        (48, 32, 128, 16, 512),  # Typical processing window
-        (512, 32, 128, 16, 512),  # Full context window
-
-        # Edge cases and stress tests
-        (1024, 8, 32, 32, 32),  # Many tokens, small heads
-        (16, 64, 256, 4, 64),  # Few tokens, many heads
-        (2048, 24, 128, 64, 128),  # Large scale test
-
-        # Minimal configurations for debugging
-        (4, 2, 16, 2, 16),  # Tiny test case
-        (1, 1, 8, 1, 8),  # Minimal possible
-    ])
-def test_reshape_and_cache(num_tokens, n_kv_head, d_head, num_blocks,
-                           block_size):
-    # Set random seed for reproducibility
-    torch.manual_seed(42)
-
-    # Create CPU tensors for reference implementation
-    key_cpu = torch.randn(num_tokens, n_kv_head, d_head) / torch.sqrt(
-        torch.tensor(d_head))
-    value_cpu = torch.randn(num_tokens, n_kv_head, d_head) / torch.sqrt(
-        torch.tensor(d_head))
-    key_cache_cpu = torch.zeros(num_blocks, n_kv_head, block_size, d_head)
-    value_cache_cpu = torch.zeros(num_blocks, n_kv_head, block_size, d_head)
-    slot_mapping_cpu = torch.randperm(num_blocks * block_size)[:num_tokens]
-
-    # Run reference implementation on CPU
-    block_indices = torch.div(slot_mapping_cpu,
-                              block_size,
-                              rounding_mode="floor")
-    block_offsets = slot_mapping_cpu % block_size
-
-    for i in range(num_tokens):
-        block_idx = block_indices[i]
-        block_offset = block_offsets[i]
-        key_cache_cpu[block_idx, :, block_offset, :] = key_cpu[i]
-        value_cache_cpu[block_idx, :, block_offset, :] = value_cpu[i]
-
-    # Create XLA device tensors
-    device = torch.device('xla')
-    key = key_cpu.to(device)
-    value = value_cpu.to(device)
-    key_cache = torch.zeros_like(key_cache_cpu, device=device)
-    value_cache = torch.zeros_like(value_cache_cpu, device=device)
-    slot_mapping = slot_mapping_cpu.to(device)
-    kv_cache = torch.stack([key_cache, value_cache])
-
-    # Run vectorized implementation on XLA device
-    reshape_and_cache(key, value, kv_cache, slot_mapping)
-    key_cache, value_cache = torch.unbind(kv_cache, dim=0)
-
-    # Move results back to CPU for comparison
-    key_cache_result = key_cache.cpu()
-    value_cache_result = value_cache.cpu()
-
-    # Assert results match
-    torch.testing.assert_close(key_cache_result,
-                               key_cache_cpu,
-                               rtol=1e-5,
-                               atol=1e-5)
-    torch.testing.assert_close(value_cache_result,
-                               value_cache_cpu,
-                               rtol=1e-5,
-                               atol=1e-5)
--- a/tests/neuron/1_core/test_layernorm.py
+++ b/tests/neuron/1_core/test_layernorm.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import pytest
-import torch
-
-from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.platforms import current_platform
-
-
-@pytest.mark.parametrize("num_tokens,hidden_size,add_residual,dtype", [
-    (7, 8, False, torch.half),
-    (83, 768, False, torch.half),
-    (83, 768, True, torch.half),
-    (83, 768, True, torch.bfloat16),
-    (83, 768, True, torch.float32),
-])
-@torch.inference_mode()
-def test_rms_norm(
-    num_tokens: int,
-    hidden_size: int,
-    add_residual: bool,
-    dtype: torch.dtype,
-) -> None:
-    import torch_xla.core.xla_model as xm
-
-    device = xm.xla_device()
-    current_platform.seed_everything(0)
-    torch.set_default_device("cpu")
-    layer = RMSNorm(hidden_size).to(dtype=dtype)
-    layer.weight.data.normal_(mean=1.0, std=0.1)
-    scale = 1 / (2 * hidden_size)
-    x = torch.randn(num_tokens, hidden_size, dtype=dtype).to(device=device)
-    x *= scale
-    residual = torch.randn_like(x) * scale if add_residual else None
-
-    residual_cpu = residual.cpu() if add_residual else None
-    ref_out = layer.to(device="cpu").forward_native(x.cpu(), residual_cpu)
-    assert x.is_xla, "input tensor under testing is expected to be XLA tensor."
-    out = layer.to(device=device)(x, residual)
-
-    # NOTE(woosuk): LayerNorm operators (including RMS) typically have larger
-    # numerical errors than other operators because they involve reductions.
-    # Therefore, we use a larger tolerance.
-    if add_residual:
-        assert out[0].is_xla, "output tensor is expected to be XLA tensor"
-        torch.testing.assert_close(out[0].cpu(),
-                                   ref_out[0],
-                                   atol=1e-2,
-                                   rtol=1e-2)
-        torch.testing.assert_close(out[1].cpu(),
-                                   ref_out[1],
-                                   atol=1e-2,
-                                   rtol=1e-2)
-    else:
-        assert out.is_xla, "output tensor is expected to be XLA tensor"
-        torch.testing.assert_close(out.cpu(), ref_out, atol=1e-2, rtol=1e-2)
--- a/tests/neuron/1_core/test_logits_processor.py
+++ b/tests/neuron/1_core/test_logits_processor.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import random
-from unittest.mock import patch
-
-import pytest
-import torch
-
-from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.utils import set_random_seed
-from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata
-from vllm.utils import is_pin_memory_available
-
-
-class MockLogitsProcessor(LogitsProcessor):
-
-    def __init__(self, vocab_size: int, scale: float,
-                 fake_logits: torch.Tensor):
-        super().__init__(vocab_size=vocab_size, scale=scale)
-        self.fake_logits = fake_logits.clone()
-
-    def forward(self, *args, **kwargs):
-        with patch(
-                "vllm.model_executor.layers.logits_processor._prune_hidden_states",
-                lambda x, y: x
-        ), patch(
-                "vllm.model_executor.layers.logits_processor.LogitsProcessor._get_logits",
-                lambda *args, **kwargs: self.fake_logits):
-            return super().forward(*args, **kwargs)
-
-
-def _prepare_test(
-        batch_size: int
-) -> tuple[torch.Tensor, torch.Tensor, MockLogitsProcessor]:
-    vocab_size = 32000
-    input_tensor = torch.rand((batch_size, 1024), dtype=torch.float16)
-    fake_logits = torch.full((batch_size, vocab_size),
-                             1e-2,
-                             dtype=input_tensor.dtype)
-    logits_processor = MockLogitsProcessor(32000, 0.5, fake_logits)
-    return input_tensor, fake_logits, logits_processor
-
-
-RANDOM_SEEDS = list(range(8))
-
-
-@pytest.mark.parametrize("seed", RANDOM_SEEDS)
-def test_logits_processors(seed: int):
-    import torch_xla.core.xla_model as xm
-
-    device = xm.xla_device()
-    set_random_seed(seed)
-    torch.set_default_device("cpu")
-    batch_size = random.randint(1, 256)
-    input_tensor, fake_logits, logits_processor = _prepare_test(batch_size)
-
-    # This sample logits processor gives infinite score to the i-th token,
-    # where i is the length of the input sequence.
-    # We therefore expect the output token sequence to be [0, 1, 2, ...]
-    def pick_ith(token_ids, logits):
-        logits[len(token_ids)] = float("inf")
-        return logits
-
-    seq_group_metadata_list = []
-    seq_lens = []
-    for i in range(batch_size):
-        seq_group_metadata_list.append(
-            SequenceGroupMetadata(
-                request_id=f"test_{i}",
-                is_prompt=True,
-                seq_data={0: SequenceData.from_seqs([1, 2, 3])},
-                sampling_params=SamplingParams(temperature=0,
-                                               logits_processors=[pick_ith]),
-                block_tables={0: [1]},
-            ))
-        seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len())
-
-    sampling_metadata = SamplingMetadata.prepare(
-        seq_group_metadata_list,
-        seq_lens,
-        query_lens=seq_lens,
-        device=device,
-        pin_memory=is_pin_memory_available())
-    logits_processor_output = logits_processor(
-        lm_head=None,
-        hidden_states=input_tensor,
-        sampling_metadata=sampling_metadata)
-
-    fake_logits *= logits_processor.scale
-    torch.testing.assert_close(logits_processor_output[:, 1],
-                               fake_logits[:, 1],
-                               rtol=1e-4,
-                               atol=0.0)
--- a/tests/neuron/1_core/test_neuron_model_runner.py
+++ b/tests/neuron/1_core/test_neuron_model_runner.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import os
-from unittest.mock import MagicMock
-
-from vllm.config import VllmConfig
-from vllm.engine.arg_utils import EngineArgs
-from vllm.platforms import current_platform
-from vllm.platforms.neuron import NeuronFramework
-from vllm.sampling_params import SamplingParams
-from vllm.sequence import SequenceData, SequenceGroupMetadata
-from vllm.worker.neuron_model_runner import NeuronModelRunner
-
-os.environ[
-    'VLLM_NEURON_FRAMEWORK'] = NeuronFramework.TRANSFORMERS_NEURONX.value
-
-
-def _create_neuron_model_runner(model: str, *args,
-                                **kwargs) -> NeuronModelRunner:
-    engine_args = EngineArgs(model, *args, **kwargs)
-    engine_config = engine_args.create_engine_config()
-    vllm_config = VllmConfig(
-        model_config=engine_config.model_config,
-        parallel_config=engine_config.parallel_config,
-        scheduler_config=engine_config.scheduler_config,
-        device_config=engine_config.device_config,
-    )
-    neuron_model_runner = NeuronModelRunner(vllm_config=vllm_config)
-    return neuron_model_runner
-
-
-def test_update_neuron_sampling_params_not_full_batch():
-    os.environ["NEURON_ON_DEVICE_SAMPLING_DISABLED"] = "0"
-    model_runner = _create_neuron_model_runner(
-        "facebook/opt-125m",
-        seed=0,
-        dtype="float16",
-        max_num_seqs=2,
-    )
-    assert not model_runner._on_device_sampling_disabled
-    # Test sampling param updating only when TNx is framework
-    # NxDI handles sampling parameter updating inside model
-    if current_platform.use_transformers_neuronx():
-        model_mock = MagicMock()
-        model_runner.model = model_mock
-
-        seq_group_metadata_list = [
-            SequenceGroupMetadata(
-                request_id="test_0",
-                is_prompt=True,
-                seq_data={0: SequenceData.from_seqs([1, 2, 3])},
-                sampling_params=SamplingParams(temperature=0.5,
-                                               top_k=1,
-                                               top_p=0.5),
-                block_tables={0: [1]},
-            )
-        ]
-
-        model_runner.prepare_model_input(seq_group_metadata_list)
-
-        # Index neuron sampling parameters based on block_tables indices.
-        # The first block_id of the sequence 0 is 1, so its parameters are
-        # placed at index 1. So the sampling parameters will be:
-        # Index 0: default sampling parameters
-        # Index 1: sequecne 0's sampling parameters.
-        neuron_sampling_params = (
-            model_runner.model_config.neuron_sampling_params)
-        assert neuron_sampling_params.temperature == [1.0, 0.5]
-        assert neuron_sampling_params.top_k == [
-            model_runner._MAX_NEURON_SAMPLING_TOP_K, 1
-        ]
-        assert neuron_sampling_params.top_p == [1.0, 0.5]
-        model_mock.model.update_generation_config.assert_called_once_with(
-            neuron_sampling_params)
-
-
-def test_update_neuron_sampling_params_full_batch():
-    os.environ["NEURON_ON_DEVICE_SAMPLING_DISABLED"] = "0"
-    model_runner = _create_neuron_model_runner(
-        "facebook/opt-125m",
-        seed=0,
-        dtype="float16",
-        max_num_seqs=2,
-    )
-    assert not model_runner._on_device_sampling_disabled
-
-    # Test sampling param updating only when TNx is framework
-    # NxDI handles sampling parameter updating inside model
-    if current_platform.use_transformers_neuronx():
-        model_mock = MagicMock()
-        model_runner.model = model_mock
-
-        seq_group_metadata_list = [
-            SequenceGroupMetadata(
-                request_id="test_0",
-                is_prompt=True,
-                seq_data={0: SequenceData.from_seqs([1, 2, 3])},
-                sampling_params=SamplingParams(temperature=0.5,
-                                               top_k=1,
-                                               top_p=0.5),
-                block_tables={0: [1]},
-            ),
-            SequenceGroupMetadata(
-                request_id="test_0",
-                is_prompt=True,
-                seq_data={1: SequenceData.from_seqs([4, 5, 6])},
-                sampling_params=SamplingParams(temperature=0.2,
-                                               top_k=2,
-                                               top_p=0.2),
-                block_tables={1: [0]},
-            )
-        ]
-
-        model_runner.prepare_model_input(seq_group_metadata_list)
-
-        # Index neuron sampling parameters based on block_tables indices.
-        # The first block_id of the sequence 0 is 1, so its parameters are
-        # placed at index 1. So the sampling parameters will be:
-        # Index 0: sequence 1's sampling parameters
-        # Index 1: sequecne 0's sampling parameters.
-        neuron_sampling_params = (
-            model_runner.model_config.neuron_sampling_params)
-        assert neuron_sampling_params.temperature == [0.2, 0.5]
-        assert neuron_sampling_params.top_k == [2, 1]
-        assert neuron_sampling_params.top_p == [0.2, 0.5]
-        model_mock.model.update_generation_config.assert_called_once_with(
-            neuron_sampling_params)