Merge tag 'v0.9.2' into v0.9.2-ori

99324e25 · zhuwenwen · cc7f22a8 · a5dd03c1 · 99324e25 · 99324e25
Commit 99324e25 authored Jul 12, 2025 by zhuwenwen
20 changed files
--- a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_custom_ops.py
+++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_custom_ops.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import torch
+from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
+# Register DummyRotaryEmbedding to CustomOP.
+@RotaryEmbedding.register_oot
+class DummyRotaryEmbedding(RotaryEmbedding):
+    """Dummy out-of-tree rotary embedding for the dummy platform plugin.
+    It is registered through ``RotaryEmbedding.register_oot``, tags each
+    instance with a marker attribute, and otherwise defers to the parent.
+    """
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # Marker attribute; the plugin test checks for its presence to
+        # confirm that this subclass was the one instantiated.
+        self.addition_config = True
+    def forward_oot(self, *args,
+                    **kwargs) -> tuple[torch.Tensor, torch.Tensor]:
+        # No custom behavior: delegate to the parent's forward_oot.
+        return super().forward_oot(*args, **kwargs)
--- a/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py
+++ b/tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import TYPE_CHECKING
-from vllm.platforms.cuda import CudaPlatform
+from vllm.platforms.interface import Platform, PlatformEnum
+if TYPE_CHECKING:
+    from vllm.config import VllmConfig
+else:
+    VllmConfig = None
+from vllm import envs
-class DummyPlatform(CudaPlatform):
+class DummyPlatform(Platform):
+    _enum = PlatformEnum.OOT
    device_name = "DummyDevice"
+    device_type: str = "privateuseone"
+    dispatch_key: str = "PrivateUse1"
+    @classmethod
+    def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
+        if envs.VLLM_USE_V1:
+            compilation_config = vllm_config.compilation_config
+            # Activate custom ops for v1.
+            compilation_config.custom_ops = ["all"]
    def get_attn_backend_cls(self, backend_name, head_size, dtype,
                             kv_cache_dtype, block_size, use_v1, use_mla):
        return "vllm_add_dummy_platform.dummy_attention_backend.DummyAttentionBackend"  # noqa E501
\ No newline at end of file
--- a/tests/plugins_tests/test_platform_plugins.py
+++ b/tests/plugins_tests/test_platform_plugins.py
@@ -5,6 +5,7 @@ import pytest
 import torch
 from vllm.attention.selector import get_attn_backend
+from vllm.plugins import load_general_plugins
 from vllm.utils import STR_BACKEND_ENV_VAR, STR_INVALID_VAL
@@ -32,3 +33,16 @@ def test_oot_attention_backend(monkeypatch: pytest.MonkeyPatch):
        m.setenv(STR_BACKEND_ENV_VAR, STR_INVALID_VAL)
        backend = get_attn_backend(16, torch.float16, "auto", 16, False)
        assert backend.get_name() == "Dummy_Backend"
+def test_oot_custom_op(monkeypatch: pytest.MonkeyPatch):
+    """Check that building a RotaryEmbedding yields the plugin's dummy op."""
+    # Presumably this is where the dummy plugin registers its custom op;
+    # the assertions below only hold if that registration took effect.
+    load_general_plugins()
+    from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
+    layer = RotaryEmbedding(16, 16, 16, 16, True, torch.float16)
+    layer_cls_name = layer.__class__.__name__
+    assert layer_cls_name == "DummyRotaryEmbedding", (
+        f"Expected DummyRotaryEmbedding, got {layer_cls_name}, "
+        "possibly because the custom op is not registered correctly.")
+    # The dummy op sets this attribute in its constructor.
+    has_marker = hasattr(layer, "addition_config")
+    assert has_marker, (
+        "Expected DummyRotaryEmbedding to have an 'addition_config' attribute, "
+        "which is set by the custom op.")
--- a/tests/pplx_utils.py
+++ b/tests/pplx_utils.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import dataclasses
-import os
-import traceback
-from typing import Callable
-import torch
-from torch.multiprocessing import (
-    spawn)  # pyright: ignore[reportPrivateImportUsage]
-from typing_extensions import Concatenate, ParamSpec
-P = ParamSpec("P")
-@dataclasses.dataclass
-class ProcessGroupInfo:
-    world_size: int
-    world_local_size: int
-    rank: int
-    node_rank: int
-    local_rank: int
-    device: torch.device
-def _worker_parallel_launch(
-    local_rank: int,
-    world_size: int,
-    world_local_size: int,
-    node_rank: int,
-    init_method: str,
-    worker: Callable[Concatenate[ProcessGroupInfo, P], None],
-    *args: P.args,
-    **kwargs: P.kwargs,
-) -> None:
-    rank = node_rank * world_local_size + local_rank
-    torch.cuda.set_device(local_rank)
-    device = torch.device("cuda", local_rank)
-    torch.distributed.init_process_group(
-        backend="cpu:gloo,cuda:nccl",
-        init_method=init_method,
-        rank=rank,
-        world_size=world_size,
-        device_id=device,
-    )
-    barrier = torch.tensor([rank], device=device)
-    torch.distributed.all_reduce(barrier)
-    try:
-        worker(
-            ProcessGroupInfo(
-                world_size=world_size,
-                world_local_size=world_local_size,
-                rank=rank,
-                node_rank=node_rank,
-                local_rank=local_rank,
-                device=device,
-            ),
-            *args,
-            **kwargs,
-        )
-    except Exception as ex:
-        print(ex)
-        traceback.print_exc()
-        raise
-    finally:
-        torch.distributed.destroy_process_group()
-def parallel_launch(
-    world_size: int,
-    worker: Callable[Concatenate[ProcessGroupInfo, P], None],
-    *args: P.args,
-    **kwargs: P.kwargs,
-) -> None:
-    assert not kwargs
-    spawn(
-        _worker_parallel_launch,
-        args=(
-            world_size,
-            world_size,
-            0,
-            "tcp://localhost:29500",
-            worker,
-        ) + args,
-        nprocs=world_size,
-        join=True,
-    )
-def parallel_launch_from_env(
-    worker: Callable[Concatenate[ProcessGroupInfo, P], None],
-    *args: P.args,
-    **kwargs: P.kwargs,
-) -> None:
-    """
-    Launches a worker function in parallel across all processes in the current
-    environment. The environment must have the following variables set:
-    - WORLD_SIZE: The total number of processes.
-    - WORLD_LOCAL_SIZE: The number of processes on the current node.
-    - NODE_RANK: The rank of the current
-    - MASTER_ADDR: The address of the master process.
-    - MASTER_PORT: The port of the master process.
-    """
-    assert not kwargs
-    world_size = int(os.environ["WORLD_SIZE"])
-    world_local_size = int(os.environ["WORLD_LOCAL_SIZE"])
-    node_rank = int(os.environ["NODE_RANK"])
-    assert "MASTER_ADDR" in os.environ
-    assert "MASTER_PORT" in os.environ
-    spawn(
-        _worker_parallel_launch,
-        args=(
-            world_size,
-            world_local_size,
-            node_rank,
-            "env://",
-            worker,
-        ) + args,
-        nprocs=world_local_size,
-        join=True,
-    )
--- a/tests/quantization/test_bitsandbytes.py
+++ b/tests/quantization/test_bitsandbytes.py
@@ -159,8 +159,9 @@ def test_4bit_bnb_embedding_model(
    with vllm_runner(model_name,
                     task="embed",
                     dtype=dtype,
+                     gpu_memory_utilization=0.5,
                     quantization="bitsandbytes") as vllm_model:
-        vllm_outputs = vllm_model.encode(example_prompts)
+        vllm_outputs = vllm_model.embed(example_prompts)
    check_embeddings_close(
        embeddings_0_lst=hf_outputs,
        embeddings_1_lst=vllm_outputs,

--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -17,7 +17,7 @@ from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tenso
    CompressedTensorsW4A4Fp4, CompressedTensorsW4A16Fp4,
    CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8Fp8,
    CompressedTensorsW8A8Int8, CompressedTensorsW8A16Fp8,
-    CompressedTensorsWNA16)
+    CompressedTensorsWNA16, cutlass_fp4_supported)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
    sparse_cutlass_supported)
 from vllm.platforms import current_platform
@@ -667,7 +667,13 @@ def test_compressed_tensors_nvfp4(vllm_runner, args):
            qkv_proj = layer.self_attn.qkv_proj
            assert isinstance(qkv_proj.quant_method,
                              CompressedTensorsLinearMethod)
-            assert isinstance(qkv_proj.scheme, scheme)
+            if isinstance(qkv_proj.scheme, scheme) or isinstance(
+                    qkv_proj.scheme,
+                    CompressedTensorsW4A16Fp4) and not cutlass_fp4_supported():
+                assert True
+            else:
+                raise AssertionError("FP4 Scheme Mismatch")
            assert qkv_proj.scheme.group_size == 16
        llm.apply_model(check_model)

--- a/tests/quantization/test_register_quantization_config.py
+++ b/tests/quantization/test_register_quantization_config.py
@@ -53,6 +53,7 @@ class CustomQuantConfig(QuantizationConfig):
    def __init__(self, num_bits: int = 8) -> None:
        """Initialize the quantization config."""
+        super().__init__()
        self.num_bits = num_bits
    def get_name(self) -> QuantizationMethods:

--- a/tests/quantization/test_rtn.py
+++ b/tests/quantization/test_rtn.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Copyright © 2025, Oracle and/or its affiliates.
+"""Tests RTN quantization startup and generation, 
+doesn't test correctness
+"""
+import pytest
+from tests.quantization.utils import is_quant_method_supported
+MODELS = ["microsoft/Phi-3-mini-4k-instruct"]
+@pytest.mark.skipif(not is_quant_method_supported("rtn"),
+                    reason="RTN is not supported on this GPU type.")
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("max_tokens", [10])
+def test_model_rtn_startup(
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    model: str,
+    dtype: str,
+    max_tokens: int,
+) -> None:
+    """Smoke test: start the model with RTN quantization and generate."""
+    # Only startup and greedy generation are exercised; the generated
+    # outputs are intentionally not checked.
+    runner = vllm_runner(model, dtype=dtype, quantization="rtn")
+    with runner as rtn_model:
+        rtn_model.generate_greedy(example_prompts, max_tokens)
--- a/tests/quantization/test_torchao.py
+++ b/tests/quantization/test_torchao.py
@@ -60,5 +60,20 @@ def test_opt_125m_int4wo_model_per_module_quant(vllm_runner):
        print(output)
+@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
+def test_qwenvl_int8wo_model_loading_with_params(vllm_runner):
+    """Check that a torchao-quantized Qwen2.5-VL checkpoint loads and runs."""
+    torch._dynamo.reset()
+    prompts = ["The capital of France is"]
+    # Runner arguments, including an explicit map location for loading.
+    runner_kwargs = {
+        "model_name": "mobicham/Qwen2.5-VL-3B-Instruct_int8wo_ao",
+        "quantization": "torchao",
+        "dtype": "bfloat16",
+        "pt_load_map_location": "cuda:0",
+    }
+    with vllm_runner(**runner_kwargs) as llm:
+        output = llm.generate_greedy(prompts, max_tokens=32)
+        # Any non-empty result counts as success here.
+        assert output
+        print(output)
 if __name__ == "__main__":
    pytest.main([__file__])
--- a/tests/samplers/test_typical_acceptance_sampler.py
+++ b/tests/samplers/test_typical_acceptance_sampler.py
@@ -248,7 +248,7 @@ def test_temperature_zero_target_distribution(seed: int, device: str):
                                    size=(batch_size, 1),
                                    dtype=torch.int64)
    # The target probaility distribution is a temperature zero distribution
-    # with zero entroy. Since our draft token ids don't match the probability
+    # with zero entropy. Since our draft token ids don't match the probability
    # 1.0 tokens in the target distribution we will reject all of them and
    # fallback to the greedy sampling for selecting 1 token for each sequence.
    # Verify the same.

--- a/tests/spec_decode/e2e/test_eagle_correctness.py
+++ b/tests/spec_decode/e2e/test_eagle_correctness.py
@@ -18,7 +18,7 @@ However, we still need to verify below scenario could be passed:
    * Test greedy equality under various number of speculative tokens.
 With those tests, we can say at least, EAGLE would not break the
-correctess for the target model outputs.
+correctness for the target model outputs.
 """
 import pytest
@@ -370,6 +370,10 @@ def test_llama2_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
 @pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
+        # 2 for small prompt, 256//16 for generated.
+        "num_gpu_blocks_override": 2 + 256 // 16,
+        "max_model_len": (2 + 256 // 16) * 16,
        # Skip cuda graph recording for fast test.
        "enforce_eager": True,
@@ -420,6 +424,10 @@ def test_llama3_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
 @pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
+        # 2 for small prompt, 256//16 for generated.
+        "num_gpu_blocks_override": 2 + 256 // 16,
+        "max_model_len": (2 + 256 // 16) * 16,
        # Skip cuda graph recording for fast test.
        "enforce_eager": True,

--- a/tests/spec_decode/e2e/test_integration.py
+++ b/tests/spec_decode/e2e/test_integration.py
@@ -14,10 +14,13 @@ MAIN_MODEL = "JackFram/llama-68m"
 @pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
+        "model_name": "JackFram/llama-68m",
        # Verify equality when cuda graphs allowed.
        "enforce_eager": False,
-        "model_name": "JackFram/llama-68m",
+        # The original model is float32, keep it for numerical stability.
+        "dtype": "float32",
    }])
 @pytest.mark.parametrize(
    "per_test_common_llm_kwargs",
@@ -59,6 +62,9 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
        # Skip cuda graph recording for fast test.
        "enforce_eager": True,
+        # The original model is float32, keep it for numerical stability.
+        "dtype": "float32",
    }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [])
 @pytest.mark.parametrize(
@@ -117,6 +123,9 @@ def test_speculative_model_quantization_config(vllm_runner, common_llm_kwargs,
        # Skip cuda graph recording for fast test.
        "enforce_eager": True,
+        # The original model is float32, keep it for numerical stability.
+        "dtype": "float32",
    }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])

--- a/tests/spec_decode/e2e/test_logprobs.py
+++ b/tests/spec_decode/e2e/test_logprobs.py
@@ -17,7 +17,10 @@ from .conftest import run_equality_correctness_test
        "model_name": "JackFram/llama-160m",
        # Skip cuda graph recording for fast test.
-        "enforce_eager": True
+        "enforce_eager": True,
+        # The original model is float32, keep it for numerical stability.
+        "dtype": "float32",
    }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -75,6 +78,9 @@ def test_logprobs_equality(vllm_runner, common_llm_kwargs,
        # Skip cuda graph recording for fast test.
        "enforce_eager": True,
+        # The original model is float32, keep it for numerical stability.
+        "dtype": "float32",
    }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -128,6 +134,9 @@ def test_logprobs_different_k(vllm_runner, common_llm_kwargs,
        # Skip cuda graph recording for fast test.
        "enforce_eager": True,
+        # The original model is float32, keep it for numerical stability.
+        "dtype": "float32",
    }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -182,6 +191,9 @@ def test_logprobs_when_skip_speculation(vllm_runner, common_llm_kwargs,
        # Skip cuda graph recording for fast test.
        "enforce_eager": True,
+        # The original model is float32, keep it for numerical stability.
+        "dtype": "float32",
    }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -256,8 +268,12 @@ def test_logprobs_temp_1(vllm_runner, common_llm_kwargs,
    "common_llm_kwargs",
    [{
        "model_name": "JackFram/llama-160m",
        # Skip cuda graph recording for fast test.
        "enforce_eager": True,
+        # The original model is float32, keep it for numerical stability.
+        "dtype": "float32",
    }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])

--- a/tests/spec_decode/e2e/test_medusa_correctness.py
+++ b/tests/spec_decode/e2e/test_medusa_correctness.py
@@ -18,7 +18,7 @@ However, we still need to verify below scenario could be passed:
    * Test greedy equality under various number of speculative tokens.
 With those tests, we can say at least, Medusa would not break the
-correctess for the target model outputs.
+correctness for the target model outputs.
 """
 import pytest

--- a/tests/spec_decode/e2e/test_mlp_correctness.py
+++ b/tests/spec_decode/e2e/test_mlp_correctness.py
@@ -494,6 +494,9 @@ def test_mlp_disable_queue(vllm_runner, common_llm_kwargs,
        # Skip cuda graph recording for fast test.
        "enforce_eager": True,
+        # Precision
+        "dtype": PRECISION,
    }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])

--- a/tests/spec_decode/e2e/test_mtp_correctness.py
+++ b/tests/spec_decode/e2e/test_mtp_correctness.py
@@ -18,7 +18,7 @@ However, we still need to verify below scenario could be passed:
    * Test greedy equality under various number of speculative tokens.
 With those tests, we can say at least, mtp would not break the
-correctess for the target model outputs.
+correctness for the target model outputs.
 """
 import pytest

--- a/tests/spec_decode/e2e/test_multistep_correctness.py
+++ b/tests/spec_decode/e2e/test_multistep_correctness.py
@@ -57,6 +57,9 @@ from .conftest import (get_output_from_llm_generator,
        # Skip cuda graph recording for fast test.
        "enforce_eager": True,
+        # The original model is float32, keep it for numerical stability.
+        "dtype": "float32",
    }])
 @pytest.mark.parametrize(
    "per_test_common_llm_kwargs",
@@ -139,6 +142,9 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator,
        # Print spec metrics.
        "disable_log_stats": False,
+        # The original model is float32, keep it for numerical stability.
+        "dtype": "float32",
    }])
 @pytest.mark.parametrize(
    "per_test_common_llm_kwargs",
@@ -216,6 +222,9 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
        # Print spec metrics.
        "disable_log_stats": False,
+        # The original model is float32, keep it for numerical stability.
+        "dtype": "float32",
    }])
 @pytest.mark.parametrize(
    "per_test_common_llm_kwargs",
@@ -279,6 +288,9 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs(
    [{
        # Skip cuda graph recording for fast test.
        "enforce_eager": True,
+        # The original model is float32, keep it for numerical stability.
+        "dtype": "float32",
    }])
 @pytest.mark.parametrize(
    "per_test_common_llm_kwargs",
@@ -464,6 +476,8 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs(
        # Skip cuda graph recording for fast test.
        "enforce_eager": True,
+        # The original model is float32, keep it for numerical stability.
+        "dtype": "float32",
    }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [
    {
@@ -523,6 +537,8 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption(
        # Skip cuda graph recording for fast test.
        "enforce_eager": True,
+        # The original model is float32, keep it for numerical stability.
+        "dtype": "float32",
    }])
 @pytest.mark.parametrize(
    "per_test_common_llm_kwargs",
@@ -589,6 +605,8 @@ def test_spec_decode_different_block_size(vllm_runner, common_llm_kwargs,
        # Skip cuda graph recording for fast test.
        "enforce_eager": True,
+        # The original model is float32, keep it for numerical stability.
+        "dtype": "float32",
    }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -655,6 +673,8 @@ def test_skip_speculation(vllm_runner, common_llm_kwargs,
        # Skip cuda graph recording for fast test.
        "enforce_eager": True,
+        # The original model is float32, keep it for numerical stability.
+        "dtype": "float32",
    }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -706,6 +726,8 @@ def test_disable_speculation(vllm_runner, common_llm_kwargs,
        # Skip cuda graph recording for fast test.
        "enforce_eager": True,
+        # The original model is float32, keep it for numerical stability.
+        "dtype": "float32",
    }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -763,6 +785,8 @@ def test_many_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
        # Skip cuda graph recording for fast test.
        "enforce_eager": True,
+        # The original model is float32, keep it for numerical stability.
+        "dtype": "float32",
    }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])

--- a/tests/spec_decode/e2e/test_ngram_correctness.py
+++ b/tests/spec_decode/e2e/test_ngram_correctness.py
@@ -22,8 +22,8 @@ However, we still need to verify below scenario could be passed:
    * Test greedy equality under preemption
    * Test greedy equality under various ngram sizes / speculative sizes
-With those tests, we can say at least, ngram spec would not break the correctess
+With those tests, we can say at least, ngram spec would not break the
-for the target model outputs.
+correctness for the target model outputs.
 """
 import pytest
@@ -40,6 +40,9 @@ from .conftest import run_equality_correctness_test
        # Print spec metrics.
        "disable_log_stats": False,
+        # The original model is float32, keep it for numerical stability.
+        "dtype": "float32",
    }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [
    {
@@ -97,6 +100,9 @@ def test_ngram_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
        # Print spec metrics.
        "disable_log_stats": False,
+        # The original model is float32, keep it for numerical stability.
+        "dtype": "float32",
    }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [
    {
@@ -160,6 +166,9 @@ def test_ngram_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
        # Skip cuda graph recording for fast test.
        "enforce_eager": True,
+        # The original model is float32, keep it for numerical stability.
+        "dtype": "float32",
    }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [
    {
@@ -221,6 +230,9 @@ def test_ngram_e2e_greedy_correctness_with_preemption(
        # Skip cuda graph recording for fast test.
        "enforce_eager": True,
+        # The original model is float32, keep it for numerical stability.
+        "dtype": "float32",
    }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -281,6 +293,9 @@ def test_ngram_different_k(vllm_runner, common_llm_kwargs,
        # Skip cuda graph recording for fast test.
        "enforce_eager": True,
+        # The original model is float32, keep it for numerical stability.
+        "dtype": "float32",
    }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -337,6 +352,9 @@ def test_ngram_disable_queue(vllm_runner, common_llm_kwargs,
        # Skip cuda graph recording for fast test.
        "enforce_eager": True,
+        # The original model is float32, keep it for numerical stability.
+        "dtype": "float32",
    }])
 @pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
 @pytest.mark.parametrize("baseline_llm_kwargs", [{}])

--- a/tests/standalone_tests/pytorch_nightly_dependency.sh
+++ b/tests/standalone_tests/pytorch_nightly_dependency.sh
+#!/bin/bash
+# This script tests if the nightly torch packages are not overridden by the dependencies
+# NOTE: bash is required because the script uses the `source` builtin below,
+# which POSIX sh implementations such as dash do not provide.
+# Abort on the first failing command and echo each command as it runs.
+set -e
+set -x
+cd /vllm-workspace/
+# Start from a fresh virtual environment.
+rm -rf .venv
+uv venv .venv
+source .venv/bin/activate
+# check the environment
+uv pip freeze
+echo ">>> Installing nightly torch packages"
+uv pip install --quiet torch torchvision torchaudio --pre --extra-index-url https://download.pytorch.org/whl/nightly/cu128
+echo ">>> Capturing torch-related versions before requirements install"
+uv pip freeze | grep -E '^torch|^torchvision|^torchaudio' | sort > before.txt
+echo "Before:"
+cat before.txt
+echo ">>> Installing requirements/nightly_torch_test.txt"
+uv pip install --quiet -r requirements/nightly_torch_test.txt
+echo ">>> Capturing torch-related versions after requirements install"
+uv pip freeze | grep -E '^torch|^torchvision|^torchaudio' | sort > after.txt
+echo "After:"
+cat after.txt
+echo ">>> Comparing versions"
+# Any difference between the two snapshots means the requirements install
+# replaced at least one torch-related package.
+if diff before.txt after.txt; then
+  echo "torch version not overridden."
+else
+  echo "torch version overridden by nightly_torch_test.txt, \
+  if the dependency is not triggered by the pytorch nightly test,\
+  please add the dependency to the list 'white_list'  in tools/generate_nightly_torch_test.py"
+  exit 1
+fi
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -2,49 +2,16 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import MISSING, Field, asdict, dataclass, field
-from typing import Literal, Union
 import pytest
 from vllm.compilation.backends import VllmBackend
 from vllm.config import (LoadConfig, ModelConfig, PoolerConfig, VllmConfig,
-                         config, get_field)
+                         get_field)
 from vllm.model_executor.layers.pooler import PoolingType
 from vllm.platforms import current_platform
-class TestConfig1:
-    pass
-@dataclass
-class TestConfig2:
-    a: int
-    """docstring"""
-@dataclass
-class TestConfig3:
-    a: int = 1
-@dataclass
-class TestConfig4:
-    a: Union[Literal[1], Literal[2]] = 1
-    """docstring"""
-@pytest.mark.parametrize(("test_config", "expected_error"), [
-    (TestConfig1, "must be a dataclass"),
-    (TestConfig2, "must have a default"),
-    (TestConfig3, "must have a docstring"),
-    (TestConfig4, "must use a single Literal"),
-])
-def test_config(test_config, expected_error):
-    with pytest.raises(Exception, match=expected_error):
-        config(test_config)
 def test_compile_config_repr_succeeds():
    # setup: VllmBackend mutates the config object
    config = VllmConfig()
@@ -57,23 +24,23 @@ def test_compile_config_repr_succeeds():
    assert 'inductor_passes' in val
-def test_get_field():
+@dataclass
+class _TestConfigFields:
+    a: int
+    b: dict = field(default_factory=dict)
+    c: str = "default"
-    @dataclass
-    class TestConfig:
-        a: int
-        b: dict = field(default_factory=dict)
-        c: str = "default"
+def test_get_field():
    with pytest.raises(ValueError):
-        get_field(TestConfig, "a")
+        get_field(_TestConfigFields, "a")
-    b = get_field(TestConfig, "b")
+    b = get_field(_TestConfigFields, "b")
    assert isinstance(b, Field)
    assert b.default is MISSING
    assert b.default_factory is dict
-    c = get_field(TestConfig, "c")
+    c = get_field(_TestConfigFields, "c")
    assert isinstance(c, Field)
    assert c.default == "default"
    assert c.default_factory is MISSING
@@ -85,7 +52,7 @@ def test_get_field():
        ("distilbert/distilgpt2", "generate", "generate"),
        ("intfloat/multilingual-e5-small", "pooling", "embed"),
        ("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify"),
-        ("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "score"),
+        ("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "classify"),
        ("Qwen/Qwen2.5-Math-RM-72B", "pooling", "reward"),
        ("openai/whisper-small", "transcription", "transcription"),
    ],
@@ -105,6 +72,32 @@ def test_auto_task(model_id, expected_runner_type, expected_task):
    assert config.task == expected_task
+@pytest.mark.parametrize(
+    ("model_id", "expected_runner_type", "expected_task"),
+    [
+        ("distilbert/distilgpt2", "pooling", "embed"),
+        ("intfloat/multilingual-e5-small", "pooling", "embed"),
+        ("jason9693/Qwen2.5-1.5B-apeach", "pooling", "classify"),
+        ("cross-encoder/ms-marco-MiniLM-L-6-v2", "pooling", "classify"),
+        ("Qwen/Qwen2.5-Math-RM-72B", "pooling", "embed"),
+        ("openai/whisper-small", "pooling", "embed"),
+    ],
+)
+def test_score_task(model_id, expected_runner_type, expected_task):
+    """Check the runner type and task a config built with task="score" has."""
+    model_config = ModelConfig(
+        model_id,
+        task="score",
+        tokenizer=model_id,
+        tokenizer_mode="auto",
+        trust_remote_code=False,
+        seed=0,
+        dtype="float16",
+    )
+    # Compare both resolved values against the parametrized expectations.
+    resolved = (model_config.runner_type, model_config.task)
+    assert resolved == (expected_runner_type, expected_task)
 @pytest.mark.parametrize(("model_id", "bad_task"), [
    ("Qwen/Qwen2.5-Math-RM-72B", "generate"),
 ])
@@ -438,3 +431,33 @@ def test_load_config_pt_load_map_location(pt_load_map_location):
    config = VllmConfig(load_config=load_config)
    assert config.load_config.pt_load_map_location == pt_load_map_location
+@pytest.mark.parametrize(
+    ("model_id", "max_model_len", "expected_max_len", "should_raise"), [
+        ("BAAI/bge-reranker-base", None, 512, False),
+        ("BAAI/bge-reranker-base", 256, 256, False),
+        ("BAAI/bge-reranker-base", 513, 512, True),
+        ("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", None, 131072, False),
+        ("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", 131073, 131072, True),
+    ])
+def test_get_and_verify_max_len(model_id, max_model_len, expected_max_len,
+                                should_raise):
+    """Check get_and_verify_max_len returns the expected length or raises."""
+    cfg = ModelConfig(
+        model_id,
+        task="auto",
+        tokenizer=model_id,
+        tokenizer_mode="auto",
+        trust_remote_code=False,
+        seed=0,
+        dtype="float16",
+        revision=None,
+    )
+    # Accepted lengths: the call must return the expected value.
+    if not should_raise:
+        assert cfg.get_and_verify_max_len(max_model_len) == expected_max_len
+        return
+    # Rejected lengths: the call must raise ValueError.
+    with pytest.raises(ValueError):
+        cfg.get_and_verify_max_len(max_model_len)