Merge tag 'v0.9.0' into v0.9.0-ori

7a985548 · zhuwenwen · 45d3785c · dc1440cf · 7a985548 · 7a985548
Commit 7a985548 authored May 22, 2025 by zhuwenwen
20 changed files
--- a/tests/multimodal/test_utils.py
+++ b/tests/multimodal/test_utils.py
@@ -26,6 +26,11 @@ TEST_IMAGE_URLS = [
    "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
 ]

+TEST_VIDEO_URLS = [
+    "https://www.bogotobogo.com/python/OpenCV_Python/images/mean_shift_tracking/slow_traffic_small.mp4",
+    "https://filesamples.com/samples/video/avi/sample_640x360.avi",
+]
+

 @pytest.fixture(scope="module")
 def url_images() -> dict[str, Image.Image]:
@@ -134,6 +139,18 @@ async def test_fetch_image_local_files(image_url: str):
                f"file://{temp_dir}/../{os.path.basename(image_url)}")


+@pytest.mark.asyncio
+@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
+@pytest.mark.parametrize("num_frames", [-1, 32, 1800])
+async def test_fetch_video_http(video_url: str, num_frames: int):
+    connector = MediaConnector()
+
+    video_sync = connector.fetch_video(video_url, num_frames=num_frames)
+    video_async = await connector.fetch_video_async(video_url,
+                                                    num_frames=num_frames)
+    assert np.array_equal(video_sync, video_async)
+
+
 # Used for the next two tests related to `merge_and_sort_multimodal_metadata`.
 class TestCase(NamedTuple):
    mm_positions: "MultiModalPlaceholderDict"

--- a/tests/multimodal/test_video.py
+++ b/tests/multimodal/test_video.py
+# SPDX-License-Identifier: Apache-2.0
+import numpy as np
+import numpy.typing as npt
+import pytest
+
+from vllm.multimodal.video import VIDEO_LOADER_REGISTRY, VideoLoader
+
+NUM_FRAMES = 10
+FAKE_OUTPUT_1 = np.random.rand(NUM_FRAMES, 1280, 720, 3)
+FAKE_OUTPUT_2 = np.random.rand(NUM_FRAMES, 1280, 720, 3)
+
+
+@VIDEO_LOADER_REGISTRY.register("test_video_loader_1")
+class TestVideoLoader1(VideoLoader):
+
+    @classmethod
+    def load_bytes(cls, data: bytes, num_frames: int = -1) -> npt.NDArray:
+        return FAKE_OUTPUT_1
+
+
+@VIDEO_LOADER_REGISTRY.register("test_video_loader_2")
+class TestVideoLoader2(VideoLoader):
+
+    @classmethod
+    def load_bytes(cls, data: bytes, num_frames: int = -1) -> npt.NDArray:
+        return FAKE_OUTPUT_2
+
+
+def test_video_loader_registry():
+    custom_loader_1 = VIDEO_LOADER_REGISTRY.load("test_video_loader_1")
+    output_1 = custom_loader_1.load_bytes(b"test")
+    np.testing.assert_array_equal(output_1, FAKE_OUTPUT_1)
+
+    custom_loader_2 = VIDEO_LOADER_REGISTRY.load("test_video_loader_2")
+    output_2 = custom_loader_2.load_bytes(b"test")
+    np.testing.assert_array_equal(output_2, FAKE_OUTPUT_2)
+
+
+def test_video_loader_type_doesnt_exist():
+    with pytest.raises(AssertionError):
+        VIDEO_LOADER_REGISTRY.load("non_existing_video_loader")
--- a/tests/neuron/1_core/test_neuron_model_runner.py
+++ b/tests/neuron/1_core/test_neuron_model_runner.py
+# SPDX-License-Identifier: Apache-2.0
+import os
+from unittest.mock import MagicMock
+
+from vllm.config import VllmConfig
+from vllm.engine.arg_utils import EngineArgs
+from vllm.platforms import current_platform
+from vllm.platforms.neuron import NeuronFramework
+from vllm.sampling_params import SamplingParams
+from vllm.sequence import SequenceData, SequenceGroupMetadata
+from vllm.worker.neuron_model_runner import NeuronModelRunner
+
+os.environ[
+    'VLLM_NEURON_FRAMEWORK'] = NeuronFramework.TRANSFORMERS_NEURONX.value
+
+
+def _create_neuron_model_runner(model: str, *args,
+                                **kwargs) -> NeuronModelRunner:
+    engine_args = EngineArgs(model, *args, **kwargs)
+    engine_config = engine_args.create_engine_config()
+    vllm_config = VllmConfig(
+        model_config=engine_config.model_config,
+        parallel_config=engine_config.parallel_config,
+        scheduler_config=engine_config.scheduler_config,
+        device_config=engine_config.device_config,
+    )
+    neuron_model_runner = NeuronModelRunner(vllm_config=vllm_config)
+    return neuron_model_runner
+
+
+def test_update_neuron_sampling_params_not_full_batch():
+    os.environ["NEURON_ON_DEVICE_SAMPLING_DISABLED"] = "0"
+    model_runner = _create_neuron_model_runner(
+        "facebook/opt-125m",
+        seed=0,
+        dtype="float16",
+        max_num_seqs=2,
+    )
+    assert not model_runner._on_device_sampling_disabled
+    # Test sampling param updating only when TNx is framework
+    # NxDI handles sampling parameter updating inside model
+    if current_platform.use_transformers_neuronx():
+        model_mock = MagicMock()
+        model_runner.model = model_mock
+
+        seq_group_metadata_list = [
+            SequenceGroupMetadata(
+                request_id="test_0",
+                is_prompt=True,
+                seq_data={0: SequenceData.from_seqs([1, 2, 3])},
+                sampling_params=SamplingParams(temperature=0.5,
+                                               top_k=1,
+                                               top_p=0.5),
+                block_tables={0: [1]},
+            )
+        ]
+
+        model_runner.prepare_model_input(seq_group_metadata_list)
+
+        # Index neuron sampling parameters based on block_tables indices.
+        # The first block_id of the sequence 0 is 1, so its parameters are
+        # placed at index 1. So the sampling parameters will be:
+        # Index 0: default sampling parameters
+        # Index 1: sequecne 0's sampling parameters.
+        neuron_sampling_params = (
+            model_runner.model_config.neuron_sampling_params)
+        assert neuron_sampling_params.temperature == [1.0, 0.5]
+        assert neuron_sampling_params.top_k == [
+            model_runner._MAX_NEURON_SAMPLING_TOP_K, 1
+        ]
+        assert neuron_sampling_params.top_p == [1.0, 0.5]
+        model_mock.model.update_generation_config.assert_called_once_with(
+            neuron_sampling_params)
+
+
+def test_update_neuron_sampling_params_full_batch():
+    os.environ["NEURON_ON_DEVICE_SAMPLING_DISABLED"] = "0"
+    model_runner = _create_neuron_model_runner(
+        "facebook/opt-125m",
+        seed=0,
+        dtype="float16",
+        max_num_seqs=2,
+    )
+    assert not model_runner._on_device_sampling_disabled
+
+    # Test sampling param updating only when TNx is framework
+    # NxDI handles sampling parameter updating inside model
+    if current_platform.use_transformers_neuronx():
+        model_mock = MagicMock()
+        model_runner.model = model_mock
+
+        seq_group_metadata_list = [
+            SequenceGroupMetadata(
+                request_id="test_0",
+                is_prompt=True,
+                seq_data={0: SequenceData.from_seqs([1, 2, 3])},
+                sampling_params=SamplingParams(temperature=0.5,
+                                               top_k=1,
+                                               top_p=0.5),
+                block_tables={0: [1]},
+            ),
+            SequenceGroupMetadata(
+                request_id="test_0",
+                is_prompt=True,
+                seq_data={1: SequenceData.from_seqs([4, 5, 6])},
+                sampling_params=SamplingParams(temperature=0.2,
+                                               top_k=2,
+                                               top_p=0.2),
+                block_tables={1: [0]},
+            )
+        ]
+
+        model_runner.prepare_model_input(seq_group_metadata_list)
+
+        # Index neuron sampling parameters based on block_tables indices.
+        # The first block_id of the sequence 0 is 1, so its parameters are
+        # placed at index 1. So the sampling parameters will be:
+        # Index 0: sequence 1's sampling parameters
+        # Index 1: sequecne 0's sampling parameters.
+        neuron_sampling_params = (
+            model_runner.model_config.neuron_sampling_params)
+        assert neuron_sampling_params.temperature == [0.2, 0.5]
+        assert neuron_sampling_params.top_k == [2, 1]
+        assert neuron_sampling_params.top_p == [0.2, 0.5]
+        model_mock.model.update_generation_config.assert_called_once_with(
+            neuron_sampling_params)
--- a/tests/neuron/1_core/test_rotary_embedding.py
+++ b/tests/neuron/1_core/test_rotary_embedding.py
@@ -11,14 +11,16 @@ from vllm.platforms import current_platform


 @pytest.mark.parametrize(
-    "max_position,is_neox_style,rotary_dim,head_size,seq_len", [
-        (16, False, 32, 32, 1024),
-        (16, False, 32, 128, 1024),
-        (16, True, 32, 32, 1024),
-        (16, True, 32, 128, 1024),
+    "max_position,is_neox_style,rotary_dim,head_size,seq_len,use_key", [
+        (16, False, 32, 32, 1024, True),
+        (16, False, 32, 128, 1024, True),
+        (16, True, 32, 32, 1024, True),
+        (16, True, 32, 128, 1024, True),
+        (16, False, 32, 128, 1024, False),
+        (16, True, 32, 128, 1024, False),
    ])
 def test_rotary_embedding_opcheck(max_position, is_neox_style, rotary_dim,
-                                  head_size, seq_len):
+                                  head_size, seq_len, use_key):
    import torch_xla.core.xla_model as xm

    device = xm.xla_device()
@@ -40,19 +42,26 @@ def test_rotary_embedding_opcheck(max_position, is_neox_style, rotary_dim,
                        num_heads * head_size,
                        dtype=torch.float32,
                        device="cpu")
-    key = torch.randn_like(query)
-
+    key = torch.randn_like(query) if use_key else None
    assert positions.is_cpu, \
        "reference input tensor is expected to be CPU tensor."
    ref_query, ref_key = rot.to(device="cpu").forward_native(
        positions, query, key)
    out_query, out_key = rot.to(device=device).forward_neuron(
        positions.to(device=device), query.to(device=device),
-        key.to(device=device))
-    assert out_query.is_xla and out_key.is_xla, \
-        "output tensor is expected to be XLA tensor"
+        key.to(device=device) if key is not None else None)
+    if use_key:
+        assert out_query.is_xla and out_key.is_xla, \
+            "output tensor is expected to be XLA tensor"
+        torch.testing.assert_close(out_key.cpu(),
+                                   ref_key,
+                                   atol=1e-2,
+                                   rtol=1e-2)
+    else:
+        assert out_key is None, "expected returned key to be None"
+        assert out_query.is_xla, \
+            "output tensor is expected to be XLA tensor"
    torch.testing.assert_close(out_query.cpu(),
                               ref_query,
                               atol=1e-2,
                               rtol=1e-2)
-    torch.testing.assert_close(out_key.cpu(), ref_key, atol=1e-2, rtol=1e-2)
--- a/tests/neuron/2_core/test_mistral.py
+++ b/tests/neuron/2_core/test_mistral.py
+# SPDX-License-Identifier: Apache-2.0
+
+from vllm import LLM, SamplingParams
+
+
+def test_mistral():
+    llm = LLM(model="mistralai/Mistral-7B-v0.1",
+              tensor_parallel_size=2,
+              max_num_seqs=4,
+              max_model_len=512,
+              use_v2_block_manager=True,
+              override_neuron_config={
+                  "sequence_parallel_enabled": False,
+                  "skip_warmup": True
+              },
+              device="neuron")
+
+    prompts = [
+        "The president of the United States is",
+        "The capital of France is",
+    ]
+    outputs = llm.generate(prompts, SamplingParams(top_k=1))
+
+    expected_outputs = [
+        " the most powerful person in the world. He is the head of state "
+        "and head",
+        " a city of many faces. It is a city of history, culture, art"
+    ]
+
+    for expected_output, output in zip(expected_outputs, outputs):
+        generated_text = output.outputs[0].text
+        assert (expected_output == generated_text)
--- a/tests/models/embedding/vision_language/__init__.py
+++ b/tests/models/embedding/vision_language/__init__.py
--- a/tests/plugins/lora_resolvers/test_filesystem_resolver.py
+++ b/tests/plugins/lora_resolvers/test_filesystem_resolver.py
+# SPDX-License-Identifier: Apache-2.0
+import os
+import shutil
+
+import pytest
+from huggingface_hub import snapshot_download
+
+from vllm.plugins.lora_resolvers.filesystem_resolver import FilesystemResolver
+
+MODEL_NAME = "mistralai/Mistral-7B-v0.1"
+LORA_NAME = "typeof/zephyr-7b-beta-lora"
+PA_NAME = "swapnilbp/llama_tweet_ptune"
+
+
+@pytest.fixture(scope='module')
+def adapter_cache(request, tmpdir_factory):
+    # Create dir that mimics the structure of the adapter cache
+    adapter_cache = tmpdir_factory.mktemp(
+        request.module.__name__) / "adapter_cache"
+    return adapter_cache
+
+
+@pytest.fixture(scope="module")
+def zephyr_lora_files():
+    return snapshot_download(repo_id=LORA_NAME)
+
+
+@pytest.fixture(scope="module")
+def pa_files():
+    return snapshot_download(repo_id=PA_NAME)
+
+
+@pytest.mark.asyncio
+async def test_filesystem_resolver(adapter_cache, zephyr_lora_files):
+    model_files = adapter_cache / LORA_NAME
+    shutil.copytree(zephyr_lora_files, model_files)
+
+    fs_resolver = FilesystemResolver(adapter_cache)
+    assert fs_resolver is not None
+
+    lora_request = await fs_resolver.resolve_lora(MODEL_NAME, LORA_NAME)
+    assert lora_request is not None
+    assert lora_request.lora_name == LORA_NAME
+    assert lora_request.lora_path == os.path.join(adapter_cache, LORA_NAME)
+
+
+@pytest.mark.asyncio
+async def test_missing_adapter(adapter_cache):
+    fs_resolver = FilesystemResolver(adapter_cache)
+    assert fs_resolver is not None
+
+    missing_lora_request = await fs_resolver.resolve_lora(MODEL_NAME, "foobar")
+    assert missing_lora_request is None
+
+
+@pytest.mark.asyncio
+async def test_nonlora_adapter(adapter_cache, pa_files):
+    model_files = adapter_cache / PA_NAME
+    shutil.copytree(pa_files, model_files)
+
+    fs_resolver = FilesystemResolver(adapter_cache)
+    assert fs_resolver is not None
+
+    pa_request = await fs_resolver.resolve_lora(MODEL_NAME, PA_NAME)
+    assert pa_request is None
--- a/tests/quantization/test_auto_round.py
+++ b/tests/quantization/test_auto_round.py
+# SPDX-License-Identifier: Apache-2.0
+"""Test model set-up and inference for quantized HF models supported
+ on the AutoRound.
+
+ Validating the configuration and printing results for manual checking.
+
+ Run `pytest tests/quantization/test_auto_round.py`.
+"""
+
+import pytest
+
+from vllm.platforms import current_platform
+
+MODELS = [
+    "OPEA/Qwen2.5-0.5B-Instruct-int4-sym-inc",  ##auto_round:auto_gptq
+    "Intel/Qwen2-0.5B-Instruct-int4-sym-AutoRound"  ##auto_round:auto_awq
+]
+
+
+@pytest.mark.skipif(not current_platform.is_cpu()
+                    and not current_platform.is_xpu()
+                    and not current_platform.is_cuda(),
+                    reason="only supports CPU/XPU/CUDA backend.")
+@pytest.mark.parametrize("model", MODELS)
+def test_auto_round(vllm_runner, model):
+    with vllm_runner(model) as llm:
+        output = llm.generate_greedy(["The capital of France is"],
+                                     max_tokens=8)
+    assert output
+    print(f"{output[0][1]}")
--- a/tests/quantization/test_bitsandbytes.py
+++ b/tests/quantization/test_bitsandbytes.py
@@ -8,9 +8,11 @@ import gc

 import pytest
 import torch
+from transformers import BitsAndBytesConfig

 from tests.quantization.utils import is_quant_method_supported

+from ..models.utils import check_embeddings_close
 from ..utils import compare_two_settings, create_new_process_for_each_test

 models_4bit_to_test = [
@@ -19,6 +21,10 @@ models_4bit_to_test = [
     "quantize inflight model with both HF and Mistral format weights")
 ]

+models_4bit_to_embedding_test = [
+    ("intfloat/e5-mistral-7b-instruct", "quantize embedding model inflight"),
+]
+
 models_pre_qaunt_4bit_to_test = [
    ('PrunaAI/Einstein-v6.1-Llama3-8B-bnb-4bit-smashed',
     'read pre-quantized 4-bit FP4 model'),
@@ -31,6 +37,12 @@ models_pre_quant_8bit_to_test = [
    ("yec019/fbopt-350m-8bit", "read pre-quantized 8-bit opt model"),
 ]

+models_pre_quant_8bit_to_test = [
+    ('meta-llama/Llama-Guard-3-8B-INT8',
+     'read pre-quantized llama 8-bit model'),
+    ("yec019/fbopt-350m-8bit", "read pre-quantized 8-bit opt model"),
+]
+

 @pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
                    reason='bitsandbytes is not supported on this GPU type.')
@@ -39,7 +51,8 @@ models_pre_quant_8bit_to_test = [
 def test_load_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                             model_name, description) -> None:

-    hf_model_kwargs = {"load_in_4bit": True}
+    hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(
+        load_in_4bit=True))
    validate_generated_texts(hf_runner, vllm_runner, example_prompts[:1],
                             model_name, False, hf_model_kwargs)

@@ -77,7 +90,8 @@ def test_load_8bit_bnb_model(hf_runner, vllm_runner, example_prompts,
 def test_load_tp_4bit_bnb_model(hf_runner, vllm_runner, example_prompts,
                                model_name, description) -> None:

-    hf_model_kwargs = {"load_in_4bit": True}
+    hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(
+        load_in_4bit=True))
    validate_generated_texts(hf_runner,
                             vllm_runner,
                             example_prompts[:1],
@@ -113,6 +127,54 @@ def test_load_pp_4bit_bnb_model(model_name, description) -> None:
    compare_two_settings(model_name, common_args, pp_args)


+@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
+                    reason='bitsandbytes is not supported on this GPU type.')
+@pytest.mark.parametrize("model_name, description",
+                         models_4bit_to_embedding_test)
+@pytest.mark.parametrize("dtype", ["half"])
+@create_new_process_for_each_test()
+def test_4bit_bnb_embedding_model(
+    model_name,
+    description,
+    hf_runner,
+    vllm_runner,
+    example_prompts,
+    dtype: str,
+) -> None:
+
+    # The example_prompts has ending "\n", for example:
+    # "Write a short story about a robot that dreams for the first time.\n"
+    # sentence_transformers will strip the input texts, see:
+    # https://github.com/UKPLab/sentence-transformers/blob/v3.1.1/sentence_transformers/models/Transformer.py#L159
+    # This makes the input_ids different between hf_model and vllm_model.
+    # So we need to strip the input texts to avoid test failing.
+    example_prompts = [str(s).strip() for s in example_prompts]
+
+    # Inflight 4bit quantization
+    hf_model_kwargs = dict(quantization_config=BitsAndBytesConfig(
+        load_in_4bit=True))
+    with hf_runner(
+            model_name,
+            dtype=dtype,
+            model_kwargs=hf_model_kwargs,
+            is_sentence_transformer=True,
+    ) as hf_model:
+        hf_outputs = hf_model.encode(example_prompts)
+
+    with vllm_runner(model_name,
+                     task="embed",
+                     dtype=dtype,
+                     quantization="bitsandbytes") as vllm_model:
+        vllm_outputs = vllm_model.encode(example_prompts)
+    check_embeddings_close(
+        embeddings_0_lst=hf_outputs,
+        embeddings_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+        tol=5e-2,
+    )
+
+
 def log_generated_texts(prompts, outputs, runner_name):
    logged_texts = []
    for i, (_, generated_text) in enumerate(outputs):

--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -13,9 +13,9 @@ from compressed_tensors.quantization import QuantizationType
 from tests.models.utils import check_logprobs_close
 from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
    CompressedTensors24, CompressedTensorsLinearMethod,
-    CompressedTensorsW4A16Sparse24, CompressedTensorsW8A8Fp8,
-    CompressedTensorsW8A8Int8, CompressedTensorsW8A16Fp8,
-    CompressedTensorsWNA16)
+    CompressedTensorsW4A16Fp4, CompressedTensorsW4A16Sparse24,
+    CompressedTensorsW8A8Fp8, CompressedTensorsW8A8Int8,
+    CompressedTensorsW8A16Fp8, CompressedTensorsWNA16)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
    sparse_cutlass_supported)
 from vllm.platforms import current_platform
@@ -648,3 +648,23 @@ def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4):
        output = llm.generate_greedy("Hello my name is", max_tokens=20)
        print(output)
        assert output
+
+
+def test_compressed_tensors_nvfp4a16(vllm_runner):
+    # run weight only example
+    model = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP4"
+    with vllm_runner(model, enforce_eager=True) as llm:
+
+        def check_model(model):
+            layer = model.model.layers[0]
+
+            qkv_proj = layer.self_attn.qkv_proj
+            assert isinstance(qkv_proj.quant_method,
+                              CompressedTensorsLinearMethod)
+            assert isinstance(qkv_proj.scheme, CompressedTensorsW4A16Fp4)
+            assert qkv_proj.scheme.group_size == 16
+
+        llm.apply_model(check_model)
+        output = llm.generate_greedy("Hello my name is", max_tokens=20)
+        print(output)
+        assert output
--- a/tests/quantization/test_quark.py
+++ b/tests/quantization/test_quark.py
@@ -5,6 +5,7 @@ Run `pytest tests/quantization/test_quark.py`.
 """

 import pytest
+import torch

 from vllm.model_executor.layers.quantization.quark.quark import (  # noqa: E501
    QuarkLinearMethod, QuarkW8A8Fp8, QuarkW8A8Int8)
@@ -63,3 +64,28 @@ def test_quark_int8_w_per_tensor_a_per_tensor(vllm_runner, tp):

        output = llm.generate_greedy("Hello my name is", max_tokens=20)
        assert output
+
+
+def test_quark_fp8_parity(vllm_runner):
+    quark_model_id = "amd-quark/llama-tiny-fp8-quark-quant-method"
+    fp8_model_id = "amd-quark/llama-tiny-fp8-quant-method"
+
+    llm_kwargs = {
+        "tensor_parallel_size": 1,
+        "enforce_eager": True,
+        "gpu_memory_utilization": 0.1
+    }
+    with (vllm_runner(quark_model_id, **llm_kwargs) as
+          quark_handle, vllm_runner(fp8_model_id, **llm_kwargs) as fp8_handle):
+        quark_model = (quark_handle.model.llm_engine.model_executor.
+                       driver_worker.model_runner.model)
+        quark_state_dict = quark_model.state_dict()
+
+        fp8_model = (fp8_handle.model.llm_engine.model_executor.driver_worker.
+                     model_runner.model)
+        fp8_state_dict = fp8_model.state_dict()
+
+    assert fp8_state_dict.keys() == quark_state_dict.keys()
+
+    for key in fp8_state_dict:
+        assert torch.equal(fp8_state_dict[key], quark_state_dict[key])
--- a/tests/quantization/test_register_quantization_config.py
+++ b/tests/quantization/test_register_quantization_config.py
@@ -14,7 +14,7 @@ import torch.nn.functional as F
 from vllm.model_executor.layers.linear import LinearBase  # noqa: E501
 from vllm.model_executor.layers.linear import UnquantizedLinearMethod
 from vllm.model_executor.layers.quantization import (
-    get_quantization_config, register_quantization_config)
+    QuantizationMethods, get_quantization_config, register_quantization_config)
 from vllm.model_executor.layers.quantization.base_config import (  # noqa: E501
    QuantizationConfig)

@@ -54,7 +54,7 @@ class CustomQuantConfig(QuantizationConfig):
        """Initialize the quantization config."""
        self.num_bits = num_bits

-    def get_name(self) -> str:
+    def get_name(self) -> QuantizationMethods:
        """Name of the quantization method."""
        return "custom_quant"


--- a/tests/quantization/test_torchao.py
+++ b/tests/quantization/test_torchao.py
@@ -3,6 +3,7 @@ import importlib.metadata
 import importlib.util

 import pytest
+import torch

 DTYPE = ["bfloat16"]

@@ -21,5 +22,42 @@ def test_pre_quantized_model(vllm_runner):
    print(output)


+@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
+@pytest.mark.parametrize(
+    "pt_load_map_location",
+    [
+        "cuda:0",
+        # {"": "cuda"},
+    ])
+def test_opt_125m_int4wo_model_loading_with_params(vllm_runner,
+                                                   pt_load_map_location):
+    torch._dynamo.reset()
+    model_name = "jerryzh168/opt-125m-int4wo"
+    with vllm_runner(model_name=model_name,
+                     quantization="torchao",
+                     dtype="bfloat16",
+                     pt_load_map_location=pt_load_map_location) as llm:
+        output = llm.generate_greedy(["The capital of France is"],
+                                     max_tokens=32)
+
+        assert output
+        print(output)
+
+
+@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
+def test_opt_125m_int4wo_model_per_module_quant(vllm_runner):
+    torch._dynamo.reset()
+    model_name = "jerryzh168/opt-125m-int4wo-per-module"
+    with vllm_runner(model_name=model_name,
+                     quantization="torchao",
+                     dtype="bfloat16",
+                     pt_load_map_location="cuda:0") as llm:
+        output = llm.generate_greedy(["The capital of France is"],
+                                     max_tokens=32)
+
+        assert output
+        print(output)
+
+
 if __name__ == "__main__":
    pytest.main([__file__])
--- a/tests/reasoning/test_qwen3_reasoning_parser.py
+++ b/tests/reasoning/test_qwen3_reasoning_parser.py
+# SPDX-License-Identifier: Apache-2.0
+
+import pytest
+from transformers import AutoTokenizer
+
+from tests.reasoning.utils import run_reasoning_extraction
+from vllm.reasoning import ReasoningParser, ReasoningParserManager
+
+parser_name = "qwen3"
+start_token = "<think>"
+end_token = "</think>"
+
+REASONING_MODEL_NAME = "Qwen/Qwen3-0.6B"
+
+
+@pytest.fixture(scope="module")
+def qwen3_tokenizer():
+    return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
+
+
+# 带 <think></think>，非stream
+WITH_THINK = {
+    "output": "<think>This is a reasoning section</think>This is the rest",
+    "reasoning_content": "This is a reasoning section",
+    "content": "This is the rest",
+}
+# 带 <think></think>，stream
+WITH_THINK_STREAM = {
+    "output": "<think>This is a reasoning section</think>This is the rest",
+    "reasoning_content": "This is a reasoning section",
+    "content": "This is the rest",
+}
+# 不带 <think></think>，非stream
+WITHOUT_THINK = {
+    "output": "This is the rest",
+    "reasoning_content": None,
+    "content": "This is the rest",
+}
+# 不带 <think></think>，stream
+WITHOUT_THINK_STREAM = {
+    "output": "This is the rest",
+    "reasoning_content": None,
+    "content": "This is the rest",
+}
+
+COMPLETE_REASONING = {
+    "output": "<think>This is a reasoning section</think>",
+    "reasoning_content": "This is a reasoning section",
+    "content": None,
+}
+MULTILINE_REASONING = {
+    "output":
+    "<think>This is a reasoning\nsection</think>This is the rest\nThat",
+    "reasoning_content": "This is a reasoning\nsection",
+    "content": "This is the rest\nThat",
+}
+ONLY_OPEN_TAG = {
+    "output": "<think>This is a reasoning section",
+    "reasoning_content": None,
+    "content": "<think>This is a reasoning section",
+}
+
+ONLY_OPEN_TAG_STREAM = {
+    "output": "<think>This is a reasoning section",
+    "reasoning_content": "This is a reasoning section",
+    "content": None,
+}
+
+TEST_CASES = [
+    pytest.param(
+        False,
+        WITH_THINK,
+        id="with_think",
+    ),
+    pytest.param(
+        True,
+        WITH_THINK_STREAM,
+        id="with_think_stream",
+    ),
+    pytest.param(
+        False,
+        WITHOUT_THINK,
+        id="without_think",
+    ),
+    pytest.param(
+        True,
+        WITHOUT_THINK_STREAM,
+        id="without_think_stream",
+    ),
+    pytest.param(
+        False,
+        COMPLETE_REASONING,
+        id="complete_reasoning",
+    ),
+    pytest.param(
+        True,
+        COMPLETE_REASONING,
+        id="complete_reasoning_stream",
+    ),
+    pytest.param(
+        False,
+        MULTILINE_REASONING,
+        id="multiline_reasoning",
+    ),
+    pytest.param(
+        True,
+        MULTILINE_REASONING,
+        id="multiline_reasoning_stream",
+    ),
+    pytest.param(
+        False,
+        ONLY_OPEN_TAG,
+        id="only_open_tag",
+    ),
+    pytest.param(
+        True,
+        ONLY_OPEN_TAG_STREAM,
+        id="only_open_tag_stream",
+    ),
+]
+
+
+@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
+def test_reasoning(
+    streaming: bool,
+    param_dict: dict,
+    qwen3_tokenizer,
+):
+    output = qwen3_tokenizer.tokenize(param_dict["output"])
+    output_tokens: list[str] = [
+        qwen3_tokenizer.convert_tokens_to_string([token]) for token in output
+    ]
+    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(
+        parser_name)(qwen3_tokenizer)
+
+    reasoning, content = run_reasoning_extraction(parser,
+                                                  output_tokens,
+                                                  streaming=streaming)
+
+    assert reasoning == param_dict["reasoning_content"]
+    assert content == param_dict["content"]
--- a/tests/runai_model_streamer_test/test_runai_model_streamer_loader.py
+++ b/tests/runai_model_streamer_test/test_runai_model_streamer_loader.py
@@ -2,8 +2,7 @@

 from vllm import SamplingParams
 from vllm.config import LoadConfig, LoadFormat
-from vllm.model_executor.model_loader.loader import (RunaiModelStreamerLoader,
-                                                     get_model_loader)
+from vllm.model_executor.model_loader import get_model_loader

 test_model = "openai-community/gpt2"

@@ -24,7 +23,7 @@ def get_runai_model_loader():

 def test_get_model_loader_with_runai_flag():
    model_loader = get_runai_model_loader()
-    assert isinstance(model_loader, RunaiModelStreamerLoader)
+    assert model_loader.__class__.__name__ == "RunaiModelStreamerLoader"


 def test_runai_model_loader_download_files(vllm_runner):

--- a/tests/samplers/test_rejection_sampler.py
+++ b/tests/samplers/test_rejection_sampler.py
@@ -169,7 +169,10 @@ def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int,
 @pytest.mark.parametrize("batch_size", [1, 8, 32, 128])
 @pytest.mark.parametrize("n_rep", [100])
 @pytest.mark.parametrize("device", CUDA_DEVICES)
-@pytest.mark.parametrize("use_flashinfer", [True, False])
+# @pytest.mark.parametrize("use_flashinfer", [True, False])
+# Not testing FlashInfer now, since 0.2.3 API removed the ability
+# to pass in uniform samples.
+@pytest.mark.parametrize("use_flashinfer", [False])
 @torch.inference_mode()
 def test_deterministic_when_seeded(k: int, vocab_size: int, batch_size: int,
                                   frac_seeded: float, n_rep: int, device: str,
@@ -214,7 +217,10 @@ def test_deterministic_when_seeded(k: int, vocab_size: int, batch_size: int,
 @pytest.mark.parametrize("vocab_size", [30_000, 50_000])
 @pytest.mark.parametrize("batch_size", [3, 8, 32, 128])
 @pytest.mark.parametrize("device", CUDA_DEVICES)
-@pytest.mark.parametrize("use_flashinfer", [True, False])
+# @pytest.mark.parametrize("use_flashinfer", [True, False])
+# Not testing FlashInfer now, since 0.2.3 API removed the ability
+# to pass in uniform samples.
+@pytest.mark.parametrize("use_flashinfer", [False])
 @torch.inference_mode()
 def test_mixed_seeded_batch(k: int, vocab_size: int, batch_size: int,
                            device: str, use_flashinfer: bool):
@@ -284,6 +290,10 @@ def test_compare_nonflashinfer_backend(k: int, vocab_size: int,
    Test the flashinfer and nonflashinfer backend generate 
    the same output metrics.
    """
+
+    pytest.skip("Not testing FlashInfer now, since 0.2.3 API removed "
+                "the ability to pass in uniform samples.")
+
    torch.set_default_device(device)
    torch.manual_seed(0)
    draft_probs = torch.rand(batch_size, k, vocab_size, dtype=torch.float32)

--- a/tests/samplers/test_sampler.py
+++ b/tests/samplers/test_sampler.py
@@ -478,7 +478,7 @@ def test_sampler_mixed(seed: int, device: str):
            sampling_params = SamplingParams(
                temperature=random.random() + 0.1,
                top_p=min(random.random() + 0.1, 1),
-                top_k=random.randint(0, 10) or -1,
+                top_k=random.randint(0, 10),
                n=n,
                presence_penalty=random.randint(0, 1),
            )
@@ -647,6 +647,8 @@ def test_flashinfer_fallback(seed: int, device: str):
    if not envs.VLLM_USE_FLASHINFER_SAMPLER:
        pytest.skip("Flashinfer sampler is disabled")

+    pytest.skip("After FlashInfer 0.2.3, sampling will never fail")
+
    set_random_seed(seed)
    torch.set_default_device(device)
    batch_size = random.randint(1, 256)

--- a/tests/spec_decode/e2e/test_medusa_correctness.py
+++ b/tests/spec_decode/e2e/test_medusa_correctness.py
@@ -205,7 +205,7 @@ def test_medusa_e2e_greedy_correctness_cuda_graph(
 @pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
-        "block_size": 8,
+        "block_size": 16,
        # 2 for small prompt, 256//8 for generated.
        "num_gpu_blocks_override": 2 + 256 // 8,
        "max_model_len": (2 + 256 // 8) * 8,

--- a/tests/spec_decode/e2e/test_mlp_correctness.py
+++ b/tests/spec_decode/e2e/test_mlp_correctness.py
@@ -267,7 +267,7 @@ def test_mlp_e2e_seeded_correctness(vllm_runner, common_llm_kwargs,
 @pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
-        "block_size": 8,
+        "block_size": 16,
        # 2 for small prompt, 256//8 for generated.
        "num_gpu_blocks_override": 2 + 256 // 8,
        "max_model_len": (2 + 256 // 8) * 8,
@@ -321,7 +321,7 @@ def test_mlp_e2e_greedy_correctness_with_preemption(
 @pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
-        "block_size": 8,
+        "block_size": 16,
        # 2 for small prompt, 256//8 for generated.
        "num_gpu_blocks_override": 2 + 256 // 8,
        "max_model_len": (2 + 256 // 8) * 8,

--- a/tests/spec_decode/e2e/test_multistep_correctness.py
+++ b/tests/spec_decode/e2e/test_multistep_correctness.py
@@ -456,7 +456,7 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs(
 @pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
-        "block_size": 8,
+        "block_size": 16,
        # 2 for small prompt, 256//8 for generated.
        "num_gpu_blocks_override": 2 + 256 // 8,
        "max_model_len": (2 + 256 // 8) * 8,
@@ -526,11 +526,8 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption(
 @pytest.mark.parametrize(
    "per_test_common_llm_kwargs",
    [
-        # As of this writing, vLLM only compiles with these 3 block sizes by
-        # default.
-        {
-            "block_size": 8,
-        },
+        # https://github.com/triton-lang/triton/issues/2266 tl.dot
+        # doesn't support embedding < 16
        {
            "block_size": 16,
        },