Merge tag 'v0.8.3' into v0.8.3-dev

fcfc474d · zhuwenwen · bb94d2e5 · 296c6572 · fcfc474d · fcfc474d
Commit fcfc474d authored Apr 09, 2025 by zhuwenwen
20 changed files
--- a/tests/models/test_initialization.py
+++ b/tests/models/test_initialization.py
@@ -54,8 +54,10 @@ def test_can_initialize(model_arch):
            model_info.default,
            tokenizer=model_info.tokenizer,
            tokenizer_mode=model_info.tokenizer_mode,
-            speculative_model=model_info.speculative_model,
+            speculative_config={
-            num_speculative_tokens=1 if model_info.speculative_model else None,
+                "model": model_info.speculative_model,
+                "num_speculative_tokens": 1,
+            } if model_info.speculative_model else None,
            trust_remote_code=model_info.trust_remote_code,
            load_format="dummy",
            hf_overrides=hf_overrides,

--- a/tests/models/test_registry.py
+++ b/tests/models/test_registry.py
@@ -23,6 +23,11 @@ from .registry import HF_EXAMPLE_MODELS
 @pytest.mark.parametrize("model_arch", ModelRegistry.get_supported_archs())
 def test_registry_imports(model_arch):
+    # Llama4ForCausalLM does not have a standalone model
+    if model_arch == "Llama4ForCausalLM":
+        return
    model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
    model_info.check_transformers_version(on_fail="skip")
@@ -91,8 +96,11 @@ def test_registry_is_pp(model_arch, is_pp, init_cuda):
 def test_hf_registry_coverage():
-    untested_archs = (ModelRegistry.get_supported_archs() -
+    untested_archs = set(ModelRegistry.get_supported_archs() -
-                      HF_EXAMPLE_MODELS.get_supported_archs())
+                         HF_EXAMPLE_MODELS.get_supported_archs())
+    # Llama4ForCausalLM does not have a standalone model
+    untested_archs.discard("Llama4ForCausalLM")
    assert not untested_archs, (
        "Please add the following architectures to "

--- a/tests/models/test_transformers.py
+++ b/tests/models/test_transformers.py
@@ -3,8 +3,6 @@
 Run `pytest tests/models/test_transformers.py`.
 """
-from contextlib import nullcontext
 import pytest
 from ..conftest import HfRunner, VllmRunner
@@ -42,7 +40,6 @@ def check_implementation(
    "model,model_impl",
    [
        ("meta-llama/Llama-3.2-1B-Instruct", "transformers"),
-        ("openai-community/gpt2", "transformers"),
        ("ArthurZ/Ilama-3.2-1B", "auto"),  # CUSTOM CODE
    ])  # trust_remote_code=True by default
 def test_models(
@@ -52,20 +49,11 @@ def test_models(
    model: str,
    model_impl: str,
 ) -> None:
+    check_implementation(hf_runner,
-    maybe_raises = nullcontext()
+                         vllm_runner,
-    if model == "openai-community/gpt2" and model_impl == "transformers":
+                         example_prompts,
-        # Model is not backend compatible
+                         model,
-        maybe_raises = pytest.raises(
+                         model_impl=model_impl)
-            ValueError,
-            match="The Transformers implementation.*not compatible with vLLM")
-    with maybe_raises:
-        check_implementation(hf_runner,
-                             vllm_runner,
-                             example_prompts,
-                             model,
-                             model_impl=model_impl)
 @multi_gpu_test(num_gpus=2)
@@ -84,7 +72,6 @@ def test_distributed(
        "meta-llama/Llama-3.2-1B-Instruct",
        {
            "quantization": "bitsandbytes",
-            "load_format": "bitsandbytes",
        },
    ),
 ])

--- a/tests/models/test_utils.py
+++ b/tests/models/test_utils.py
+# SPDX-License-Identifier: Apache-2.0
+import torch
+from vllm.model_executor.models.utils import AutoWeightsLoader
+class ModuleWithBatchNorm(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.bn = torch.nn.BatchNorm1d(2)
+    def forward(self, x):
+        return self.bn(x)
+class ModuleWithNestedBatchNorm(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.nested_mod = ModuleWithBatchNorm()
+    def forward(self, x):
+        return self.nested_mod(x)
+def test_module_with_batchnorm_can_load():
+    """Ensure the auto weight loader can load batchnorm stats."""
+    mod = ModuleWithBatchNorm()
+    # Run some data through the module with batchnorm
+    mod(torch.Tensor([[1, 2], [3, 4]]))
+    # Try to load the weights to a new instance
+    def weight_generator():
+        yield from mod.state_dict().items()
+    new_mod = ModuleWithBatchNorm()
+    assert not torch.all(new_mod.bn.running_mean == mod.bn.running_mean)
+    assert not torch.all(new_mod.bn.running_var == mod.bn.running_var)
+    assert new_mod.bn.num_batches_tracked.item() == 0
+    loader = AutoWeightsLoader(new_mod)
+    loader.load_weights(weight_generator())
+    # Ensure the stats are updated
+    assert torch.all(new_mod.bn.running_mean == mod.bn.running_mean)
+    assert torch.all(new_mod.bn.running_var == mod.bn.running_var)
+    assert new_mod.bn.num_batches_tracked.item() == 1
+def test_module_with_child_containing_batchnorm_can_autoload():
+    """Ensure the auto weight loader can load nested modules batchnorm stats."""
+    mod = ModuleWithNestedBatchNorm()
+    # Run some data through the module with batchnorm
+    mod(torch.Tensor([[1, 2], [3, 4]]))
+    # Try to load the weights to a new instance
+    def weight_generator():
+        yield from mod.state_dict().items()
+    new_mod = ModuleWithNestedBatchNorm()
+    assert not torch.all(
+        new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean)
+    assert not torch.all(
+        new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var)
+    assert new_mod.nested_mod.bn.num_batches_tracked.item() == 0
+    loader = AutoWeightsLoader(new_mod)
+    loader.load_weights(weight_generator())
+    # Ensure the stats are updated
+    assert torch.all(
+        new_mod.nested_mod.bn.running_mean == mod.nested_mod.bn.running_mean)
+    assert torch.all(
+        new_mod.nested_mod.bn.running_var == mod.nested_mod.bn.running_var)
+    assert new_mod.nested_mod.bn.num_batches_tracked.item() == 1
--- a/tests/multimodal/test_processing.py
+++ b/tests/multimodal/test_processing.py
@@ -28,8 +28,7 @@ from vllm.multimodal.processing import (PlaceholderFeaturesInfo,
                                        replace_token_matches)
 # yapf: enable
 from vllm.multimodal.profiling import MultiModalProfiler
-from vllm.transformers_utils.tokenizer import (AnyTokenizer,
+from vllm.transformers_utils.tokenizer import AnyTokenizer
-                                               cached_tokenizer_from_config)
 from vllm.utils import full_groupby
 from .utils import random_image
@@ -955,10 +954,7 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
        limit_mm_per_prompt=limit_mm_per_prompt,
    )
-    processor = MULTIMODAL_REGISTRY.create_processor(
+    processor = MULTIMODAL_REGISTRY.create_processor(model_config)
-        model_config,
-        tokenizer=cached_tokenizer_from_config(model_config),
-    )
    profiler = MultiModalProfiler(processor)
    mock_supported_mm_limits = MagicMock(return_value={"image": num_supported})
@@ -994,10 +990,7 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
        limit_mm_per_prompt=limit_mm_per_prompt,
    )
-    processor = MULTIMODAL_REGISTRY.create_processor(
+    processor = MULTIMODAL_REGISTRY.create_processor(model_config)
-        model_config,
-        tokenizer=cached_tokenizer_from_config(model_config),
-    )
    rng = np.random.RandomState(0)
    image = random_image(rng, min_wh=128, max_wh=256)
@@ -1066,10 +1059,7 @@ def test_hf_processor_kwargs(model_id, call_kwargs, expected_kwargs):
        revision=None,
    )
-    processor = MULTIMODAL_REGISTRY.create_processor(
+    processor = MULTIMODAL_REGISTRY.create_processor(model_config)
-        model_config,
-        tokenizer=cached_tokenizer_from_config(model_config),
-    )
    orig_get_hf_processor = processor.info.get_hf_processor
    def get_hf_processor(self, **kwargs):

--- a/tests/multimodal/test_utils.py
+++ b/tests/multimodal/test_utils.py
@@ -11,12 +11,10 @@ import pytest
 import os
 from PIL import Image, ImageChops
-from transformers import AutoConfig, AutoTokenizer
 from vllm.multimodal.inputs import PlaceholderRange
 from vllm.multimodal.utils import (MediaConnector,
-                                   merge_and_sort_multimodal_metadata,
+                                   merge_and_sort_multimodal_metadata)
-                                   repeat_and_pad_placeholder_tokens)
 from ..utils import models_path_prefix, urls_port
 if TYPE_CHECKING:
@@ -139,71 +137,6 @@ async def test_fetch_image_local_files(image_url: str):
                f"file://{temp_dir}/../{os.path.basename(image_url)}")
-@pytest.mark.parametrize("model", [os.path.join(models_path_prefix, "llava-hf/llava-v1.6-mistral-7b-hf")])
-def test_repeat_and_pad_placeholder_tokens(model):
-    config = AutoConfig.from_pretrained(model)
-    image_token_id = config.image_token_index
-    tokenizer = AutoTokenizer.from_pretrained(model)
-    test_cases = [
-        (
-            "<image>",
-            2,
-            "<image><image>",
-            [32000, 32000],
-            [{ "offset": 0, "length": 2 }],
-        ),
-        (
-            "<image><image>",
-            2,
-            "<image><image><image>",
-            [32000, 32000, 32000],
-            [{ "offset": 0, "length": 2 }],
-        ),
-        (
-            "<image><image>",
-            [3, 2],
-            "<image><image><image><image><image>",
-            [32000, 32000, 32000, 32000, 32000],
-            [{ "offset": 0, "length": 3 }, { "offset": 3, "length": 2 }],
-        ),
-        (
-            "Image:<image>Image:<image>!",
-            [3, 2],
-            "Image:<image><image><image>Image:<image><image>!",
-            [9833, 28747, 32000, 32000, 32000, 9833, 28747, 32000, 32000, 918],
-            [{ "offset": 2, "length": 3 }, { "offset": 7, "length": 2 }],
-        ),
-        (
-            "<image>",
-            [3, 2],
-            "<image><image><image>",
-            [32000, 32000, 32000],
-            [{ "offset": 0, "length": 3 }],
-        ),
-    ]  # yapf: disable
-    for (
-            prompt,
-            repeat_count,
-            expected_prompt,
-            expected_token_ids,
-            expected_ranges,
-    ) in test_cases:
-        new_prompt, new_token_ids, ranges = repeat_and_pad_placeholder_tokens(
-            tokenizer=tokenizer,
-            prompt=prompt,
-            prompt_token_ids=tokenizer.encode(prompt,
-                                              add_special_tokens=False),
-            placeholder_token_id=image_token_id,
-            repeat_count=repeat_count,
-        )
-        assert new_prompt == expected_prompt
-        assert new_token_ids == expected_token_ids
-        assert ranges == expected_ranges
 # Used for the next two tests related to `merge_and_sort_multimodal_metadata`.
 class TestCase(NamedTuple):
    mm_positions: "MultiModalPlaceholderDict"
@@ -225,7 +158,7 @@ def test_merge_and_sort_multimodal_metadata():
                ]
            },
            mm_hashes={"image": ["hash1", "hash2"]},
-            expected_modalities=["image"],
+            expected_modalities=["image", "image"],
            expected_ranges=[
                PlaceholderRange(offset=0, length=2),
                PlaceholderRange(offset=3, length=2),
@@ -242,7 +175,7 @@ def test_merge_and_sort_multimodal_metadata():
                ]
            },
            mm_hashes=None,
-            expected_modalities=["image"],
+            expected_modalities=["image", "image"],
            expected_ranges=[
                PlaceholderRange(offset=0, length=2),
                PlaceholderRange(offset=2, length=2),
@@ -267,7 +200,7 @@ def test_merge_and_sort_multimodal_metadata():
                "image": ["image_hash1", "image_hash2"],
                "audio": ["audio_hash1", "audio_hash2"],
            },
-            expected_modalities=["audio", "image"],
+            expected_modalities=["audio", "audio", "image", "image"],
            expected_ranges=[
                PlaceholderRange(offset=0, length=2),
                PlaceholderRange(offset=2, length=3),
@@ -293,7 +226,7 @@ def test_merge_and_sort_multimodal_metadata():
                ]
            },
            mm_hashes=None,
-            expected_modalities=["audio", "image"],
+            expected_modalities=["audio", "audio", "image", "image"],
            expected_ranges=[
                PlaceholderRange(offset=0, length=2),
                PlaceholderRange(offset=2, length=3),
@@ -324,7 +257,9 @@ def test_merge_and_sort_multimodal_metadata():
                "audio": ["audio_hash1"],
                "video": ["video_hash1", "video_hash2", "video_hash3"]
            },
-            expected_modalities=["audio", "video", "image"],
+            expected_modalities=[
+                "audio", "video", "video", "video", "image", "image"
+            ],
            expected_ranges=[
                PlaceholderRange(offset=0, length=2),
                PlaceholderRange(offset=3, length=4),
@@ -370,12 +305,19 @@ def test_merge_and_sort_multimodal_metadata_with_interleaving():
                "image": ["image_hash1", "image_hash2"],
                "audio": ["audio_hash1", "audio_hash2"],
            },
-            expected_modalities=[],
+            expected_modalities=["image", "audio", "image", "audio"],
-            expected_ranges=[],
+            expected_ranges=[
-            expected_hashes=None,
+                PlaceholderRange(offset=0, length=4),
+                PlaceholderRange(offset=5, length=2),
+                PlaceholderRange(offset=8, length=2),
+                PlaceholderRange(offset=11, length=4),
+            ],
+            expected_hashes=[
+                "image_hash1", "audio_hash1", "image_hash2", "audio_hash2"
+            ],
        ),
-        # <image> <image> <video> <audio> <image>
+        # <image> <image> <audio> <video> <image>
        TestCase(
            mm_positions={
                "image": [
@@ -391,15 +333,54 @@ def test_merge_and_sort_multimodal_metadata_with_interleaving():
                ]
            },
            mm_hashes=None,
-            expected_modalities=[],
+            expected_modalities=["image", "image", "audio", "video", "image"],
-            expected_ranges=[],
+            expected_ranges=[
+                PlaceholderRange(offset=0, length=2),
+                PlaceholderRange(offset=2, length=3),
+                PlaceholderRange(offset=5, length=2),
+                PlaceholderRange(offset=8, length=5),
+                PlaceholderRange(offset=20, length=4),
+            ],
            expected_hashes=None,
        ),
+        # <image> <audio> <video> <image> with hashes
+        TestCase(
+            mm_positions={
+                "image": [
+                    PlaceholderRange(offset=0, length=2),
+                    PlaceholderRange(offset=18, length=4),
+                ],
+                "audio": [
+                    PlaceholderRange(offset=6, length=2),
+                ],
+                "video": [
+                    PlaceholderRange(offset=10, length=5),
+                ]
+            },
+            mm_hashes={
+                "image": ["image_hash1", "image_hash2"],
+                "audio": ["audio_hash1"],
+                "video": ["video_hash1"],
+            },
+            expected_modalities=["image", "audio", "video", "image"],
+            expected_ranges=[
+                PlaceholderRange(offset=0, length=2),
+                PlaceholderRange(offset=6, length=2),
+                PlaceholderRange(offset=10, length=5),
+                PlaceholderRange(offset=18, length=4),
+            ],
+            expected_hashes=[
+                "image_hash1", "audio_hash1", "video_hash1", "image_hash2"
+            ],
+        ),
    ]
-    for case in test_cases:
+    for (mm_positions, mm_hashes, expected_modalities, expected_ranges,
-        with pytest.raises(ValueError) as ex_info:
+         expected_hashes) in test_cases:
-            merge_and_sort_multimodal_metadata(case.mm_positions,
+        modalities, ranges, hashes = merge_and_sort_multimodal_metadata(
-                                               case.mm_hashes)
+            mm_positions, mm_hashes)
-        assert "Interleaved mixed-modality" in str(ex_info.value)
+        assert modalities == expected_modalities
+        assert ranges == expected_ranges
+        assert hashes == expected_hashes
--- a/tests/neuron/1_core/test_cache.py
+++ b/tests/neuron/1_core/test_cache.py
@@ -64,9 +64,11 @@ def test_reshape_and_cache(num_tokens, n_kv_head, d_head, num_blocks,
    key_cache = torch.zeros_like(key_cache_cpu, device=device)
    value_cache = torch.zeros_like(value_cache_cpu, device=device)
    slot_mapping = slot_mapping_cpu.to(device)
+    kv_cache = torch.stack([key_cache, value_cache])
    # Run vectorized implementation on XLA device
-    reshape_and_cache(key, value, key_cache, value_cache, slot_mapping)
+    reshape_and_cache(key, value, kv_cache, slot_mapping)
+    key_cache, value_cache = torch.unbind(kv_cache, dim=0)
    # Move results back to CPU for comparison
    key_cache_result = key_cache.cpu()

--- a/tests/neuron/1_core/test_prefix_prefill.py
+++ b/tests/neuron/1_core/test_prefix_prefill.py
@@ -258,13 +258,13 @@ def sample_inputs(
                             value[start_loc:end_loc])
            cur_ctx += block_size
            block_id += 1
+    kv_cache = torch.stack([k_cache, v_cache])
    return (
        query,
        k,
        v,
-        k_cache,
+        kv_cache,
-        v_cache,
        block_table,
        key,
        value,
@@ -361,8 +361,7 @@ def test_contexted_kv_attention(
            query,
            k_active,
            v_active,
-            k_cache,
+            kv_cache,
-            v_cache,
            block_table,
            key,
            value,
@@ -439,8 +438,7 @@ def test_contexted_kv_attention(
        query = query.unsqueeze(0).permute(0, 2, 3, 1).contiguous()
        k = k.unsqueeze(0).permute(0, 2, 3, 1).contiguous()
        v = v.unsqueeze(0).permute(0, 2, 1, 3).contiguous()
-        k_cache = k_cache.permute(0, 2, 1, 3).contiguous()
+        kv_cache = kv_cache.permute(0, 1, 3, 2, 4).contiguous()
-        v_cache = v_cache.permute(0, 2, 1, 3).contiguous()
        # transform block table
        active_block_table = get_active_block_tables(
@@ -487,8 +485,7 @@ def test_contexted_kv_attention(
            query.to(device=device),
            k.to(device=device),
            v.to(device=device),
-            k_cache.to(device=device),
+            kv_cache.to(device=device),
-            v_cache.to(device=device),
            active_block_table.to(device=device),
            attn_mask.to(device=device),
        )

--- a/tests/quantization/test_bitsandbytes.py
+++ b/tests/quantization/test_bitsandbytes.py
@@ -105,8 +105,6 @@ def test_load_pp_4bit_bnb_model(model_name, description) -> None:
        "--enable-prefix-caching",
        "--quantization",
        "bitsandbytes",
-        "--load-format",
-        "bitsandbytes",
        "--gpu-memory-utilization",
        "0.7",
    ]
@@ -141,7 +139,6 @@ def validate_generated_texts(hf_runner,
    # when using distributed inference
    with vllm_runner(model_name,
                     quantization='bitsandbytes',
-                     load_format='bitsandbytes',
                     tensor_parallel_size=vllm_tp_size,
                     enforce_eager=False) as llm:
        vllm_outputs = llm.generate_greedy(prompts, 8)

--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -23,6 +23,23 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
 from vllm.platforms import current_platform
 from ..utils import models_path_prefix
+# AITER only supports per-channel-per-channel INT8 gemm
+# and per-tensor-per-tensor INT8 GEMM.
+# It does not support mix precision MM and mix quantization scheme.
+ROCM_AITER_SUPPORTED_INT8_MODEL = [
+    "neuralmagic/Llama-3.2-1B-quantized.w8a8",
+    "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2"
+]
+# TritonScaledMMLinearKernel only supports symmetric quantization.
+ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL = [
+    "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
+    "nm-testing/tinyllama-oneshot-w8-channel-a8-tensor",
+    "neuralmagic/Llama-3.2-1B-quantized.w8a8",
+    "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2",
+    "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2",
+]
 @pytest.fixture(scope="function", autouse=True)
 def use_v0_only(monkeypatch):
@@ -60,6 +77,11 @@ def use_v0_only(monkeypatch):
 )
 def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
    model_path, strategy, quant_type, shape_0, is_symmetric = model_args
+    if current_platform.is_rocm(
+    ) and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL:
+        pytest.skip(f"Skip model {model_path} as it is not support on ROCm.")
    with vllm_runner(model_path, enforce_eager=True) as llm:
        def check_model(model):
@@ -126,6 +148,8 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
 )
 @pytest.mark.parametrize("max_tokens", [32])
 @pytest.mark.parametrize("num_logprobs", [10])
+@pytest.mark.parametrize(
+    "use_aiter", [True, False] if current_platform.is_rocm() else [False])
 def test_compressed_tensors_w8a8_logprobs(
    hf_runner,
    vllm_runner,
@@ -133,7 +157,21 @@ def test_compressed_tensors_w8a8_logprobs(
    model_path,
    max_tokens,
    num_logprobs,
+    use_aiter,
+    monkeypatch,
 ):
+    if current_platform.is_rocm(
+    ) and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL:
+        pytest.skip(f"Skip model {model_path} as it is not support on ROCm.")
+    if use_aiter:
+        if model_path not in ROCM_AITER_SUPPORTED_INT8_MODEL:
+            pytest.skip(
+                f"Skip model {model_path} as it is not support by aiter.")
+        # this will enable VLLM_ROCM_USE_AITER_LINEAR
+        monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
    dtype = "bfloat16"
    # skip language translation prompt for the static per tensor asym model
@@ -157,6 +195,9 @@ def test_compressed_tensors_w8a8_logprobs(
        name_1="vllm",
    )
+    if current_platform.is_rocm():
+        torch.cuda.synchronize()
 def test_compressed_tensors_no_enforce_eager(vllm_runner):
    model_path = os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change")
@@ -180,8 +221,27 @@ def test_compressed_tensors_no_enforce_eager(vllm_runner):
        ),
    ],
 )
-def test_compressed_tensors_w8a8_dynamic_per_token(vllm_runner, model_args):
+@pytest.mark.parametrize(
+    "use_aiter", [True, False] if current_platform.is_rocm() else [False])
+def test_compressed_tensors_w8a8_dynamic_per_token(
+    vllm_runner,
+    model_args,
+    use_aiter,
+    monkeypatch,
+):
    model_path, strategy = model_args
+    if current_platform.is_rocm(
+    ) and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL:
+        pytest.skip(f"Skip model {model_path} as it is not support on ROCm.")
+    if use_aiter:
+        if model_path not in ROCM_AITER_SUPPORTED_INT8_MODEL:
+            pytest.skip(
+                f"Skip model {model_path} as it is not support by aiter.")
+        # this will enable VLLM_ROCM_USE_AITER_LINEAR
+        monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
    with vllm_runner(model_path, dtype=torch.float16) as llm:
        def check_model(model):
@@ -212,6 +272,8 @@ def test_compressed_tensors_w8a8_dynamic_per_token(vllm_runner, model_args):
     (os.path.join(models_path_prefix,"nm-testing/tinyllama-oneshot-w8a16-per-channel"), "channel", None, 4),
    ],
 )
+@pytest.mark.skipif(not current_platform.is_cuda(),
+                    reason="The tests are skipped on non-CUDA platform.")
 def test_compressed_tensors_wNa16(vllm_runner, wNa16_args):
    model, strategy, group, pack_factor = wNa16_args
    with vllm_runner(model) as llm:
@@ -236,8 +298,8 @@ def test_compressed_tensors_wNa16(vllm_runner, wNa16_args):
        assert output
-@pytest.mark.skipif(current_platform(),
+@pytest.mark.skipif(not current_platform.is_cuda(),
-                    reason="W4A16 MARLIN is not supported on ROCm.")
+                    reason="This test is skipped on non-CUDA platform.")
 def test_compressed_tensors_w4a16_marlin24(vllm_runner):
    model_path = os.path.join(models_path_prefix,"nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t")
    with vllm_runner(model_path) as llm:
@@ -280,7 +342,7 @@ def test_compressed_tensors_fp8(vllm_runner):
            if isinstance(qkv_proj.scheme, CompressedTensorsW8A8Fp8):
                assert len(qkv_proj.input_scale.shape) == 0
-                assert qkv_proj.weight.dtype is torch.float8_e4m3fn
+                assert qkv_proj.weight.dtype is current_platform.fp8_dtype()
                assert qkv_proj.weight_scale.dtype is torch.float32
                assert len(qkv_proj.weight_scale.shape) == 0
@@ -290,8 +352,8 @@ def test_compressed_tensors_fp8(vllm_runner):
        assert output
-@pytest.mark.skipif(current_platform(),
+@pytest.mark.skipif(not current_platform.is_cuda(),
-                    reason="FP8 KV cache is not supported on ROCm.")
+                    reason="This test is skipped on non-CUDA platform.")
 def test_compressed_tensors_kv_cache(vllm_runner):
    model_path = os.path.join(models_path_prefix,"nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme")
    with vllm_runner(model_path, kv_cache_dtype="fp8") as llm:
@@ -320,7 +382,8 @@ def _test_2of4_quant_models(qkv_proj,
 @pytest.mark.skipif(
-    not current_platform.has_device_capability(90),
+    not current_platform.is_cuda()
+    or not current_platform.has_device_capability(90),
    reason="Sparse FP8 is not yet supported on this GPU type.",
 )
 @pytest.mark.parametrize(
@@ -367,7 +430,8 @@ def test_compressed_tensors_2of4_quant_fp8(vllm_runner, args_2of4):
 @pytest.mark.skipif(
-    not current_platform.has_device_capability(90),
+    not current_platform.is_cuda()
+    or not current_platform.has_device_capability(90),
    reason="Sparse FP8 is not yet supported on this GPU type.",
 )
 @pytest.mark.parametrize(

--- a/tests/quantization/test_cpu_offload.py
+++ b/tests/quantization/test_cpu_offload.py
@@ -12,13 +12,6 @@ from ..utils import compare_two_settings, models_path_prefix
 from vllm.platforms import current_platform
-@pytest.fixture(scope="function", autouse=True)
-def use_v0_only(monkeypatch):
-    # Fall back to V0 if cpu offloading is enabled.
-    # Fixture is required to that baseline uses V0.
-    monkeypatch.setenv('VLLM_USE_V1', '0')
 @pytest.mark.skipif(not is_quant_method_supported("fp8") or current_platform.is_rocm(),
                    reason="fp8 is not supported on this GPU type.")
 def test_cpu_offload_fp8():
@@ -35,7 +28,9 @@ def test_cpu_offload_fp8():
 @pytest.mark.skipif(not is_quant_method_supported("gptq_marlin") or current_platform.is_rocm(),
                    reason="gptq_marlin is not supported on this GPU type.")
-def test_cpu_offload_gptq():
+def test_cpu_offload_gptq(monkeypatch):
+    # This quant method is sensitive to dummy weights, so we force real weights
+    monkeypatch.setenv('VLLM_TEST_FORCE_LOAD_FORMAT', 'auto')
    # Test GPTQ Marlin
    compare_two_settings(os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct-GPTQ-Int4"), [],
                         ["--cpu-offload-gb", "1"],
@@ -49,7 +44,9 @@ def test_cpu_offload_gptq():
 @pytest.mark.skipif(not is_quant_method_supported("awq_marlin") or current_platform.is_rocm(),
                    reason="awq_marlin is not supported on this GPU type.")
-def test_cpu_offload_awq():
+def test_cpu_offload_awq(monkeypatch):
+    # This quant method is sensitive to dummy weights, so we force real weights
+    monkeypatch.setenv('VLLM_TEST_FORCE_LOAD_FORMAT', 'auto')
    # Test AWQ Marlin
    compare_two_settings(os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct-AWQ"), [],
                         ["--cpu-offload-gb", "1"],
@@ -63,7 +60,9 @@ def test_cpu_offload_awq():
 @pytest.mark.skipif(not is_quant_method_supported("gptq_marlin") or current_platform.is_rocm(),
                    reason="gptq_marlin is not supported on this GPU type.")
-def test_cpu_offload_compressed_tensors():
+def test_cpu_offload_compressed_tensors(monkeypatch):
+    # This quant method is sensitive to dummy weights, so we force real weights
+    monkeypatch.setenv('VLLM_TEST_FORCE_LOAD_FORMAT', 'auto')
    # Test wNa16
    compare_two_settings(os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w4a16-channel-v2"), [],
                         ["--cpu-offload-gb", "1"],

--- a/tests/quantization/test_fp8.py
+++ b/tests/quantization/test_fp8.py
@@ -25,8 +25,14 @@ MODELS = [
                    reason="FP8 is not supported on this GPU type.")
 @pytest.mark.parametrize("model_id", MODELS)
 @pytest.mark.parametrize("force_marlin", [False, True])
+@pytest.mark.parametrize(
+    "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False])
 def test_model_load_and_run(vllm_runner, model_id: str, force_marlin: bool,
-                            monkeypatch) -> None:
+                            use_rocm_aiter: bool, monkeypatch) -> None:
+    if use_rocm_aiter:
+        monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
    if force_marlin:
        monkeypatch.setenv("VLLM_TEST_FORCE_FP8_MARLIN", "1")
@@ -49,7 +55,13 @@ KV_CACHE_MODELS = [
 @pytest.mark.skipif(not is_quant_method_supported("fp8") or current_platform.is_rocm(),
                    reason="FP8 is not supported on this GPU type.")
 @pytest.mark.parametrize("model_id", KV_CACHE_MODELS)
-def test_kv_cache_model_load_and_run(vllm_runner, model_id: str, monkeypatch):
+@pytest.mark.parametrize(
+    "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False])
+def test_kv_cache_model_load_and_run(vllm_runner, model_id: str,
+                                     use_rocm_aiter: bool, monkeypatch):
+    if use_rocm_aiter:
+        monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
    # vllm_runner.apply_model() relies on V0 internals.
    monkeypatch.setenv("VLLM_USE_V1", "0")
    with vllm_runner(model_id, kv_cache_dtype="fp8") as llm:
@@ -88,8 +100,13 @@ def test_kv_cache_model_load_and_run(vllm_runner, model_id: str, monkeypatch):
                    reason="FP8 is not supported on this GPU type.")
 @pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"])
 @pytest.mark.parametrize("force_marlin", [False, True])
+@pytest.mark.parametrize(
+    "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False])
 def test_load_fp16_model(vllm_runner, kv_cache_dtype: str, force_marlin: bool,
-                         monkeypatch) -> None:
+                         use_rocm_aiter: bool, monkeypatch) -> None:
+    if use_rocm_aiter:
+        monkeypatch.setenv("VLLM_ROCM_USE_AITER", "1")
    # vllm_runner.apply_model() relies on V0 internals.
    monkeypatch.setenv("VLLM_USE_V1", "0")

--- a/tests/entrypoints/openai/reasoning_parsers/__init__.py
+++ b/tests/entrypoints/openai/reasoning_parsers/__init__.py
--- a/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py
+++ b/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py
@@ -3,74 +3,126 @@
 import pytest
 from transformers import AutoTokenizer
-from tests.entrypoints.openai.reasoning_parsers.utils import (
+from tests.reasoning.utils import run_reasoning_extraction
-    run_reasoning_extraction)
+from vllm.reasoning import ReasoningParser, ReasoningParserManager
-from vllm.entrypoints.openai.reasoning_parsers import (ReasoningParser,
-                                                       ReasoningParserManager)
 parser_name = "deepseek_r1"
 start_token = "<think>"
 end_token = "</think>"
+REASONING_MODEL_NAME = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
+@pytest.fixture(scope="module")
+def deepseek_r1_qwen_tokenizer():
+    return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
 SIMPLE_REASONING = {
    "output": "This is a reasoning section</think>This is the rest",
    "reasoning_content": "This is a reasoning section",
    "content": "This is the rest",
+    "is_reasoning_end": True,
 }
 COMPLETE_REASONING = {
    "output": "This is a reasoning section</think>",
    "reasoning_content": "This is a reasoning section",
    "content": None,
+    "is_reasoning_end": True,
 }
 NO_CONTENT = {
    "output": "This is content",
    "reasoning_content": "This is content",
    "content": None,
+    "is_reasoning_end": False,
 }
 NO_REASONING_STREAMING = {
    "output": "This is a reasoning section",
    "reasoning_content": "This is a reasoning section",
    "content": None,
+    "is_reasoning_end": False,
 }
 MULTIPLE_LINES = {
    "output": "This\nThat</think>This is the rest\nThat",
    "reasoning_content": "This\nThat",
    "content": "This is the rest\nThat",
+    "is_reasoning_end": True,
 }
 SHORTEST_REASONING_NO_STREAMING = {
    "output": "</think>This is the rest",
    "reasoning_content": "",
    "content": "This is the rest",
+    "is_reasoning_end": True,
 }
 SHORTEST_REASONING = {
    "output": "</think>This is the rest",
    "reasoning_content": None,
    "content": "This is the rest",
+    "is_reasoning_end": True,
 }
 REASONING_WITH_THINK = {
    "output": "<think>This is a reasoning section</think>This is the rest",
    "reasoning_content": "This is a reasoning section",
    "content": "This is the rest",
+    "is_reasoning_end": True,
 }
 COMPLETE_REASONING_WITH_THINK = {
    "output": "<think>This is a reasoning section</think>",
    "reasoning_content": "This is a reasoning section",
    "content": None,
+    "is_reasoning_end": True,
 }
 MULTIPLE_LINES_WITH_THINK = {
    "output": "<think>This\nThat</think>This is the rest\nThat",
    "reasoning_content": "This\nThat",
    "content": "This is the rest\nThat",
+    "is_reasoning_end": True,
 }
 SHORTEST_REASONING_NO_STREAMING_WITH_THINK = {
    "output": "</think>This is the rest",
    "reasoning_content": "",
    "content": "This is the rest",
+    "is_reasoning_end": True,
 }
 SHORTEST_REASONING_WITH_THINK = {
    "output": "</think>This is the rest",
    "reasoning_content": None,
    "content": "This is the rest",
+    "is_reasoning_end": True,
+}
+THINK_NO_END = {
+    "output": "<think>This is a reasoning section",
+    "reasoning_content": "This is a reasoning section",
+    "content": None,
+    "is_reasoning_end": False,
+}
+EMPTY = {
+    "output": "",
+    "reasoning_content": "",
+    "content": None,
+    "is_reasoning_end": False,
+}
+EMPTY_STREAMING = {
+    "output": "",
+    "reasoning_content": None,
+    "content": None,
+    "is_reasoning_end": False,
+}
+NEW_LINE = {
+    "output": "\n<think>This is a reasoning section</think>\nThis is the rest",
+    "reasoning_content": "This is a reasoning section",
+    "content": "\nThis is the rest",
+    "is_reasoning_end": True,
+}
+# Streaming cannot handle new lines at the beginning of the output
+# because we need to support <think>...</think> and </think>...
+# We cannot know if the text before <think> is reasoning content
+# or not.
+NEW_LINE_STREAMING = {
+    "output": "\n<think>This is a reasoning section</think>\nThis is the rest",
+    "reasoning_content": "\nThis is a reasoning section",
+    "content": "\nThis is the rest",
+    "is_reasoning_end": True,
 }
 TEST_CASES = [
@@ -164,25 +216,53 @@ TEST_CASES = [
        SHORTEST_REASONING_WITH_THINK,
        id="shortest_with_think_streaming",
    ),
+    pytest.param(
+        False,
+        THINK_NO_END,
+        id="think_no_end",
+    ),
+    pytest.param(
+        True,
+        THINK_NO_END,
+        id="think_no_end_streaming",
+    ),
+    pytest.param(
+        False,
+        EMPTY,
+        id="empty",
+    ),
+    pytest.param(
+        True,
+        EMPTY_STREAMING,
+        id="empty_streaming",
+    ),
+    pytest.param(
+        False,
+        NEW_LINE,
+        id="new_line",
+    ),
+    pytest.param(
+        True,
+        NEW_LINE_STREAMING,
+        id="new_line_streaming",
+    ),
 ]
-# Global tokenizer initialization to avoid repeated loading
-tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
-tokenizer.add_tokens([start_token, end_token])
 @pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
 def test_reasoning(
    streaming: bool,
    param_dict: dict,
+    deepseek_r1_qwen_tokenizer,
 ):
-    output = tokenizer.tokenize(param_dict["output"])
+    output = deepseek_r1_qwen_tokenizer.tokenize(param_dict["output"])
    # decode everything to tokens
    output_tokens: list[str] = [
-        tokenizer.convert_tokens_to_string([token]) for token in output
+        deepseek_r1_qwen_tokenizer.convert_tokens_to_string([token])
+        for token in output
    ]
    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(
-        parser_name)(tokenizer)
+        parser_name)(deepseek_r1_qwen_tokenizer)
    reasoning, content = run_reasoning_extraction(parser,
                                                  output_tokens,
@@ -190,3 +270,17 @@ def test_reasoning(
    assert reasoning == param_dict["reasoning_content"]
    assert content == param_dict["content"]
+    # Test is_reasoning_end
+    output_ids = deepseek_r1_qwen_tokenizer.convert_tokens_to_ids(output)
+    is_reasoning_end = parser.is_reasoning_end(output_ids)
+    assert is_reasoning_end == param_dict["is_reasoning_end"]
+    # Test extract_content
+    if param_dict["content"] is not None:
+        content = parser.extract_content_ids(output_ids)
+        assert content == deepseek_r1_qwen_tokenizer.convert_tokens_to_ids(
+            deepseek_r1_qwen_tokenizer.tokenize(param_dict["content"]))
+    else:
+        content = parser.extract_content_ids(output)
+        assert content == []
--- a/tests/reasoning/test_granite_reasoning_parser.py
+++ b/tests/reasoning/test_granite_reasoning_parser.py
+# SPDX-License-Identifier: Apache-2.0
+import pytest
+from transformers import AutoTokenizer
+from tests.reasoning.utils import DeltaMessage, run_reasoning_extraction
+from vllm.reasoning import ReasoningParser, ReasoningParserManager
+parser_name = "granite"
+START_REASONING = "Here is my thought process:"
+START_RESPONSE = "Here is my response:"
+SIMPLE_REASONING = {
+    "output":
+    f"{START_REASONING}This is a reasoning section{START_RESPONSE}This is the rest",  #noqa: E501
+    "reasoning_content": "This is a reasoning section",
+    "content": "This is the rest",
+}
+COMPLETE_REASONING = {
+    "output": f"{START_REASONING}This is a reasoning section{START_RESPONSE}",
+    "reasoning_content": "This is a reasoning section",
+    "content": None,
+}
+NO_REASONING = {
+    "output": "This is content",
+    "reasoning_content": None,
+    "content": "This is content",
+}
+MULTIPLE_LINES = {
+    "output":
+    f"{START_REASONING}This\nThat{START_RESPONSE}This is the rest\nThat",
+    "reasoning_content": "This\nThat",
+    "content": "This is the rest\nThat",
+}
+REASONING_WITH_THINK = {
+    "output":
+    f"{START_REASONING}This is a reasoning section{START_RESPONSE}This is the rest",  #noqa: E501
+    "reasoning_content": "This is a reasoning section",
+    "content": "This is the rest",
+}
+COMPLETE_REASONING_WITH_THINK = {
+    "output": f"{START_REASONING}This is a reasoning section{START_RESPONSE}",
+    "reasoning_content": "This is a reasoning section",
+    "content": None,
+}
+MULTIPLE_LINES_WITH_THINK = {
+    "output":
+    f"{START_REASONING}This\nThat{START_RESPONSE}This is the rest\nThat",
+    "reasoning_content": "This\nThat",
+    "content": "This is the rest\nThat",
+}
+TEST_CASES = [
+    pytest.param(
+        False,
+        SIMPLE_REASONING,
+        id="simple_reasoning",
+    ),
+    pytest.param(
+        False,
+        COMPLETE_REASONING,
+        id="complete_reasoning",
+    ),
+    pytest.param(
+        False,
+        NO_REASONING,
+        id="no_reasoning",
+    ),
+    pytest.param(
+        False,
+        MULTIPLE_LINES,
+        id="multiple_lines",
+    ),
+    pytest.param(
+        False,
+        REASONING_WITH_THINK,
+        id="reasoning_with_think",
+    ),
+    pytest.param(
+        False,
+        COMPLETE_REASONING_WITH_THINK,
+        id="complete_reasoning_with_think",
+    ),
+    pytest.param(
+        False,
+        MULTIPLE_LINES_WITH_THINK,
+        id="multiple_lines_with_think",
+    ),
+    pytest.param(
+        True,
+        SIMPLE_REASONING,
+        id="simple_reasoning_streaming",
+    ),
+    pytest.param(
+        True,
+        COMPLETE_REASONING,
+        id="complete_reasoning_streaming",
+    ),
+    pytest.param(
+        True,
+        NO_REASONING,
+        id="no_reasoning_streaming",
+    ),
+    pytest.param(
+        True,
+        MULTIPLE_LINES,
+        id="multiple_lines_streaming",
+    ),
+    pytest.param(
+        True,
+        REASONING_WITH_THINK,
+        id="reasoning_with_think_streaming",
+    ),
+    pytest.param(
+        True,
+        COMPLETE_REASONING_WITH_THINK,
+        id="complete_reasoning_with_think_streaming",
+    ),
+    pytest.param(
+        True,
+        MULTIPLE_LINES_WITH_THINK,
+        id="multiple_lines_with_think_streaming",
+    ),
+]
+# Global tokenizer initialization to avoid repeated loading
+tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
+@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
+def test_reasoning(
+    streaming: bool,
+    param_dict: dict,
+):
+    output = tokenizer.tokenize(param_dict["output"])
+    # decode everything to tokens
+    output_tokens: list[str] = [
+        tokenizer.convert_tokens_to_string([token]) for token in output
+    ]
+    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(
+        parser_name)(tokenizer)
+    reasoning, content = run_reasoning_extraction(parser,
+                                                  output_tokens,
+                                                  streaming=streaming)
+    assert reasoning == param_dict["reasoning_content"]
+    assert content == param_dict["content"]
+# Additional tests for verifying the correctness of granite streaming; this
+# is complicated because granite uses multiple tokens to indicate when thinking
+# is starting / when it's starting its response, so skipping special tokens
+# is awkward.
+### Handling the start of reasoning
+STREAMING_1 = {
+    "previous_text": None,
+    "current_text": "Here",
+    "delta_text": "Here",
+    "reasoning_content": None,
+    "content": None,
+}
+# When we fail, we should give what was previously being silenced first
+STREAMING_2 = {
+    "previous_text": "Here is my thought",
+    "current_text": "Here is my thought failure",
+    "delta_text": " failure",
+    "reasoning_content": None,
+    "content": "Here is my thought failure",
+}
+# But then after the first one, we should only add the delta text to content
+STREAMING_3 = {
+    "previous_text": "Here wrong",
+    "current_text": " words",
+    "delta_text": " Here wrong words",
+    "reasoning_content": None,
+    "content": " words",
+}
+# But then after the first one, we should only add the delta text to content
+STREAMING_4 = {
+    "previous_text": "Here is my thought",
+    "current_text": "Here is my thought process:",
+    "delta_text": " process:",
+    "reasoning_content": None,
+    "content": None,
+}
+# Reasoning started successfully; parse reasoning content
+STREAMING_5 = {
+    "previous_text": "Here is my thought process:",
+    "current_text": "Here is my thought process: foo",
+    "delta_text": " foo",
+    "reasoning_content": " foo",
+    "content": None,
+}
+# Response special sequence has started, but not finished.
+STREAMING_6 = {
+    "previous_text": "Here is my thought process: foo",
+    "current_text": "Here is my thought process: foo Here is",
+    "delta_text": " Here is",
+    "reasoning_content": " ",
+    "content": None,
+}
+# Response special sequence started, but was broken; the reasoning
+# content should be the content that was previously unused.
+STREAMING_7 = {
+    "previous_text": "Here is my thought process: foo Here is",
+    "current_text": "Here is my thought process: foo Here is Here",
+    "delta_text": " Here",
+    "reasoning_content": "Here is ",
+    "content": None,
+}
+# Response special sequence is ongoing
+STREAMING_8 = {
+    "previous_text": "Here is my thought process: foo Here is my response:",
+    "current_text": "Here is my thought process: foo Here is my response: bar",
+    "delta_text": " bar",
+    "reasoning_content": None,
+    "content": " bar",
+}
+# The delta text has everything; we should be able to correctly parse both
+STREAMING_9 = {
+    "previous_text": None,
+    "current_text": "Here is my thought process: foo Here is my response: bar",
+    "delta_text": "Here is my thought process: foo Here is my response: bar",
+    "reasoning_content": " foo ",
+    "content": " bar",
+}
+## The Response is ongoing, and the delta mixes reasoning content / content
+STREAMING_10 = {
+    "previous_text": "Here is my thought process: foo",
+    "current_text":
+    "Here is my thought process: foo bar Here is my response: baz",
+    "delta_text": " bar Here is my response: baz",
+    "reasoning_content": " bar ",
+    "content": " baz",
+}
+# The delta text starts a new substring that might be a response special seq
+STREAMING_11 = {
+    "previous_text":
+    "Here is my thought process: This is a reasoning section ",
+    "current_text":
+    "Here is my thought process: This is a reasoning section Here",
+    "delta_text": "Here",
+    "reasoning_content": None,
+    "content": None,
+}
+# The delta text is finishing the response special seq
+STREAMING_12 = {
+    "previous_text": "Here is my thought process: foo Here is my response",
+    "current_text": "Here is my thought process: foo Here is my response:",
+    "delta_text": ":",
+    "reasoning_content": None,
+    "content": None,
+}
+STREAMING_13 = {
+    "previous_text": "Here is my thought process: foo Here",
+    "current_text": "Here is my thought process: foo Here was",
+    "delta_text": " was",
+    "reasoning_content": "Here was",
+    "content": None,
+}
+STREAMING_SUBCASES = [
+    pytest.param(
+        STREAMING_1,
+        id="Starting reasoning special sequence",
+    ),
+    pytest.param(
+        STREAMING_2,
+        id="Unexpected start reasoning sequence",
+    ),
+    pytest.param(
+        STREAMING_3,
+        id="Continuing unexpected start reasoning sequence",
+    ),
+    pytest.param(
+        STREAMING_4,
+        id="Only start reasoning sequence and nothing else",
+    ),
+    pytest.param(
+        STREAMING_5,
+        id="Reasoning content has started",
+    ),
+    pytest.param(
+        STREAMING_6,
+        id="Response special sequence has started",
+    ),
+    pytest.param(
+        STREAMING_7,
+        id="Response special sequence reset",
+    ),
+    pytest.param(
+        STREAMING_8,
+        id="Response text has started",
+    ),
+    pytest.param(
+        STREAMING_9,
+        id="Delta contains everything",
+    ),
+    pytest.param(
+        STREAMING_10,
+        id="Delta contains some reasoning and response",
+    ),
+    pytest.param(
+        STREAMING_11,
+        id="Delta starts response sequence",
+    ),
+    pytest.param(
+        STREAMING_12,
+        id="Delta finishes response sequence",
+    ),
+    pytest.param(
+        STREAMING_13,
+        id="Delta breaks potential responise sequence",
+    ),
+]
+@pytest.mark.parametrize("param_dict", STREAMING_SUBCASES)
+def test_streaming_subcases(param_dict):
+    # Get all of the token IDs
+    previous_token_ids = tokenizer.encode(
+        param_dict["previous_text"]
+    ) if param_dict["previous_text"] is not None else []
+    current_token_ids = tokenizer.encode(param_dict["current_text"])
+    delta_token_ids = tokenizer.encode(param_dict["delta_text"])
+    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(
+        parser_name)(tokenizer)
+    response = parser.extract_reasoning_content_streaming(
+        previous_text=param_dict["previous_text"],
+        current_text=param_dict["current_text"],
+        delta_text=param_dict["delta_text"],
+        previous_token_ids=previous_token_ids,
+        current_token_ids=current_token_ids,
+        delta_token_ids=delta_token_ids,
+    )
+    # Streaming currently expects at least one of reasoning content / content,
+    # so the response should return None in that case.
+    if param_dict["reasoning_content"] is None and param_dict[
+            "content"] is None:
+        assert response is None
+    else:
+        assert isinstance(response, DeltaMessage)
+        assert param_dict["reasoning_content"] == response.reasoning_content
+        assert param_dict["content"] == response.content
--- a/tests/entrypoints/openai/reasoning_parsers/utils.py
+++ b/tests/entrypoints/openai/reasoning_parsers/utils.py
@@ -4,7 +4,7 @@ from typing import Optional, Union
 from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                              DeltaMessage)
-from vllm.entrypoints.openai.reasoning_parsers import ReasoningParser
+from vllm.reasoning import ReasoningParser
 class StreamingReasoningReconstructor:

--- a/tests/spec_decode/e2e/test_integration_dist_tp2.py
+++ b/tests/spec_decode/e2e/test_integration_dist_tp2.py
@@ -3,6 +3,7 @@
 tensor parallelism.
 """
+import json
 from typing import Optional
 import pytest
@@ -30,14 +31,14 @@ from ...utils import models_path_prefix
 @pytest.mark.parametrize("test_llm_kwargs", [
    [
        "--speculative_config",
-        str({
+        json.dumps({
            "model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
            "num_speculative_tokens": 3,
        }),
    ],
    [
        "--speculative_config",
-        str({
+        json.dumps({
            "model": "ngram",
            "num_speculative_tokens": 5,
            "prompt_lookup_max": 3,
@@ -90,7 +91,7 @@ def test_target_model_tp_gt_1(common_llm_kwargs, per_test_common_llm_kwargs,
    "model, test_llm_kwargs",
    [(os.path.join(models_path_prefix, "JackFram/llama-68m"), [
        "--speculative_config",
-        str({
+        json.dumps({
            "model": os.path.join(models_path_prefix, "JackFram/llama-68m"),
            "num_speculative_tokens": 5,
            "draft_tensor_parallel_size": 1,
@@ -98,7 +99,7 @@ def test_target_model_tp_gt_1(common_llm_kwargs, per_test_common_llm_kwargs,
    ]),
     (os.path.join(models_path_prefix, "ibm-granite/granite-3b-code-instruct"), [
         "--speculative_config",
-         str({
+         json.dumps({
             "model": os.path.join(models_path_prefix, "ibm-granite/granite-3b-code-instruct"),
             "num_speculative_tokens": 5,
             "draft_tensor_parallel_size": 1,
@@ -149,20 +150,20 @@ def test_draft_model_tp_lt_target_model_tp2(model, common_llm_kwargs,
 @pytest.mark.parametrize("model, test_llm_kwargs",
                         [("JackFram/llama-68m", [
                             "--speculative_config",
-                             str({
+                             json.dumps({
                                 "model": "JackFram/llama-68m",
                                 "num_speculative_tokens": 3,
                             }),
                         ]),
                          ("JackFram/llama-68m", [
                              "--speculative_config",
-                              str({
+                              json.dumps({
                                  "model": "JackFram/llama-68m",
                                  "num_speculative_tokens": 3,
                                  "draft_tensor_parallel_size": 1,
                              }),
                          ])])
-@pytest.mark.parametrize("logprobs", [None, 2])
+@pytest.mark.parametrize("logprobs", [None])
 @pytest.mark.parametrize("batch_size", [2])
 @pytest.mark.parametrize("seed", [1])
 def test_spec_decode_chunked_prefill_tp2(model, common_llm_kwargs,
@@ -173,9 +174,68 @@ def test_spec_decode_chunked_prefill_tp2(model, common_llm_kwargs,
    """Verify spec decode works well with same and different TP size for
    the draft model with chunked prefill.
    """
-    if logprobs:
+    run_equality_correctness_test_tp(model,
-        test_llm_kwargs.extend(
+                                     common_llm_kwargs,
-            ["--disable_logprobs_during_spec_decoding", "False"])
+                                     per_test_common_llm_kwargs,
+                                     baseline_llm_kwargs,
+                                     test_llm_kwargs,
+                                     batch_size,
+                                     max_output_len=32,
+                                     seed=seed,
+                                     temperature=0.0,
+                                     logprobs=logprobs)
+@pytest.mark.skipif(torch.cuda.device_count() < 2,
+                    reason="Need at least 2 GPUs to run the test.")
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [[
+        # Skip cuda graph recording for fast test.
+        "--enforce-eager",
+        "--tensor_parallel_size",
+        "2",
+        # precision
+        "--dtype",
+        "bfloat16",
+    ]])
+@pytest.mark.parametrize(
+    "per_test_common_llm_kwargs",
+    [["--enable-chunked-prefill", "False"],
+     [
+         "--enable-chunked-prefill", "True", "--max-num-batched-tokens", "4",
+         "--max-num-seqs", "4"
+     ]])
+@pytest.mark.parametrize("baseline_llm_kwargs", [[]])
+@pytest.mark.parametrize("model, test_llm_kwargs",
+                         [("JackFram/llama-68m", [
+                             "--speculative_config",
+                             json.dumps({
+                                 "model": "JackFram/llama-68m",
+                                 "num_speculative_tokens": 3,
+                                 "disable_logprobs": False,
+                             }),
+                         ]),
+                          ("JackFram/llama-68m", [
+                              "--speculative_config",
+                              json.dumps({
+                                  "model": "JackFram/llama-68m",
+                                  "num_speculative_tokens": 3,
+                                  "draft_tensor_parallel_size": 1,
+                                  "disable_logprobs": False,
+                              }),
+                          ])])
+@pytest.mark.parametrize("logprobs", [2])
+@pytest.mark.parametrize("batch_size", [2])
+@pytest.mark.parametrize("seed", [1])
+def test_spec_decode_chunked_prefill_tp2_with_logprobs(
+        model, common_llm_kwargs, per_test_common_llm_kwargs,
+        baseline_llm_kwargs, test_llm_kwargs, logprobs: Optional[int],
+        batch_size: int, seed: int):
+    """Verify spec decode works well with same and different TP size for
+    the draft model with chunked prefill.
+    """
    run_equality_correctness_test_tp(model,
                                     common_llm_kwargs,
                                     per_test_common_llm_kwargs,

--- a/tests/spec_decode/e2e/test_integration_dist_tp4.py
+++ b/tests/spec_decode/e2e/test_integration_dist_tp4.py
@@ -3,6 +3,8 @@
 tensor parallelism.
 """
+import json
 import openai
 import pytest
 import torch
@@ -35,7 +37,7 @@ SPEC_MODEL = os.path.join(models_path_prefix, "JackFram/llama-68m")
        #TODO(wooyeon): add spec_draft_dp=2 case
        [
            "--speculative_config",
-            str({
+            json.dumps({
                "model": f"{SPEC_MODEL}",
                "num_speculative_tokens": 5,
                "draft_tensor_parallel_size": 1,
@@ -82,7 +84,7 @@ def test_draft_model_tp_lt_target_model_tp4(common_llm_kwargs,
            # Artificially limit the draft model max model len; this forces vLLM
            # to skip speculation once the sequences grow beyond 32-k tokens.
            "--speculative_config",
-            str({
+            json.dumps({
                "model": f"{SPEC_MODEL}",
                "num_speculative_tokens": 5,
                "max_model_len": 32,

--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -2,19 +2,22 @@
 # ruff: noqa
 import asyncio
+import hashlib
+import pickle
 import socket
 from collections.abc import AsyncIterator
 from unittest.mock import patch
 import pytest
 import torch
-from vllm_test_utils import monitor
+from vllm_test_utils.monitor import monitor
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
 from vllm.utils import (FlexibleArgumentParser, MemorySnapshot,
                        PlaceholderModule, StoreBoolean, bind_kv_cache,
                        deprecate_kwargs, get_open_port, memory_profiling,
-                        merge_async_iterators, supports_kw, swap_dict_values)
+                        merge_async_iterators, sha256, supports_kw,
+                        swap_dict_values)
 from .utils import create_new_process_for_each_test, error_on_warning
 from .utils import models_path_prefix
@@ -141,7 +144,8 @@ def parser():
 def parser_with_config():
    parser = FlexibleArgumentParser()
    parser.add_argument('serve')
-    parser.add_argument('model_tag')
+    parser.add_argument('model_tag', nargs='?')
+    parser.add_argument('--model', type=str)
    parser.add_argument('--served-model-name', type=str)
    parser.add_argument('--config', type=str)
    parser.add_argument('--port', type=int)
@@ -198,29 +202,29 @@ def test_missing_required_argument(parser):
        parser.parse_args([])
-def test_cli_override_to_config(parser_with_config):
+def test_cli_override_to_config(parser_with_config, cli_config_file):
    args = parser_with_config.parse_args([
-        'serve', 'mymodel', '--config', './data/test_config.yaml',
+        'serve', 'mymodel', '--config', cli_config_file,
        '--tensor-parallel-size', '3'
    ])
    assert args.tensor_parallel_size == 3
    args = parser_with_config.parse_args([
        'serve', 'mymodel', '--tensor-parallel-size', '3', '--config',
-        './data/test_config.yaml'
+        cli_config_file
    ])
    assert args.tensor_parallel_size == 3
    assert args.port == 12312
    args = parser_with_config.parse_args([
        'serve', 'mymodel', '--tensor-parallel-size', '3', '--config',
-        './data/test_config.yaml', '--port', '666'
+        cli_config_file, '--port', '666'
    ])
    assert args.tensor_parallel_size == 3
    assert args.port == 666
-def test_config_args(parser_with_config):
+def test_config_args(parser_with_config, cli_config_file):
    args = parser_with_config.parse_args(
-        ['serve', 'mymodel', '--config', './data/test_config.yaml'])
+        ['serve', 'mymodel', '--config', cli_config_file])
    assert args.tensor_parallel_size == 2
    assert args.trust_remote_code
    assert not args.multi_step_stream_outputs
@@ -242,10 +246,9 @@ def test_config_file(parser_with_config):
        ])
-def test_no_model_tag(parser_with_config):
+def test_no_model_tag(parser_with_config, cli_config_file):
    with pytest.raises(ValueError):
-        parser_with_config.parse_args(
+        parser_with_config.parse_args(['serve', '--config', cli_config_file])
-            ['serve', '--config', './data/test_config.yaml'])
 # yapf: enable
@@ -478,3 +481,63 @@ def test_swap_dict_values(obj, key1, key2):
        assert obj[key1] == original_obj[key2]
    else:
        assert key1 not in obj
+def test_model_specification(parser_with_config,
+                             cli_config_file,
+                             cli_config_file_with_model):
+    # Test model in CLI takes precedence over config
+    args = parser_with_config.parse_args([
+        'serve', 'cli-model', '--config', cli_config_file_with_model
+    ])
+    assert args.model_tag == 'cli-model'
+    assert args.served_model_name == 'mymodel'
+    # Test model from config file works
+    args = parser_with_config.parse_args([
+        'serve', '--config', cli_config_file_with_model,
+    ])
+    assert args.model == 'config-model'
+    assert args.served_model_name == 'mymodel'
+    # Test no model specified anywhere raises error
+    with pytest.raises(ValueError, match="No model specified!"):
+        parser_with_config.parse_args(['serve', '--config', cli_config_file])
+    # Test using --model option raises error
+    with pytest.raises(
+        ValueError,
+        match=(
+            "With `vllm serve`, you should provide the model as a positional "
+            "argument or in a config file instead of via the `--model` option."
+        ),
+    ):
+        parser_with_config.parse_args(['serve', '--model', 'my-model'])
+    # Test other config values are preserved
+    args = parser_with_config.parse_args([
+        'serve', 'cli-model', '--config', cli_config_file_with_model,
+    ])
+    assert args.tensor_parallel_size == 2
+    assert args.trust_remote_code is True
+    assert args.multi_step_stream_outputs is False
+    assert args.port == 12312
+@pytest.mark.parametrize("input", [(), ("abc", ), (None, ),
+                                    (None, bool, [1, 2, 3])])
+@pytest.mark.parametrize("output", [0, 1, 2])
+def test_sha256(input: tuple, output: int):
+    hash = sha256(input)
+    assert hash is not None
+    assert isinstance(hash, int)
+    assert hash != 0
+    bytes = pickle.dumps(input, protocol=pickle.HIGHEST_PROTOCOL)
+    assert hash == int.from_bytes(hashlib.sha256(bytes).digest(), byteorder="big")
+    # hashing again, returns the same value
+    assert hash == sha256(input)
+    # hashing different input, returns different value
+    assert hash != sha256(input + (1, ))
--- a/tests/tool_use/test_chat_completion_request_validations.py
+++ b/tests/tool_use/test_chat_completion_request_validations.py
@@ -45,7 +45,8 @@ def test_chat_completion_request_with_no_tools():
    assert request.tool_choice == 'none'
-def test_chat_completion_request_with_tool_choice_but_no_tools():
+@pytest.mark.parametrize('tool_choice', ['auto', 'required'])
+def test_chat_completion_request_with_tool_choice_but_no_tools(tool_choice):
    with pytest.raises(ValueError,
                       match="When using `tool_choice`, `tools` must be set."):
        ChatCompletionRequest.model_validate({
@@ -56,7 +57,7 @@ def test_chat_completion_request_with_tool_choice_but_no_tools():
            'model':
            os.path.join(models_path_prefix, 'facebook/opt-125m'),
            'tool_choice':
-            'auto'
+            tool_choice
        })
    with pytest.raises(ValueError,
@@ -69,7 +70,7 @@ def test_chat_completion_request_with_tool_choice_but_no_tools():
            'model':
            os.path.join(models_path_prefix, 'facebook/opt-125m'),
            'tool_choice':
-            'auto',
+            tool_choice,
            'tools':
            None
        })