Merge tag 'v0.14.0' into v0.14.0-dev

7e63ef82 · zhuwenwen · 8cbcac5d · b17039bc · 7e63ef82 · 7e63ef82
Commit 7e63ef82 authored Jan 21, 2026 by zhuwenwen
20 changed files
--- a/tests/multimodal/test_cache.py
+++ b/tests/multimodal/test_cache.py
@@ -24,10 +24,12 @@ from vllm.multimodal.cache import (
 )
 from vllm.multimodal.hasher import MultiModalHasher
 from vllm.multimodal.inputs import (
+    MultiModalFeatureSpec,
    MultiModalFieldElem,
    MultiModalKwargsItem,
    MultiModalKwargsItems,
    MultiModalSharedField,
+    PlaceholderRange,
 )
 from vllm.multimodal.processing import PromptInsertion
 from vllm.utils.mem_constants import GiB_bytes, MiB_bytes
@@ -518,3 +520,40 @@ def test_cache_eviction_shm_cache():
    receiver_cache = ShmObjectStoreReceiverCache(vllm_config, mp.Lock())

    _run_test_cache_eviction_shm(sender_cache, receiver_cache, base_item_size=MiB_bytes)
+
+
+def test_processor_cache_shared_across_loras():
+    """Test that processor cache uses mm_hash to share data across LoRAs.
+
+    Two features are built with different ``identifier`` values but the same
+    ``mm_hash``. The first is sent with data, the second with ``data=None``;
+    the test expects the second to come back populated with the first
+    feature's data.
+    """
+    model_config = ModelConfig(
+        model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
+        mm_processor_cache_gb=1,
+    )
+    receiver_cache = MultiModalReceiverCache(model_config)
+
+    # Same content hash, different prefixes (presumably one per LoRA adapter --
+    # the prefix values themselves are arbitrary here).
+    base_mm_hash = "image_hash_abc123"
+    lora_a_identifier = f"12345:{base_mm_hash}"
+    lora_b_identifier = f"67890:{base_mm_hash}"
+
+    item_data = MultiModalKwargsItem.dummy("test_image", nbytes=1024)
+
+    # First request: carries the actual item data.
+    feature_lora_a = MultiModalFeatureSpec(
+        data=item_data,
+        modality="image",
+        identifier=lora_a_identifier,
+        mm_position=PlaceholderRange(offset=0, length=100),
+        mm_hash=base_mm_hash,
+    )
+
+    receiver_cache.get_and_update_features([feature_lora_a])
+    # The entry is expected to be keyed by mm_hash, not by the identifier.
+    assert base_mm_hash in receiver_cache._cache
+
+    # Second request: different identifier, same mm_hash, and no data attached.
+    feature_lora_b = MultiModalFeatureSpec(
+        data=None,
+        modality="image",
+        identifier=lora_b_identifier,
+        mm_position=PlaceholderRange(offset=0, length=100),
+        mm_hash=base_mm_hash,
+    )
+
+    receiver_cache.get_and_update_features([feature_lora_b])
+    # The missing data must have been filled in with the item stored above.
+    assert feature_lora_b.data == item_data
--- a/tests/multimodal/test_embedding_shape_validation_unit.py
+++ b/tests/multimodal/test_embedding_shape_validation_unit.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Unit tests for embedding shape validation.
+
+Simple, fast unit tests that can run without server fixtures.
+Run with: pytest tests/multimodal/test_embedding_shape_validation_unit.py -v
+"""
+
+import pytest
+import torch
+
+from vllm.multimodal.parse import (
+    AudioEmbeddingItems,
+    ImageEmbeddingItems,
+)
+
+
+class TestImageEmbedBasicValidation:
+    """Test basic ndim validation in image embeddings via ImageEmbeddingItems.
+
+    Covers the accepted inputs (a 2D tensor, a 3D tensor, a list of 2D
+    tensors), the rejected ranks (1D and 4D), and the optional
+    ``expected_hidden_size`` check.
+    """
+
+    def test_valid_2d_tensor_accepted(self):
+        """Baseline: 2D tensors should be accepted."""
+        valid_tensor = torch.randn(10, 768, dtype=torch.float32)
+
+        # Should not raise - 2D is valid
+        items = ImageEmbeddingItems(valid_tensor)
+        # The count is expected to match the leading dimension.
+        assert items.get_count() == 10
+
+    def test_valid_3d_tensor_accepted(self):
+        """Baseline: 3D tensors should be accepted."""
+        valid_tensor = torch.randn(2, 10, 768, dtype=torch.float32)
+
+        # Should not raise - 3D is valid
+        items = ImageEmbeddingItems(valid_tensor)
+        # The count is expected to match the leading dimension.
+        assert items.get_count() == 2
+
+    def test_valid_list_of_2d_tensors_accepted(self):
+        """Baseline: List of 2D tensors should be accepted."""
+        # The two tensors differ in their first dimension (10 vs 15) but share
+        # the same last dimension.
+        tensors = [
+            torch.randn(10, 768, dtype=torch.float32),
+            torch.randn(15, 768, dtype=torch.float32),
+        ]
+
+        # Should not raise
+        items = ImageEmbeddingItems(tensors)
+        # The count is expected to match the number of list entries.
+        assert items.get_count() == 2
+
+    def test_1d_tensor_rejected(self):
+        """Security: 1D tensors should be rejected (invalid ndim)."""
+        invalid_tensor = torch.randn(768, dtype=torch.float32)  # 1D
+
+        with pytest.raises(ValueError) as exc_info:
+            ImageEmbeddingItems(invalid_tensor)
+
+        # Loose match: passes if the message contains either "must be 2D" or "3D".
+        assert "must be 2D" in str(exc_info.value) or "3D" in str(exc_info.value)
+
+    def test_4d_tensor_rejected(self):
+        """Security: 4D tensors should be rejected (invalid ndim)."""
+        invalid_tensor = torch.randn(1, 2, 10, 768, dtype=torch.float32)  # 4D
+
+        with pytest.raises(ValueError) as exc_info:
+            ImageEmbeddingItems(invalid_tensor)
+
+        # Loose match: passes if the message contains either "must be 2D" or "3D".
+        assert "must be 2D" in str(exc_info.value) or "3D" in str(exc_info.value)
+
+    def test_hidden_size_validation_correct_size(self):
+        """Embeddings with correct hidden size should be accepted."""
+        expected_hidden_size = 768
+        valid_tensor = torch.randn(10, expected_hidden_size, dtype=torch.float32)
+
+        # Should not raise
+        items = ImageEmbeddingItems(
+            valid_tensor, expected_hidden_size=expected_hidden_size
+        )
+        assert items.get_count() == 10
+
+    def test_hidden_size_validation_wrong_size_rejected(self):
+        """Embeddings with wrong hidden size should be rejected."""
+        expected_hidden_size = 768
+        wrong_hidden_size = 4096
+        invalid_tensor = torch.randn(10, wrong_hidden_size, dtype=torch.float32)
+
+        with pytest.raises(ValueError) as exc_info:
+            ImageEmbeddingItems(
+                invalid_tensor, expected_hidden_size=expected_hidden_size
+            )
+
+        # The message must name the mismatch and include both the received and
+        # the expected size.
+        error_msg = str(exc_info.value)
+        assert "hidden dimension mismatch" in error_msg.lower()
+        assert str(wrong_hidden_size) in error_msg
+        assert str(expected_hidden_size) in error_msg
+
+
+class TestAudioEmbedBasicValidation:
+    """Test basic ndim validation in audio embeddings via AudioEmbeddingItems.
+
+    Covers the accepted inputs (a 2D tensor, a 3D tensor, a list of 2D
+    tensors), the rejected ranks (1D and 0D scalar), and the optional
+    ``expected_hidden_size`` check.
+    """
+
+    def test_valid_2d_tensor_accepted(self):
+        """Baseline: 2D tensors should be accepted."""
+        valid_tensor = torch.randn(10, 768, dtype=torch.float32)
+
+        # Should not raise - 2D is valid
+        items = AudioEmbeddingItems(valid_tensor)
+        # The count is expected to match the leading dimension.
+        assert items.get_count() == 10
+
+    def test_valid_3d_tensor_accepted(self):
+        """Baseline: 3D tensors should be accepted."""
+        valid_tensor = torch.randn(2, 10, 768, dtype=torch.float32)
+
+        # Should not raise - 3D is valid
+        items = AudioEmbeddingItems(valid_tensor)
+        # The count is expected to match the leading dimension.
+        assert items.get_count() == 2
+
+    def test_valid_list_of_2d_tensors_accepted(self):
+        """Baseline: List of 2D tensors should be accepted."""
+        # The two tensors differ in their first dimension (10 vs 15) but share
+        # the same last dimension.
+        tensors = [
+            torch.randn(10, 768, dtype=torch.float32),
+            torch.randn(15, 768, dtype=torch.float32),
+        ]
+
+        # Should not raise
+        items = AudioEmbeddingItems(tensors)
+        # The count is expected to match the number of list entries.
+        assert items.get_count() == 2
+
+    def test_1d_tensor_rejected(self):
+        """Security: 1D tensors should be rejected (invalid ndim)."""
+        invalid_tensor = torch.randn(768, dtype=torch.float32)  # 1D
+
+        with pytest.raises(ValueError) as exc_info:
+            AudioEmbeddingItems(invalid_tensor)
+
+        # Loose match: passes if the message contains either "must be 2D" or "3D".
+        assert "must be 2D" in str(exc_info.value) or "3D" in str(exc_info.value)
+
+    def test_scalar_rejected(self):
+        """Security: Scalar tensors should be rejected."""
+        invalid_tensor = torch.tensor(1.0)  # 0D (scalar)
+
+        # Only the exception type is checked here, not the message.
+        with pytest.raises(ValueError):
+            AudioEmbeddingItems(invalid_tensor)
+
+    def test_hidden_size_validation_correct_size(self):
+        """Embeddings with correct hidden size should be accepted."""
+        expected_hidden_size = 768
+        valid_tensor = torch.randn(10, expected_hidden_size, dtype=torch.float32)
+
+        # Should not raise
+        items = AudioEmbeddingItems(
+            valid_tensor, expected_hidden_size=expected_hidden_size
+        )
+        assert items.get_count() == 10
+
+    def test_hidden_size_validation_wrong_size_rejected(self):
+        """Embeddings with wrong hidden size should be rejected."""
+        expected_hidden_size = 768
+        wrong_hidden_size = 4096
+        invalid_tensor = torch.randn(10, wrong_hidden_size, dtype=torch.float32)
+
+        with pytest.raises(ValueError) as exc_info:
+            AudioEmbeddingItems(
+                invalid_tensor, expected_hidden_size=expected_hidden_size
+            )
+
+        # The message must name the mismatch and include both the received and
+        # the expected size.
+        error_msg = str(exc_info.value)
+        assert "hidden dimension mismatch" in error_msg.lower()
+        assert str(wrong_hidden_size) in error_msg
+        assert str(expected_hidden_size) in error_msg
+
+
+class TestShapeValidationDoSPrevention:
+    """
+    Tests for DoS prevention through shape validation.
+
+    Verifies that embeddings with incorrect shapes are rejected early,
+    preventing crashes during model inference.
+
+    Every test here constructs an embedding-items object with an
+    ``expected_hidden_size`` of 768 and a tensor whose last dimension differs,
+    and expects a ``ValueError`` at construction time.
+    """
+
+    def test_prevent_crash_from_wrong_shape_image_embeds(self):
+        """
+        Prevent crash scenario: wrong hidden size in image embeddings.
+
+        Without validation, this would pass initial checks but crash later
+        during model forward pass when dimensions don't match.
+        """
+        expected_hidden_size = 768  # Typical model hidden size
+        wrong_hidden_size = 4096  # Wrong size (e.g., Llama-sized)
+
+        wrong_embedding = torch.randn(100, wrong_hidden_size, dtype=torch.float32)
+
+        # Should be rejected at instantiation time, not during inference
+        with pytest.raises(ValueError) as exc_info:
+            ImageEmbeddingItems(
+                wrong_embedding, expected_hidden_size=expected_hidden_size
+            )
+
+        error_msg = str(exc_info.value)
+        assert "hidden dimension mismatch" in error_msg.lower()
+        assert str(expected_hidden_size) in error_msg  # Expected
+        assert str(wrong_hidden_size) in error_msg  # Received
+
+    def test_prevent_crash_from_wrong_shape_audio_embeds(self):
+        """
+        Prevent crash scenario: wrong hidden size in audio embeddings.
+        """
+        expected_hidden_size = 768
+        wrong_hidden_size = 4096
+
+        wrong_embedding = torch.randn(100, wrong_hidden_size, dtype=torch.float32)
+
+        with pytest.raises(ValueError) as exc_info:
+            AudioEmbeddingItems(
+                wrong_embedding, expected_hidden_size=expected_hidden_size
+            )
+
+        # Unlike the image variant, only the message phrase is checked here,
+        # not the two sizes.
+        error_msg = str(exc_info.value)
+        assert "hidden dimension mismatch" in error_msg.lower()
+
+    def test_extremely_large_hidden_size_rejected(self):
+        """Security: Prevent DoS from extremely large embeddings."""
+        expected_hidden_size = 768
+        huge_hidden_size = 100000  # Large but not extreme to avoid test OOM
+
+        invalid_tensor = torch.randn(10, huge_hidden_size, dtype=torch.float32)
+
+        with pytest.raises(ValueError) as exc_info:
+            ImageEmbeddingItems(
+                invalid_tensor, expected_hidden_size=expected_hidden_size
+            )
+
+        assert "hidden dimension mismatch" in str(exc_info.value).lower()
+
+    def test_batch_with_mixed_hidden_sizes_rejected(self):
+        """All embeddings in a list must have the same hidden size."""
+        expected_hidden_size = 768
+
+        # One correct, one wrong
+        batch = [
+            torch.randn(10, expected_hidden_size, dtype=torch.float32),
+            torch.randn(10, expected_hidden_size + 100, dtype=torch.float32),  # Wrong!
+        ]
+
+        # Should fail on the second one
+        # (the assertion below only checks that a mismatch is reported, not
+        # which list entry triggered it).
+        with pytest.raises(ValueError) as exc_info:
+            ImageEmbeddingItems(batch, expected_hidden_size=expected_hidden_size)
+
+        assert "hidden dimension mismatch" in str(exc_info.value).lower()
+
+
+# Allow running this file directly: hands the file to pytest with verbose
+# output and short tracebacks.
+if __name__ == "__main__":
+    pytest.main([__file__, "-v", "--tb=short"])
--- a/tests/multimodal/test_image.py
+++ b/tests/multimodal/test_image.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pickle
 from pathlib import Path

 import numpy as np
 import pytest
 from PIL import Image, ImageChops

+from vllm.multimodal.base import MediaWithBytes
 from vllm.multimodal.image import ImageMediaIO, convert_image_mode

 pytestmark = pytest.mark.cpu_test
@@ -157,3 +159,34 @@ def test_rgba_background_color_validation():
    ImageMediaIO(rgba_background_color=(0, 0, 0))  # Should not raise
    ImageMediaIO(rgba_background_color=[255, 255, 255])  # Should not raise
    ImageMediaIO(rgba_background_color=(128, 128, 128))  # Should not raise
+
+
+def test_media_with_bytes_pickle_roundtrip():
+    """Regression test for pickle/unpickle of MediaWithBytes.
+
+    Verifies that MediaWithBytes can be pickled and unpickled without
+    RecursionError. See: https://github.com/vllm-project/vllm/issues/30818
+
+    Also checks that attribute access on the wrapper (``width``, ``height``,
+    ``mode``) yields the wrapped image's values both before and after the
+    round trip.
+    """
+    original_image = Image.open(ASSETS_DIR / "image1.png").convert("RGB")
+    # Arbitrary marker bytes; not the encoded form of the image above.
+    original_bytes = b"test_bytes_data"
+
+    wrapper = MediaWithBytes(media=original_image, original_bytes=original_bytes)
+
+    # Verify attribute delegation works before pickling
+    assert wrapper.width == original_image.width
+    assert wrapper.height == original_image.height
+    assert wrapper.mode == original_image.mode
+
+    # Pickle and unpickle (this would cause RecursionError before the fix)
+    pickled = pickle.dumps(wrapper)
+    unpickled = pickle.loads(pickled)
+
+    # Verify the unpickled object works correctly
+    assert unpickled.original_bytes == original_bytes
+    assert unpickled.media.width == original_image.width
+    assert unpickled.media.height == original_image.height
+
+    # Verify attribute delegation works after unpickling
+    assert unpickled.width == original_image.width
+    assert unpickled.height == original_image.height
+    assert unpickled.mode == original_image.mode
--- a/tests/multimodal/test_processing.py
+++ b/tests/multimodal/test_processing.py
@@ -1021,9 +1021,8 @@ def test_hf_processor_init_kwargs(
        DummyProcessor,  # type: ignore[arg-type]
        **inference_kwargs,
    )
-
-    for k, v in expected_kwargs.items():
-        assert getattr(processor, k) == v
+    assert processor.a == expected_kwargs["a"]
+    assert processor.b == expected_kwargs["b"]


 @pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"])  # Dummy

--- a/tests/multimodal/test_video.py
+++ b/tests/multimodal/test_video.py
@@ -299,3 +299,212 @@ def test_video_media_io_backend_env_var_fallback(monkeypatch: pytest.MonkeyPatch
        frames_missing, metadata_missing = videoio_missing.load_bytes(b"test")
        np.testing.assert_array_equal(frames_missing, FAKE_OUTPUT_2)
        assert metadata_missing["video_backend"] == "test_video_backend_override_2"
+
+
+# ============================================================================
+# Frame Recovery Tests
+# ============================================================================
+
+
+def test_video_recovery_simulated_failures(monkeypatch: pytest.MonkeyPatch):
+    """
+    Test that frame recovery correctly uses the next valid frame when
+    target frames fail to load.
+
+    Uses corrupted.mp4 and mocks VideoCapture.grab() to fail on specific
+    frame indices (in addition to the real corruption at frame 17), then
+    verifies recovery produces more frames.
+
+    The assertions check that recovery yields strictly more frames than no
+    recovery, that frame counts match the reported ``frames_indices``, and
+    that the recovered indices are in ascending order. Exact frame counts
+    are not asserted.
+    """
+    import cv2
+
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv")
+
+        # Load corrupted.mp4 (26 frames, frame 17 is genuinely corrupted)
+        video_path = ASSETS_DIR / "corrupted.mp4"
+        with open(video_path, "rb") as f:
+            video_data = f.read()
+
+        # Simulate additional failures on frames 3 and 10
+        # (in addition to the real corruption at frame 17)
+        fail_on_frames = {3, 10}
+
+        # Store original VideoCapture class
+        original_video_capture = cv2.VideoCapture
+
+        class MockVideoCapture:
+            """Wrapper that simulates grab() failures on specific frames."""
+
+            def __init__(self, *args, **kwargs):
+                self._cap = original_video_capture(*args, **kwargs)
+                # Index of the most recent grab() call; -1 means none yet.
+                self._current_frame = -1
+
+            def grab(self):
+                # Counts grab() calls. A simulated failure returns before
+                # delegating, so the wrapped capture is not touched on that call.
+                # NOTE(review): after a simulated failure the call counter and
+                # the wrapped capture's position may differ by one per failure,
+                # which would shift where the real corruption is hit -- confirm
+                # the frame numbers quoted in the comments below.
+                self._current_frame += 1
+                if self._current_frame in fail_on_frames:
+                    return False  # Simulate failure
+                return self._cap.grab()
+
+            # The remaining methods forward unchanged to the wrapped capture.
+            def retrieve(self):
+                return self._cap.retrieve()
+
+            def get(self, prop):
+                return self._cap.get(prop)
+
+            def isOpened(self):
+                return self._cap.isOpened()
+
+            def release(self):
+                return self._cap.release()
+
+        # Patch cv2.VideoCapture
+        m.setattr(cv2, "VideoCapture", MockVideoCapture)
+
+        loader = VIDEO_LOADER_REGISTRY.load("opencv")
+
+        # Use num_frames=8 which samples: [0, 3, 7, 10, 14, 17, 21, 25]
+        # Frame 3: mocked failure, recovery window [3, 7) -> use frame 4
+        # Frame 10: mocked failure, recovery window [10, 14) -> use frame 11
+        # Frame 17: real corruption, recovery window [17, 21) -> use frame 18
+
+        # Test WITHOUT recovery - should have fewer frames due to failures
+        frames_no_recovery, meta_no = loader.load_bytes(
+            video_data, num_frames=8, frame_recovery=False
+        )
+
+        # Test WITH recovery - should recover using next valid frames
+        frames_with_recovery, meta_yes = loader.load_bytes(
+            video_data, num_frames=8, frame_recovery=True
+        )
+
+        # With recovery should have MORE frames than without
+        # Without: 5 frames (3, 10, 17 all fail)
+        # With: 8 frames (all recovered)
+        # (only the strict inequality is asserted, not these exact counts)
+        assert frames_with_recovery.shape[0] > frames_no_recovery.shape[0], (
+            f"Recovery should produce more frames. "
+            f"Without: {frames_no_recovery.shape[0]}, "
+            f"With: {frames_with_recovery.shape[0]}"
+        )
+
+        # Verify metadata consistency
+        assert frames_no_recovery.shape[0] == len(meta_no["frames_indices"])
+        assert frames_with_recovery.shape[0] == len(meta_yes["frames_indices"])
+
+        # Verify temporal order is preserved
+        assert meta_yes["frames_indices"] == sorted(meta_yes["frames_indices"])
+
+
+def test_video_recovery_with_corrupted_file(monkeypatch: pytest.MonkeyPatch):
+    """
+    Test frame recovery with an actual corrupted video file using sparse sampling.
+
+    This test uses corrupted.mp4 which has genuine H.264 codec errors on
+    frame 17. With num_frames=8, the target frames are [0, 3, 7, 10, 14, 17, 21, 25].
+    Frame 17 is corrupted but frames 18-20 are readable, so recovery can use
+    frame 18 to fill in for the failed frame 17.
+
+    This test verifies:
+    1. Without recovery: frame 17 is skipped (7 frames loaded)
+    2. With recovery: frame 18 fills in for frame 17 (8 frames loaded)
+    3. Recovery produces MORE frames than without recovery
+    4. Metadata is consistent with loaded frames
+
+    Note that the no-recovery count of 7 is only implied by the strict
+    inequality against the asserted with-recovery count of 8; it is not
+    asserted directly. No VideoCapture mocking is used in this test.
+    """
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv")
+
+        corrupted_video_path = ASSETS_DIR / "corrupted.mp4"
+
+        with open(corrupted_video_path, "rb") as f:
+            video_data = f.read()
+
+        loader = VIDEO_LOADER_REGISTRY.load("opencv")
+
+        # Use num_frames=8 which makes frame 17 a target with recovery window [17, 21)
+        # Target frames: [0, 3, 7, 10, 14, 17, 21, 25]
+        # Frame 17 is corrupted, but frames 18-20 are readable for recovery
+
+        # Test without recovery - frame 17 will be skipped
+        frames_no_recovery, meta_no_recovery = loader.load_bytes(
+            video_data, num_frames=8, frame_recovery=False
+        )
+
+        # Test with recovery - frame 18 should fill in for frame 17
+        frames_with_recovery, meta_with_recovery = loader.load_bytes(
+            video_data, num_frames=8, frame_recovery=True
+        )
+
+        # Verify metadata consistency for both modes
+        assert frames_no_recovery.shape[0] == len(meta_no_recovery["frames_indices"]), (
+            "Frame count must match indices without recovery"
+        )
+        assert frames_with_recovery.shape[0] == len(
+            meta_with_recovery["frames_indices"]
+        ), "Frame count must match indices with recovery"
+
+        # KEY ASSERTION: Recovery should produce MORE frames than without recovery
+        # Without recovery: 7 frames (frame 17 skipped)
+        # With recovery: 8 frames (frame 18 used for frame 17)
+        assert frames_with_recovery.shape[0] > frames_no_recovery.shape[0], (
+            f"Recovery should produce more frames with sparse sampling. "
+            f"Got {frames_with_recovery.shape[0]} with recovery vs "
+            f"{frames_no_recovery.shape[0]} without"
+        )
+
+        # Verify we got all 8 requested frames with recovery
+        assert frames_with_recovery.shape[0] == 8, (
+            f"With recovery, should load all 8 requested frames. "
+            f"Got {frames_with_recovery.shape[0]}"
+        )
+
+        # Verify the video metadata is correct
+        expected_total_frames = 26
+        assert meta_with_recovery["total_num_frames"] == expected_total_frames, (
+            f"Expected {expected_total_frames} total frames in metadata"
+        )
+
+
+def test_video_recovery_dynamic_backend(monkeypatch: pytest.MonkeyPatch):
+    """
+    Test that frame_recovery works with the dynamic video backend.
+
+    The dynamic backend samples frames based on fps/duration rather than
+    loading all frames. This test verifies recovery works in that context.
+
+    Because it is not known here whether the sampled frames include the
+    corrupted one, the comparison is non-strict: recovery must yield at
+    least as many frames as no recovery.
+    """
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv_dynamic")
+
+        corrupted_video_path = ASSETS_DIR / "corrupted.mp4"
+
+        with open(corrupted_video_path, "rb") as f:
+            video_data = f.read()
+
+        loader = VIDEO_LOADER_REGISTRY.load("opencv_dynamic")
+
+        # Test without recovery
+        frames_no_recovery, meta_no = loader.load_bytes(
+            video_data, fps=2, max_duration=10, frame_recovery=False
+        )
+
+        # Test with frame_recovery enabled
+        frames_with_recovery, meta_with = loader.load_bytes(
+            video_data, fps=2, max_duration=10, frame_recovery=True
+        )
+
+        # Verify basic properties
+        assert frames_no_recovery.shape[0] > 0, (
+            "Should load some frames without recovery"
+        )
+        assert frames_with_recovery.shape[0] > 0, (
+            "Should load some frames with recovery"
+        )
+        assert "do_sample_frames" in meta_with
+        assert meta_with["do_sample_frames"] is False  # Dynamic backend always False
+        assert frames_with_recovery.shape[0] == len(meta_with["frames_indices"])
+
+        # Key assertion: recovery should help when corrupted frames are sampled.
+        # We expect recovery to produce at least as many frames as no recovery.
+        assert frames_with_recovery.shape[0] >= frames_no_recovery.shape[0], (
+            f"Recovery should produce at least as many frames. "
+            f"Got {frames_with_recovery.shape[0]} with recovery vs "
+            f"{frames_no_recovery.shape[0]} without"
+        )
--- a/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
+++ b/tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
@@ -7,7 +7,7 @@ import torch
 import torch.nn as nn

 from vllm.config import VllmConfig
-from vllm.model_executor.layers.pooler import DispatchPooler, Pooler
+from vllm.model_executor.layers.pooler import DispatchPooler
 from vllm.model_executor.models.gemma2 import Gemma2Model
 from vllm.model_executor.models.utils import WeightsMapper, maybe_prefix
 from vllm.sequence import IntermediateTensors
@@ -28,12 +28,7 @@ class MyGemma2Embedding(nn.Module):
        pooler_config = vllm_config.model_config.pooler_config
        assert pooler_config is not None

-        self.pooler = DispatchPooler(
-            {
-                "token_embed": Pooler.for_token_embed(pooler_config),
-                "embed": Pooler.for_embed(pooler_config),
-            }
-        )
+        self.pooler = DispatchPooler.for_embedding(pooler_config)

        self.make_empty_intermediate_tensors = (
            self.model.make_empty_intermediate_tensors

--- a/tests/plugins_tests/test_platform_plugins.py
+++ b/tests/plugins_tests/test_platform_plugins.py
@@ -31,7 +31,7 @@ def test_platform_plugins():
    )


-# def test_oot_custom_op(monkeypatch: pytest.MonkeyPatch):
+# def test_oot_custom_op(default_vllm_config, monkeypatch: pytest.MonkeyPatch):
 #     # simulate workload by running an example
 #     load_general_plugins()
 #     from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding

--- a/tests/quantization/test_auto_round.py
+++ b/tests/quantization/test_auto_round.py
@@ -26,7 +26,9 @@ MODELS = [
 )
 @pytest.mark.parametrize("model", MODELS)
 def test_auto_round(vllm_runner, model):
-    with vllm_runner(model, enforce_eager=True) as llm:
+    with vllm_runner(
+        model, enforce_eager=True, allow_deprecated_quantization=True
+    ) as llm:
        output = llm.generate_greedy(["The capital of France is"], max_tokens=8)
    assert output
    print(f"{output[0][1]}")
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -86,7 +86,7 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
        current_platform.is_rocm()
        and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL
    ):
-        pytest.skip(f"Skip model {model_path} as it is not support on ROCm.")
+        pytest.skip(f"Skip model {model_path} as it is not supported on ROCm.")

    with vllm_runner(model_path, enforce_eager=True) as llm:

@@ -164,7 +164,7 @@ def test_compressed_tensors_w8a8_logprobs(
        current_platform.is_rocm()
        and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL
    ):
-        pytest.skip(f"Skip model {model_path} as it is not support on ROCm.")
+        pytest.skip(f"Skip model {model_path} as it is not supported on ROCm.")

    if use_aiter:
        if model_path not in ROCM_AITER_SUPPORTED_INT8_MODEL:
@@ -234,7 +234,7 @@ def test_compressed_tensors_w8a8_dynamic_per_token(
        current_platform.is_rocm()
        and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL
    ):
-        pytest.skip(f"Skip model {model_path} as it is not support on ROCm.")
+        pytest.skip(f"Skip model {model_path} as it is not supported on ROCm.")

    if use_aiter:
        if model_path not in ROCM_AITER_SUPPORTED_INT8_MODEL:
@@ -651,6 +651,9 @@ def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4):
        assert output


+@pytest.mark.skipif(
+    not current_platform.is_cuda(), reason="This test is skipped on non-CUDA platform."
+)
 @pytest.mark.parametrize(
    "args",
    [
@@ -783,7 +786,10 @@ def test_compressed_tensors_fp8_block_enabled(vllm_runner):

            input_quant_op = qkv_proj.scheme.w8a8_block_fp8_linear.input_quant_op
            assert isinstance(input_quant_op, QuantFP8)
-            assert input_quant_op._forward_method == input_quant_op.forward_cuda
+            assert input_quant_op._forward_method in (
+                input_quant_op.forward_cuda,
+                input_quant_op.forward_hip,
+            )

        llm.apply_model(check_model)


--- a/tests/quantization/test_configs.py
+++ b/tests/quantization/test_configs.py
@@ -11,7 +11,8 @@ import pytest
 import os

 from vllm.config import ModelConfig
-from ..utils import models_path_prefix
+from vllm.platforms import current_platform
+from tests.utils import models_path_prefix


 @dataclass
@@ -25,21 +26,45 @@ MODEL_ARG_EXPTYPES = [
    # AUTOGPTQ
    # compat: autogptq <=0.7.1 is_marlin_format: bool
    # Model Serialized in Exllama Format.
-    # (os.path.join(models_path_prefix, "TheBloke/Llama-2-7B-Chat-GPTQ"), None, "gptq_marlin"),
-    # (os.path.join(models_path_prefix, "TheBloke/Llama-2-7B-Chat-GPTQ"), "marlin", "gptq_marlin"),
-    # (os.path.join(models_path_prefix, "TheBloke/Llama-2-7B-Chat-GPTQ"), "gptq", "gptq"),
+    (
+        os.path.join(models_path_prefix, "TheBloke/Llama-2-7B-Chat-GPTQ"),
+        None,
+        "gptq_marlin" if current_platform.is_cuda() else "gptq",
+    ),
+    (
+        os.path.join(models_path_prefix, "TheBloke/Llama-2-7B-Chat-GPTQ"),
+        "marlin",
+        "gptq_marlin" if current_platform.is_cuda() else "ERROR",
+    ),
+    (os.path.join(models_path_prefix, "TheBloke/Llama-2-7B-Chat-GPTQ"), "gptq", "gptq"),
    (os.path.join(models_path_prefix, "TheBloke/Llama-2-7B-Chat-GPTQ"), "awq", "ERROR"),
    # compat: autogptq >=0.8.0 use checkpoint_format: str

    # Model Serialized in Exllama Format.
-    (os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"), None, "gptq_marlin"),
-    (os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"), "marlin", "gptq_marlin"),
+    (
+        os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"),
+        None,
+        "gptq_marlin" if current_platform.is_cuda() else "gptq",
+    ),
+    (
+        os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"),
+        "marlin",
+        "gptq_marlin" if current_platform.is_cuda() else "ERROR",
+    ),
    (os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"), "gptq", "gptq"),
    (os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"), "awq", "ERROR"),
    # AUTOAWQ
-    # (os.path.join(models_path_prefix, "TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"), None, "awq_marlin"),
+    (
+        os.path.join(models_path_prefix, "TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"),
+        None,
+        "awq_marlin" if current_platform.is_cuda() else "awq",
+    ),
    (os.path.join(models_path_prefix, "TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"), "awq", "awq"),
-    # (os.path.join(models_path_prefix, "TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"), "marlin", "awq_marlin"),
+    (
+        os.path.join(models_path_prefix, "TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"),
+        "marlin",
+        "awq_marlin" if current_platform.is_cuda() else "ERROR",
+    ),
    (os.path.join(models_path_prefix, "TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"), "gptq", "ERROR"),
 ]


--- a/tests/quantization/test_cpu_offload.py
+++ b/tests/quantization/test_cpu_offload.py
@@ -67,7 +67,7 @@ def test_cpu_offload_compressed_tensors(monkeypatch):
    monkeypatch.setenv("VLLM_TEST_FORCE_LOAD_FORMAT", "auto")
    # Test wNa16
    compare_two_settings(
-        os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w4a16-channel-v2"),
+        os.path.join(models_path_prefix, "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"),
        ["--enforce_eager"],
        ["--enforce_eager", "--cpu-offload-gb", "1"],
        max_wait_seconds=480,

--- a/tests/quantization/test_cpu_wna16.py
+++ b/tests/quantization/test_cpu_wna16.py
@@ -10,6 +10,7 @@ if not current_platform.is_cpu():
 MODELS = [
    "TheBloke/TinyLlama-1.1B-Chat-v1.0-AWQ",
    "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ",  # with g_idx
+    "Qwen/Qwen1.5-0.5B-Chat-GPTQ-Int4",  # without g_idx
 ]
 DTYPE = ["bfloat16"]


--- a/tests/quantization/test_experts_int8.py
+++ b/tests/quantization/test_experts_int8.py
@@ -38,6 +38,10 @@ def test_model_experts_int8_startup(
    model_info.check_transformers_version(on_fail="skip")

    with vllm_runner(
-        model, dtype=dtype, enforce_eager=True, quantization="experts_int8"
+        model,
+        dtype=dtype,
+        enforce_eager=True,
+        quantization="experts_int8",
+        allow_deprecated_quantization=True,
    ) as vllm_model:
        vllm_model.generate_greedy(example_prompts, max_tokens)
--- a/tests/quantization/test_gptq_dynamic.py
+++ b/tests/quantization/test_gptq_dynamic.py
@@ -15,7 +15,9 @@ from vllm.model_executor.layers.quantization.gptq_marlin import GPTQMarlinLinear
 from vllm.model_executor.layers.quantization.utils.gptq_utils import (
    get_dynamic_override,
 )
+
 from ..utils import models_path_prefix
+from vllm.platforms import current_platform

 PROMPT = "On the surface of Mars, we found"

@@ -23,7 +25,10 @@ PROMPT = "On the surface of Mars, we found"
 # The second layer is quantized using bits=8, group_size=32
 # All other layers (layer index >= 2) are not quantized
 MODEL_QUANT = [
-    (os.path.join(models_path_prefix, "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue"), True),
+    (
+        os.path.join(models_path_prefix, "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue"),
+        current_platform.is_cuda(),
+    ),
    (
        os.path.join(models_path_prefix, "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse"),
        False,

--- a/tests/quantization/test_modelopt.py
+++ b/tests/quantization/test_modelopt.py
@@ -6,6 +6,7 @@ Run `pytest tests/quantization/test_modelopt.py`.
 """

 import os
+from typing import NoReturn

 import pytest
 import torch
@@ -19,6 +20,28 @@ def enable_pickle(monkeypatch):
    monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")


+def _skip(msg: str) -> NoReturn:  # skip the running test with `msg`; never returns
+    pytest.skip(msg)
+    raise RuntimeError(msg)  # fallback so NoReturn still holds if pytest.skip() ever returned
+
+
+def _snapshot_download_or_skip(model_id: str) -> str:  # returns snapshot_download()'s result
+    try:  # function-scope import: an unusable huggingface_hub skips the test
+        from huggingface_hub import snapshot_download
+    except Exception as e:  # pragma: no cover
+        _skip(f"huggingface_hub is required to download {model_id}: {e}")
+
+    try:
+        return snapshot_download(
+            repo_id=model_id,
+            repo_type="model",
+            # These checkpoints are already small; download full repo for simplicity.
+            allow_patterns=["*"],
+        )
+    except Exception as e:  # any download failure becomes a skip, not a test failure
+        _skip(f"Failed to download {model_id} from the HF Hub: {e}")
+
+
 @pytest.mark.skipif(
    not is_quant_method_supported("modelopt"),
    reason="ModelOpt FP8 is not supported on this GPU type.",
@@ -91,3 +114,121 @@ def test_modelopt_fp8_checkpoint_setup(vllm_runner):
        output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
        assert output
        print(f"ModelOpt FP8 output: {output}")
+
+
+@pytest.mark.skipif(
+    not is_quant_method_supported("modelopt"),
+    reason="ModelOpt FP8 is not supported on this GPU type.",
+)
+def test_modelopt_fp8_pc_pt_checkpoint_setup(vllm_runner):
+    """Test ModelOpt FP8_PER_CHANNEL_PER_TOKEN checkpoint setup."""
+    model_id = "CedricHwang/qwen2.5-0.5b-modelopt-fp8-pc-pt"
+    model_path = _snapshot_download_or_skip(model_id)  # skips the test if the download fails
+
+    with vllm_runner(model_path, quantization="modelopt", enforce_eager=True) as llm:
+
+        def check_model(model):  # handed to llm.apply_model() below
+            layer = model.model.layers[0]  # only the first layer is inspected
+
+            qkv_proj = layer.self_attn.qkv_proj
+            o_proj = layer.self_attn.o_proj
+            gate_up_proj = layer.mlp.gate_up_proj
+            down_proj = layer.mlp.down_proj
+
+            from vllm.model_executor.layers.quantization.modelopt import (
+                ModelOptFp8PcPtLinearMethod,
+            )
+
+            assert isinstance(qkv_proj.quant_method, ModelOptFp8PcPtLinearMethod)  # same class on all four
+            assert isinstance(o_proj.quant_method, ModelOptFp8PcPtLinearMethod)
+            assert isinstance(gate_up_proj.quant_method, ModelOptFp8PcPtLinearMethod)
+            assert isinstance(down_proj.quant_method, ModelOptFp8PcPtLinearMethod)
+
+            assert qkv_proj.weight.dtype == torch.float8_e4m3fn  # weights must be in fp8 e4m3fn
+            assert o_proj.weight.dtype == torch.float8_e4m3fn
+            assert gate_up_proj.weight.dtype == torch.float8_e4m3fn
+            assert down_proj.weight.dtype == torch.float8_e4m3fn
+
+            # Per-channel scales; activations are dynamically scaled per token.
+            assert hasattr(qkv_proj, "weight_scale")
+            assert qkv_proj.weight_scale.dtype == torch.float32
+            assert qkv_proj.weight_scale.dim() == 1
+            assert not hasattr(qkv_proj, "input_scale")  # the layer must carry no input_scale attribute
+
+            assert hasattr(o_proj, "weight_scale")
+            assert o_proj.weight_scale.dtype == torch.float32
+            assert o_proj.weight_scale.dim() == 1
+            assert not hasattr(o_proj, "input_scale")
+
+            assert hasattr(gate_up_proj, "weight_scale")
+            assert gate_up_proj.weight_scale.dtype == torch.float32
+            assert gate_up_proj.weight_scale.dim() == 1
+            assert not hasattr(gate_up_proj, "input_scale")
+
+            assert hasattr(down_proj, "weight_scale")
+            assert down_proj.weight_scale.dtype == torch.float32
+            assert down_proj.weight_scale.dim() == 1
+            assert not hasattr(down_proj, "input_scale")
+
+        llm.apply_model(check_model)
+
+        output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
+        assert output  # smoke check: generation returned something truthy
+        print(f"ModelOpt FP8_PER_CHANNEL_PER_TOKEN output: {output}")
+
+
+@pytest.mark.skipif(
+    not is_quant_method_supported("modelopt"),
+    reason="ModelOpt FP8 is not supported on this GPU type.",
+)
+def test_modelopt_fp8_pb_wo_checkpoint_setup(vllm_runner):
+    """Test ModelOpt FP8_PB_WO checkpoint setup."""
+    model_id = "CedricHwang/qwen2.5-0.5b-modelopt-fp8-pb-wo"
+    model_path = _snapshot_download_or_skip(model_id)  # skips the test if the download fails
+
+    with vllm_runner(model_path, quantization="modelopt", enforce_eager=True) as llm:
+
+        def check_model(model):  # handed to llm.apply_model() below
+            layer = model.model.layers[0]  # only the first layer is inspected
+
+            qkv_proj = layer.self_attn.qkv_proj
+            o_proj = layer.self_attn.o_proj
+            gate_up_proj = layer.mlp.gate_up_proj
+            down_proj = layer.mlp.down_proj
+
+            from vllm.model_executor.layers.quantization.modelopt import (
+                ModelOptFp8PbWoLinearMethod,
+            )
+
+            assert isinstance(qkv_proj.quant_method, ModelOptFp8PbWoLinearMethod)  # same class on all four
+            assert isinstance(o_proj.quant_method, ModelOptFp8PbWoLinearMethod)
+            assert isinstance(gate_up_proj.quant_method, ModelOptFp8PbWoLinearMethod)
+            assert isinstance(down_proj.quant_method, ModelOptFp8PbWoLinearMethod)
+
+            assert qkv_proj.weight.dtype == torch.float8_e4m3fn  # weights must be in fp8 e4m3fn
+            assert o_proj.weight.dtype == torch.float8_e4m3fn
+            assert gate_up_proj.weight.dtype == torch.float8_e4m3fn
+            assert down_proj.weight.dtype == torch.float8_e4m3fn
+
+            # Block scales; should be materialized as a 2D [out_blk, in_blk] tensor.
+            assert hasattr(qkv_proj, "weight_scale")
+            assert qkv_proj.weight_scale.dtype == torch.float32
+            assert qkv_proj.weight_scale.dim() == 2
+
+            assert hasattr(o_proj, "weight_scale")
+            assert o_proj.weight_scale.dtype == torch.float32
+            assert o_proj.weight_scale.dim() == 2
+
+            assert hasattr(gate_up_proj, "weight_scale")
+            assert gate_up_proj.weight_scale.dtype == torch.float32
+            assert gate_up_proj.weight_scale.dim() == 2
+
+            assert hasattr(down_proj, "weight_scale")
+            assert down_proj.weight_scale.dtype == torch.float32
+            assert down_proj.weight_scale.dim() == 2
+
+        llm.apply_model(check_model)
+
+        output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
+        assert output  # smoke check: generation returned something truthy
+        print(f"ModelOpt FP8_PB_WO output: {output}")
--- a/tests/quantization/test_rtn.py
+++ b/tests/quantization/test_rtn.py
@@ -30,6 +30,10 @@ def test_model_rtn_startup(
    max_tokens: int,
 ) -> None:
    with vllm_runner(
-        model, enforce_eager=True, dtype=dtype, quantization="rtn"
+        model,
+        enforce_eager=True,
+        dtype=dtype,
+        quantization="rtn",
+        allow_deprecated_quantization=True,
    ) as vllm_model:
        vllm_model.generate_greedy(example_prompts, max_tokens)
--- a/tests/quantization/test_torchao.py
+++ b/tests/quantization/test_torchao.py
@@ -6,11 +6,17 @@ import importlib.util
 import pytest
 import torch

+from vllm.platforms import current_platform
+
 DTYPE = ["bfloat16"]

 TORCHAO_AVAILABLE = importlib.util.find_spec("torchao") is not None


+@pytest.mark.skipif(
+    current_platform.is_rocm() and current_platform.is_fp8_fnuz(),
+    reason="Only fp8_fnuz supported on CDNA3 architecture",
+)
 @pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
 def test_pre_quantized_model(vllm_runner):
    with vllm_runner(

--- a/tests/quantization/untest_fp8.py
+++ b/tests/quantization/untest_fp8.py
@@ -38,7 +38,9 @@ MODELS = [
    reason="FP8 is not supported on this GPU type.",
 )
 @pytest.mark.parametrize("model_id", MODELS)
-@pytest.mark.parametrize("force_marlin", [False, True])
+@pytest.mark.parametrize(
+    "force_marlin", [False] if current_platform.is_rocm() else [False, True]
+)
 @pytest.mark.parametrize(
    "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]
 )
@@ -127,7 +129,9 @@ def test_kv_cache_model_load_and_run(
    reason="FP8 is not supported on this GPU type.",
 )
 @pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"])
-@pytest.mark.parametrize("force_marlin", [False, True])
+@pytest.mark.parametrize(
+    "force_marlin", [False] if current_platform.is_rocm() else [False, True]
+)
 @pytest.mark.parametrize(
    "use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]
 )
@@ -199,10 +203,10 @@ def test_scaled_fp8_quant(dtype) -> None:
    def quantize_ref(tensor, inv_scale):
        # The reference implementation that fully aligns to
        # the kernel being tested.
-        finfo = torch.finfo(torch.float8_e4m3fn)
+        finfo = torch.finfo(current_platform.fp8_dtype())
        scale = inv_scale.reciprocal()
        qweight = (tensor.to(torch.float32) * scale).clamp(min=finfo.min, max=finfo.max)
-        qweight = qweight.to(torch.float8_e4m3fn)
+        qweight = qweight.to(current_platform.fp8_dtype())
        return qweight

    def per_tensor_dequantize(tensor, inv_scale, dtype):
@@ -218,7 +222,7 @@ def test_scaled_fp8_quant(dtype) -> None:
    ref_y, inv_scale = ops.scaled_fp8_quant(x, None)
    ref_y = per_tensor_dequantize(ref_y, inv_scale, dtype)

-    # Reference dynamic quantizaton
+    # Reference dynamic quantization
    y = quantize_ref(x, inv_scale)
    torch.testing.assert_close(ref_y, per_tensor_dequantize(y, inv_scale, dtype))

@@ -269,6 +273,10 @@ def test_scaled_fp8_quant(dtype) -> None:
    )


+@pytest.mark.skipif(
+    current_platform.is_fp8_fnuz(),
+    reason="FP8 e4m3fn weight reloading is not supported on e4m3fnuz platforms",
+)
 @pytest.mark.parametrize("method_cls", [Fp8LinearMethod, Fp8MoEMethod])
 # FP8 weight reloading does not support online quantization
 @pytest.mark.parametrize("is_checkpoint_fp8_serialized", [True])  # skip False
@@ -279,8 +287,19 @@ def test_scaled_fp8_quant(dtype) -> None:
 # this is the case for marlin as well as per-tensor Fp8MoEMethod
 @pytest.mark.parametrize("use_marlin", [False])  # skip True
 def test_fp8_reloading(
-    method_cls, is_checkpoint_fp8_serialized, weight_block_size, use_marlin, dist_init
+    default_vllm_config,
+    method_cls,
+    is_checkpoint_fp8_serialized,
+    weight_block_size,
+    use_marlin,
+    dist_init,
+    monkeypatch,
 ):
+    # NOTE(rob): this test fails when using DeepGEMM because the
+    # shapes are invalid. Previously the test was passing because
+    # we set fp8_backend to None, which sidestepped the issue.
+    monkeypatch.setenv("VLLM_USE_DEEP_GEMM", "0")
+
    if is_checkpoint_fp8_serialized is False:
        pytest.skip("FP8 weight reloading does not support online quantization")

@@ -308,6 +327,7 @@ def test_fp8_reloading(
                params_dtype=torch.bfloat16,
                weight_loader=default_weight_loader,
            )
+            method.use_marlin = use_marlin

        else:
            layer = FusedMoE(
@@ -326,8 +346,6 @@ def test_fp8_reloading(
                weight_loader=default_weight_loader,
            )

-        method.use_marlin = use_marlin
-
    # capture weights format during loading
    original_metadata = [
        (name, param.shape, getattr(param, "weight_loader", default_weight_loader))

--- a/tests/quantization/untest_ptpc_fp8.py
+++ b/tests/quantization/untest_ptpc_fp8.py
@@ -6,18 +6,12 @@ Run `pytest tests/quantization/test_ptpc_fp8.py --forked`.
 """

 import pytest
-import torch

 from tests.quantization.utils import is_quant_method_supported
 from vllm.model_executor.layers.quantization.fp8 import Fp8KVCacheMethod
 from vllm.model_executor.layers.quantization.ptpc_fp8 import PTPCFp8LinearMethod
 from vllm.platforms import current_platform

-UNSUPPORTED_STR = (
-    "Currently torch._scaled_mm (hipBLASLt) rowwise gemm only "
-    "support output dtype of bfloat16. torch.float16 is specified."
-)
-

 @pytest.fixture(scope="function", autouse=True)
 def enable_pickle(monkeypatch):
@@ -30,24 +24,17 @@ def enable_pickle(monkeypatch):
    reason="PTPC FP8 is not supported on this GPU type.",
 )
 @pytest.mark.skipif(not current_platform.is_rocm(), reason="This test is for ROCm GPU.")
-@pytest.mark.parametrize("dtype", ["auto", "bfloat16", "float16"])
-@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8", "fp8_e4m3"])
+@pytest.mark.parametrize("dtype", ["bfloat16"])
+@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"])
 def test_ptpc_fp8_rocm(vllm_runner, dtype: str, kv_cache_dtype: str) -> None:
-    try:
-        llm = vllm_runner(
-            "facebook/opt-125m",
-            dtype=dtype,
-            quantization="ptpc_fp8",
-            enforce_eager=True,
-            kv_cache_dtype=kv_cache_dtype,
-        )
-    except AssertionError as e:
-        if str(e) == UNSUPPORTED_STR:
-            # If the error message matches, the test passes
-            return
-        else:
-            # If the error message does not match, re-raise the exception
-            raise
+    llm = vllm_runner(
+        "facebook/opt-125m",
+        dtype=dtype,
+        quantization="ptpc_fp8",
+        enforce_eager=True,
+        kv_cache_dtype=kv_cache_dtype,
+        allow_deprecated_quantization=True,
+    )

    with llm:

@@ -60,9 +47,9 @@ def test_ptpc_fp8_rocm(vllm_runner, dtype: str, kv_cache_dtype: str) -> None:
                assert attn._k_scale == 1.0
                assert attn._v_scale == 1.0

+            # For GPUs with hardware support, we keep weights in fp8
            if current_platform.has_device_capability(94):
-                # For GPUs with hardware support, we keep weights in fp8
-                assert fc1.weight.dtype == torch.float8_e4m3fnuz
+                assert fc1.weight.dtype == current_platform.fp8_dtype()

        llm.apply_model(check_model)


--- a/tests/quantization/utils.py
+++ b/tests/quantization/utils.py
@@ -10,6 +10,11 @@ def is_quant_method_supported(quant_method: str) -> bool:
    if not (current_platform.is_cuda() or current_platform.is_rocm()):
        return False

+    try:
+        current_platform.verify_quantization(quant_method)
+    except ValueError:
+        return False
+
    capability = current_platform.get_device_capability()
    assert capability is not None