Commit 7e63ef82 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.14.0' into v0.14.0-dev

parents 8cbcac5d b17039bc
......@@ -24,10 +24,12 @@ from vllm.multimodal.cache import (
)
from vllm.multimodal.hasher import MultiModalHasher
from vllm.multimodal.inputs import (
MultiModalFeatureSpec,
MultiModalFieldElem,
MultiModalKwargsItem,
MultiModalKwargsItems,
MultiModalSharedField,
PlaceholderRange,
)
from vllm.multimodal.processing import PromptInsertion
from vllm.utils.mem_constants import GiB_bytes, MiB_bytes
......@@ -518,3 +520,40 @@ def test_cache_eviction_shm_cache():
receiver_cache = ShmObjectStoreReceiverCache(vllm_config, mp.Lock())
_run_test_cache_eviction_shm(sender_cache, receiver_cache, base_item_size=MiB_bytes)
def test_processor_cache_shared_across_loras():
    """Check that the receiver cache is looked up by ``mm_hash``.

    Two features use different LoRA-prefixed identifiers but share one
    ``mm_hash``. The first carries the item data; the second is sent with
    ``data=None`` and must come back holding the same item.
    """
    cache = MultiModalReceiverCache(
        ModelConfig(
            model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
            mm_processor_cache_gb=1,
        )
    )

    shared_hash = "image_hash_abc123"
    payload = MultiModalKwargsItem.dummy("test_image", nbytes=1024)

    # Request under the first LoRA: ships the real data.
    with_data = MultiModalFeatureSpec(
        data=payload,
        modality="image",
        identifier=f"12345:{shared_hash}",
        mm_position=PlaceholderRange(offset=0, length=100),
        mm_hash=shared_hash,
    )
    cache.get_and_update_features([with_data])
    # The entry is stored under the bare hash, not the LoRA-prefixed identifier.
    assert shared_hash in cache._cache

    # Request under the second LoRA: same hash, no data attached.
    without_data = MultiModalFeatureSpec(
        data=None,
        modality="image",
        identifier=f"67890:{shared_hash}",
        mm_position=PlaceholderRange(offset=0, length=100),
        mm_hash=shared_hash,
    )
    cache.get_and_update_features([without_data])
    assert without_data.data == payload
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Unit tests for embedding shape validation.
Simple, fast unit tests that can run without server fixtures.
Run with: pytest tests/multimodal/test_embedding_shape_validation_unit.py -v
"""
import pytest
import torch
from vllm.multimodal.parse import (
AudioEmbeddingItems,
ImageEmbeddingItems,
)
class TestImageEmbedBasicValidation:
    """Shape checks exercised by constructing ``ImageEmbeddingItems``."""

    def test_valid_2d_tensor_accepted(self):
        """A (10, 768) tensor is accepted and counted as 10 items."""
        embeds = torch.randn(10, 768, dtype=torch.float32)

        assert ImageEmbeddingItems(embeds).get_count() == 10

    def test_valid_3d_tensor_accepted(self):
        """A (2, 10, 768) tensor is accepted and counted as 2 items."""
        embeds = torch.randn(2, 10, 768, dtype=torch.float32)

        assert ImageEmbeddingItems(embeds).get_count() == 2

    def test_valid_list_of_2d_tensors_accepted(self):
        """A list of two 2D tensors is accepted and counted as 2 items."""
        first = torch.randn(10, 768, dtype=torch.float32)
        second = torch.randn(15, 768, dtype=torch.float32)

        assert ImageEmbeddingItems([first, second]).get_count() == 2

    def test_1d_tensor_rejected(self):
        """A 1D tensor must raise ``ValueError`` mentioning the allowed ranks."""
        one_dim = torch.randn(768, dtype=torch.float32)

        with pytest.raises(ValueError) as raised:
            ImageEmbeddingItems(one_dim)

        message = str(raised.value)
        assert "must be 2D" in message or "3D" in message

    def test_4d_tensor_rejected(self):
        """A 4D tensor must raise ``ValueError`` mentioning the allowed ranks."""
        four_dim = torch.randn(1, 2, 10, 768, dtype=torch.float32)

        with pytest.raises(ValueError) as raised:
            ImageEmbeddingItems(four_dim)

        message = str(raised.value)
        assert "must be 2D" in message or "3D" in message

    def test_hidden_size_validation_correct_size(self):
        """A matching ``expected_hidden_size`` does not trigger an error."""
        hidden = 768
        embeds = torch.randn(10, hidden, dtype=torch.float32)

        items = ImageEmbeddingItems(embeds, expected_hidden_size=hidden)

        assert items.get_count() == 10

    def test_hidden_size_validation_wrong_size_rejected(self):
        """A mismatched hidden size raises and the message names both sizes."""
        expected, actual = 768, 4096
        embeds = torch.randn(10, actual, dtype=torch.float32)

        with pytest.raises(ValueError) as raised:
            ImageEmbeddingItems(embeds, expected_hidden_size=expected)

        message = str(raised.value)
        assert "hidden dimension mismatch" in message.lower()
        assert str(actual) in message
        assert str(expected) in message
class TestAudioEmbedBasicValidation:
    """Shape checks exercised by constructing ``AudioEmbeddingItems``."""

    def test_valid_2d_tensor_accepted(self):
        """A (10, 768) tensor is accepted and counted as 10 items."""
        embeds = torch.randn(10, 768, dtype=torch.float32)

        assert AudioEmbeddingItems(embeds).get_count() == 10

    def test_valid_3d_tensor_accepted(self):
        """A (2, 10, 768) tensor is accepted and counted as 2 items."""
        embeds = torch.randn(2, 10, 768, dtype=torch.float32)

        assert AudioEmbeddingItems(embeds).get_count() == 2

    def test_valid_list_of_2d_tensors_accepted(self):
        """A list of two 2D tensors is accepted and counted as 2 items."""
        first = torch.randn(10, 768, dtype=torch.float32)
        second = torch.randn(15, 768, dtype=torch.float32)

        assert AudioEmbeddingItems([first, second]).get_count() == 2

    def test_1d_tensor_rejected(self):
        """A 1D tensor must raise ``ValueError`` mentioning the allowed ranks."""
        one_dim = torch.randn(768, dtype=torch.float32)

        with pytest.raises(ValueError) as raised:
            AudioEmbeddingItems(one_dim)

        message = str(raised.value)
        assert "must be 2D" in message or "3D" in message

    def test_scalar_rejected(self):
        """A 0-D (scalar) tensor must raise ``ValueError``."""
        scalar = torch.tensor(1.0)

        with pytest.raises(ValueError):
            AudioEmbeddingItems(scalar)

    def test_hidden_size_validation_correct_size(self):
        """A matching ``expected_hidden_size`` does not trigger an error."""
        hidden = 768
        embeds = torch.randn(10, hidden, dtype=torch.float32)

        items = AudioEmbeddingItems(embeds, expected_hidden_size=hidden)

        assert items.get_count() == 10

    def test_hidden_size_validation_wrong_size_rejected(self):
        """A mismatched hidden size raises and the message names both sizes."""
        expected, actual = 768, 4096
        embeds = torch.randn(10, actual, dtype=torch.float32)

        with pytest.raises(ValueError) as raised:
            AudioEmbeddingItems(embeds, expected_hidden_size=expected)

        message = str(raised.value)
        assert "hidden dimension mismatch" in message.lower()
        assert str(actual) in message
        assert str(expected) in message
class TestShapeValidationDoSPrevention:
    """
    Tests for DoS prevention through shape validation.

    Each test builds an embedding whose hidden size differs from
    ``expected_hidden_size`` and checks that the items class raises
    ``ValueError`` at construction time.
    """

    def test_prevent_crash_from_wrong_shape_image_embeds(self):
        """
        Wrong hidden size in image embeddings is rejected on construction.

        The error message must name the mismatch and contain both the
        expected and the received hidden size.
        """
        expected_hidden_size = 768  # Typical model hidden size
        wrong_hidden_size = 4096  # Wrong size (e.g., Llama-sized)
        wrong_embedding = torch.randn(100, wrong_hidden_size, dtype=torch.float32)

        # Should be rejected at instantiation time, not during inference
        with pytest.raises(ValueError) as exc_info:
            ImageEmbeddingItems(
                wrong_embedding, expected_hidden_size=expected_hidden_size
            )

        error_msg = str(exc_info.value)
        assert "hidden dimension mismatch" in error_msg.lower()
        assert str(expected_hidden_size) in error_msg  # Expected
        assert str(wrong_hidden_size) in error_msg  # Received

    def test_prevent_crash_from_wrong_shape_audio_embeds(self):
        """
        Wrong hidden size in audio embeddings is rejected on construction.

        Mirrors the image variant, including the checks that both sizes
        appear in the error message.
        """
        expected_hidden_size = 768
        wrong_hidden_size = 4096
        wrong_embedding = torch.randn(100, wrong_hidden_size, dtype=torch.float32)

        with pytest.raises(ValueError) as exc_info:
            AudioEmbeddingItems(
                wrong_embedding, expected_hidden_size=expected_hidden_size
            )

        error_msg = str(exc_info.value)
        assert "hidden dimension mismatch" in error_msg.lower()
        # Same message-content checks as the image test and the audio
        # wrong-size test in this file, so both modalities are held to the
        # same standard.
        assert str(expected_hidden_size) in error_msg  # Expected
        assert str(wrong_hidden_size) in error_msg  # Received

    def test_extremely_large_hidden_size_rejected(self):
        """An embedding with a very large hidden size is rejected."""
        expected_hidden_size = 768
        huge_hidden_size = 100000  # Large but not extreme to avoid test OOM
        invalid_tensor = torch.randn(10, huge_hidden_size, dtype=torch.float32)

        with pytest.raises(ValueError) as exc_info:
            ImageEmbeddingItems(
                invalid_tensor, expected_hidden_size=expected_hidden_size
            )

        assert "hidden dimension mismatch" in str(exc_info.value).lower()

    def test_batch_with_mixed_hidden_sizes_rejected(self):
        """A list containing one tensor with the wrong hidden size is rejected."""
        expected_hidden_size = 768

        # One correct, one wrong
        batch = [
            torch.randn(10, expected_hidden_size, dtype=torch.float32),
            torch.randn(10, expected_hidden_size + 100, dtype=torch.float32),
        ]

        # The second entry has the wrong hidden size, so construction must fail
        with pytest.raises(ValueError) as exc_info:
            ImageEmbeddingItems(batch, expected_hidden_size=expected_hidden_size)

        assert "hidden dimension mismatch" in str(exc_info.value).lower()
if __name__ == "__main__":
    # Running the file directly hands it to pytest with verbose, short-traceback
    # output.
    cli_args = [__file__, "-v", "--tb=short"]
    pytest.main(cli_args)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pickle
from pathlib import Path
import numpy as np
import pytest
from PIL import Image, ImageChops
from vllm.multimodal.base import MediaWithBytes
from vllm.multimodal.image import ImageMediaIO, convert_image_mode
pytestmark = pytest.mark.cpu_test
......@@ -157,3 +159,34 @@ def test_rgba_background_color_validation():
ImageMediaIO(rgba_background_color=(0, 0, 0)) # Should not raise
ImageMediaIO(rgba_background_color=[255, 255, 255]) # Should not raise
ImageMediaIO(rgba_background_color=(128, 128, 128)) # Should not raise
def test_media_with_bytes_pickle_roundtrip():
    """MediaWithBytes must survive a pickle round trip.

    Regression test for a RecursionError raised when pickling/unpickling
    MediaWithBytes. See: https://github.com/vllm-project/vllm/issues/30818
    """
    image = Image.open(ASSETS_DIR / "image1.png").convert("RGB")
    raw = b"test_bytes_data"
    wrapped = MediaWithBytes(media=image, original_bytes=raw)

    # Attribute access on the wrapper is forwarded to the wrapped image.
    assert wrapped.width == image.width
    assert wrapped.height == image.height
    assert wrapped.mode == image.mode

    # The round trip itself is what used to raise RecursionError.
    restored = pickle.loads(pickle.dumps(wrapped))

    # Both payloads are intact after the round trip.
    assert restored.original_bytes == raw
    assert restored.media.width == image.width
    assert restored.media.height == image.height

    # Forwarding still works on the restored object.
    assert restored.width == image.width
    assert restored.height == image.height
    assert restored.mode == image.mode
......@@ -1021,9 +1021,8 @@ def test_hf_processor_init_kwargs(
DummyProcessor, # type: ignore[arg-type]
**inference_kwargs,
)
for k, v in expected_kwargs.items():
assert getattr(processor, k) == v
assert processor.a == expected_kwargs["a"]
assert processor.b == expected_kwargs["b"]
@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"]) # Dummy
......
......@@ -299,3 +299,212 @@ def test_video_media_io_backend_env_var_fallback(monkeypatch: pytest.MonkeyPatch
frames_missing, metadata_missing = videoio_missing.load_bytes(b"test")
np.testing.assert_array_equal(frames_missing, FAKE_OUTPUT_2)
assert metadata_missing["video_backend"] == "test_video_backend_override_2"
# ============================================================================
# Frame Recovery Tests
# ============================================================================
def test_video_recovery_simulated_failures(monkeypatch: pytest.MonkeyPatch):
    """
    Frame recovery yields more frames than no recovery under injected failures.

    Loads corrupted.mp4 through the "opencv" loader while ``cv2.VideoCapture``
    is replaced by a wrapper whose ``grab()`` returns False on frames 3 and 10
    (on top of the file's real corruption at frame 17).
    """
    import cv2

    with monkeypatch.context() as m:
        m.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv")

        # corrupted.mp4: 26 frames, frame 17 is genuinely corrupted
        source_path = ASSETS_DIR / "corrupted.mp4"
        with open(source_path, "rb") as fh:
            video_data = fh.read()

        # Extra frame indices on which grab() is forced to fail
        injected_failures = {3, 10}

        # Keep a handle on the real class before it is patched out
        real_capture_cls = cv2.VideoCapture

        class MockVideoCapture:
            """Delegating capture whose grab() fails on the injected frames."""

            def __init__(self, *args, **kwargs):
                self._cap = real_capture_cls(*args, **kwargs)
                self._current_frame = -1

            def grab(self):
                self._current_frame += 1
                if self._current_frame in injected_failures:
                    # Simulated failure: the real grab() is not called
                    return False
                return self._cap.grab()

            def retrieve(self):
                return self._cap.retrieve()

            def get(self, prop):
                return self._cap.get(prop)

            def isOpened(self):
                return self._cap.isOpened()

            def release(self):
                return self._cap.release()

        m.setattr(cv2, "VideoCapture", MockVideoCapture)

        loader = VIDEO_LOADER_REGISTRY.load("opencv")

        # num_frames=8 samples [0, 3, 7, 10, 14, 17, 21, 25]:
        #   frame 3  (mocked) -> recovery window [3, 7)   -> frame 4
        #   frame 10 (mocked) -> recovery window [10, 14) -> frame 11
        #   frame 17 (real)   -> recovery window [17, 21) -> frame 18
        def load(recover):
            return loader.load_bytes(
                video_data, num_frames=8, frame_recovery=recover
            )

        plain_frames, plain_meta = load(False)
        recovered_frames, recovered_meta = load(True)

        plain_count = plain_frames.shape[0]
        recovered_count = recovered_frames.shape[0]

        # Expected: 5 frames without recovery (3, 10, 17 fail), 8 with recovery
        assert recovered_count > plain_count, (
            f"Recovery should produce more frames. "
            f"Without: {plain_count}, "
            f"With: {recovered_count}"
        )

        # Frame arrays and reported indices must agree in length
        assert plain_count == len(plain_meta["frames_indices"])
        assert recovered_count == len(recovered_meta["frames_indices"])

        # Recovered indices stay in temporal order
        recovered_indices = recovered_meta["frames_indices"]
        assert recovered_indices == sorted(recovered_indices)
def test_video_recovery_with_corrupted_file(monkeypatch: pytest.MonkeyPatch):
    """
    Frame recovery on a genuinely corrupted file with sparse sampling.

    corrupted.mp4 has H.264 codec errors on frame 17. With num_frames=8 the
    targets are [0, 3, 7, 10, 14, 17, 21, 25]; frames 18-20 are readable, so
    recovery can substitute frame 18 for frame 17.

    Checks, in order:
    1. Frame count matches the reported indices in both modes
    2. Recovery yields more frames than no recovery
    3. Recovery yields all 8 requested frames
    4. Metadata reports 26 total frames
    """
    with monkeypatch.context() as m:
        m.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv")

        source_path = ASSETS_DIR / "corrupted.mp4"
        with open(source_path, "rb") as fh:
            video_data = fh.read()

        loader = VIDEO_LOADER_REGISTRY.load("opencv")

        # Frame 17 is a sampling target with recovery window [17, 21)
        def load(recover):
            return loader.load_bytes(
                video_data, num_frames=8, frame_recovery=recover
            )

        # Without recovery frame 17 is skipped; with it, frame 18 fills in
        plain_frames, plain_meta = load(False)
        recovered_frames, recovered_meta = load(True)

        plain_count = plain_frames.shape[0]
        recovered_count = recovered_frames.shape[0]

        assert plain_count == len(plain_meta["frames_indices"]), (
            "Frame count must match indices without recovery"
        )
        assert recovered_count == len(recovered_meta["frames_indices"]), (
            "Frame count must match indices with recovery"
        )

        # Expected: 7 frames without recovery, 8 with
        assert recovered_count > plain_count, (
            f"Recovery should produce more frames with sparse sampling. "
            f"Got {recovered_count} with recovery vs "
            f"{plain_count} without"
        )

        assert recovered_count == 8, (
            f"With recovery, should load all 8 requested frames. "
            f"Got {recovered_count}"
        )

        expected_total_frames = 26
        assert recovered_meta["total_num_frames"] == expected_total_frames, (
            f"Expected {expected_total_frames} total frames in metadata"
        )
def test_video_recovery_dynamic_backend(monkeypatch: pytest.MonkeyPatch):
    """
    frame_recovery is accepted by the "opencv_dynamic" backend.

    Loads corrupted.mp4 with fps=2 and max_duration=10, once with and once
    without recovery, and checks that recovery never yields fewer frames.
    """
    with monkeypatch.context() as m:
        m.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv_dynamic")

        source_path = ASSETS_DIR / "corrupted.mp4"
        with open(source_path, "rb") as fh:
            video_data = fh.read()

        loader = VIDEO_LOADER_REGISTRY.load("opencv_dynamic")

        def load(recover):
            return loader.load_bytes(
                video_data, fps=2, max_duration=10, frame_recovery=recover
            )

        plain_frames, meta_no = load(False)
        recovered_frames, meta_with = load(True)

        plain_count = plain_frames.shape[0]
        recovered_count = recovered_frames.shape[0]

        # Both modes must return something
        assert plain_count > 0, (
            "Should load some frames without recovery"
        )
        assert recovered_count > 0, (
            "Should load some frames with recovery"
        )

        # This backend is expected to report do_sample_frames as False
        assert "do_sample_frames" in meta_with
        assert meta_with["do_sample_frames"] is False
        assert recovered_count == len(meta_with["frames_indices"])

        # Recovery may not help here, but it must not lose frames
        assert recovered_count >= plain_count, (
            f"Recovery should produce at least as many frames. "
            f"Got {recovered_count} with recovery vs "
            f"{plain_count} without"
        )
......@@ -7,7 +7,7 @@ import torch
import torch.nn as nn
from vllm.config import VllmConfig
from vllm.model_executor.layers.pooler import DispatchPooler, Pooler
from vllm.model_executor.layers.pooler import DispatchPooler
from vllm.model_executor.models.gemma2 import Gemma2Model
from vllm.model_executor.models.utils import WeightsMapper, maybe_prefix
from vllm.sequence import IntermediateTensors
......@@ -28,12 +28,7 @@ class MyGemma2Embedding(nn.Module):
pooler_config = vllm_config.model_config.pooler_config
assert pooler_config is not None
self.pooler = DispatchPooler(
{
"token_embed": Pooler.for_token_embed(pooler_config),
"embed": Pooler.for_embed(pooler_config),
}
)
self.pooler = DispatchPooler.for_embedding(pooler_config)
self.make_empty_intermediate_tensors = (
self.model.make_empty_intermediate_tensors
......
......@@ -31,7 +31,7 @@ def test_platform_plugins():
)
# def test_oot_custom_op(monkeypatch: pytest.MonkeyPatch):
# def test_oot_custom_op(default_vllm_config, monkeypatch: pytest.MonkeyPatch):
# # simulate workload by running an example
# load_general_plugins()
# from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
......
......@@ -26,7 +26,9 @@ MODELS = [
)
@pytest.mark.parametrize("model", MODELS)
def test_auto_round(vllm_runner, model):
with vllm_runner(model, enforce_eager=True) as llm:
with vllm_runner(
model, enforce_eager=True, allow_deprecated_quantization=True
) as llm:
output = llm.generate_greedy(["The capital of France is"], max_tokens=8)
assert output
print(f"{output[0][1]}")
......@@ -86,7 +86,7 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
current_platform.is_rocm()
and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL
):
pytest.skip(f"Skip model {model_path} as it is not support on ROCm.")
pytest.skip(f"Skip model {model_path} as it is not supported on ROCm.")
with vllm_runner(model_path, enforce_eager=True) as llm:
......@@ -164,7 +164,7 @@ def test_compressed_tensors_w8a8_logprobs(
current_platform.is_rocm()
and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL
):
pytest.skip(f"Skip model {model_path} as it is not support on ROCm.")
pytest.skip(f"Skip model {model_path} as it is not supported on ROCm.")
if use_aiter:
if model_path not in ROCM_AITER_SUPPORTED_INT8_MODEL:
......@@ -234,7 +234,7 @@ def test_compressed_tensors_w8a8_dynamic_per_token(
current_platform.is_rocm()
and model_path not in ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL
):
pytest.skip(f"Skip model {model_path} as it is not support on ROCm.")
pytest.skip(f"Skip model {model_path} as it is not supported on ROCm.")
if use_aiter:
if model_path not in ROCM_AITER_SUPPORTED_INT8_MODEL:
......@@ -651,6 +651,9 @@ def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4):
assert output
@pytest.mark.skipif(
not current_platform.is_cuda(), reason="This test is skipped on non-CUDA platform."
)
@pytest.mark.parametrize(
"args",
[
......@@ -783,7 +786,10 @@ def test_compressed_tensors_fp8_block_enabled(vllm_runner):
input_quant_op = qkv_proj.scheme.w8a8_block_fp8_linear.input_quant_op
assert isinstance(input_quant_op, QuantFP8)
assert input_quant_op._forward_method == input_quant_op.forward_cuda
assert input_quant_op._forward_method in (
input_quant_op.forward_cuda,
input_quant_op.forward_hip,
)
llm.apply_model(check_model)
......
......@@ -11,7 +11,8 @@ import pytest
import os
from vllm.config import ModelConfig
from ..utils import models_path_prefix
from vllm.platforms import current_platform
from tests.utils import models_path_prefix
@dataclass
......@@ -25,21 +26,45 @@ MODEL_ARG_EXPTYPES = [
# AUTOGPTQ
# compat: autogptq <=0.7.1 is_marlin_format: bool
# Model Serialized in Exllama Format.
# (os.path.join(models_path_prefix, "TheBloke/Llama-2-7B-Chat-GPTQ"), None, "gptq_marlin"),
# (os.path.join(models_path_prefix, "TheBloke/Llama-2-7B-Chat-GPTQ"), "marlin", "gptq_marlin"),
# (os.path.join(models_path_prefix, "TheBloke/Llama-2-7B-Chat-GPTQ"), "gptq", "gptq"),
(
os.path.join(models_path_prefix, "TheBloke/Llama-2-7B-Chat-GPTQ"),
None,
"gptq_marlin" if current_platform.is_cuda() else "gptq",
),
(
os.path.join(models_path_prefix, "TheBloke/Llama-2-7B-Chat-GPTQ"),
"marlin",
"gptq_marlin" if current_platform.is_cuda() else "ERROR",
),
(os.path.join(models_path_prefix, "TheBloke/Llama-2-7B-Chat-GPTQ"), "gptq", "gptq"),
(os.path.join(models_path_prefix, "TheBloke/Llama-2-7B-Chat-GPTQ"), "awq", "ERROR"),
# compat: autogptq >=0.8.0 use checkpoint_format: str
# Model Serialized in Exllama Format.
(os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"), None, "gptq_marlin"),
(os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"), "marlin", "gptq_marlin"),
(
os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"),
None,
"gptq_marlin" if current_platform.is_cuda() else "gptq",
),
(
os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"),
"marlin",
"gptq_marlin" if current_platform.is_cuda() else "ERROR",
),
(os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"), "gptq", "gptq"),
(os.path.join(models_path_prefix, "LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"), "awq", "ERROR"),
# AUTOAWQ
# (os.path.join(models_path_prefix, "TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"), None, "awq_marlin"),
(
os.path.join(models_path_prefix, "TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"),
None,
"awq_marlin" if current_platform.is_cuda() else "awq",
),
(os.path.join(models_path_prefix, "TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"), "awq", "awq"),
# (os.path.join(models_path_prefix, "TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"), "marlin", "awq_marlin"),
(
os.path.join(models_path_prefix, "TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"),
"marlin",
"awq_marlin" if current_platform.is_cuda() else "ERROR",
),
(os.path.join(models_path_prefix, "TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"), "gptq", "ERROR"),
]
......
......@@ -67,7 +67,7 @@ def test_cpu_offload_compressed_tensors(monkeypatch):
monkeypatch.setenv("VLLM_TEST_FORCE_LOAD_FORMAT", "auto")
# Test wNa16
compare_two_settings(
os.path.join(models_path_prefix, "nm-testing/tinyllama-oneshot-w4a16-channel-v2"),
os.path.join(models_path_prefix, "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"),
["--enforce_eager"],
["--enforce_eager", "--cpu-offload-gb", "1"],
max_wait_seconds=480,
......
......@@ -10,6 +10,7 @@ if not current_platform.is_cpu():
MODELS = [
"TheBloke/TinyLlama-1.1B-Chat-v1.0-AWQ",
"TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", # with g_idx
"Qwen/Qwen1.5-0.5B-Chat-GPTQ-Int4", # without g_idx
]
DTYPE = ["bfloat16"]
......
......@@ -38,6 +38,10 @@ def test_model_experts_int8_startup(
model_info.check_transformers_version(on_fail="skip")
with vllm_runner(
model, dtype=dtype, enforce_eager=True, quantization="experts_int8"
model,
dtype=dtype,
enforce_eager=True,
quantization="experts_int8",
allow_deprecated_quantization=True,
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)
......@@ -15,7 +15,9 @@ from vllm.model_executor.layers.quantization.gptq_marlin import GPTQMarlinLinear
from vllm.model_executor.layers.quantization.utils.gptq_utils import (
get_dynamic_override,
)
from ..utils import models_path_prefix
from vllm.platforms import current_platform
PROMPT = "On the surface of Mars, we found"
......@@ -23,7 +25,10 @@ PROMPT = "On the surface of Mars, we found"
# The second layer is quantized using bits=8, group_size=32
# All other layers (layer index >= 2) are not quantized
MODEL_QUANT = [
(os.path.join(models_path_prefix, "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue"), True),
(
os.path.join(models_path_prefix, "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue"),
current_platform.is_cuda(),
),
(
os.path.join(models_path_prefix, "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse"),
False,
......
......@@ -6,6 +6,7 @@ Run `pytest tests/quantization/test_modelopt.py`.
"""
import os
from typing import NoReturn
import pytest
import torch
......@@ -19,6 +20,28 @@ def enable_pickle(monkeypatch):
monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
def _skip(msg: str) -> NoReturn:
    """Skip the current test with ``msg``; this helper never returns normally."""
    pytest.skip(msg)
    # Fallback so the NoReturn annotation holds even if pytest.skip() were to
    # return instead of raising.
    raise RuntimeError(msg)
def _snapshot_download_or_skip(model_id: str) -> str:
    """Download ``model_id`` via ``snapshot_download`` and return its result.

    The calling test is skipped (through ``_skip``) when ``huggingface_hub``
    cannot be imported or when the download raises.
    """
    try:
        from huggingface_hub import snapshot_download
    except Exception as import_err:  # pragma: no cover
        _skip(f"huggingface_hub is required to download {model_id}: {import_err}")

    # These checkpoints are already small; download full repo for simplicity.
    download_kwargs = {
        "repo_id": model_id,
        "repo_type": "model",
        "allow_patterns": ["*"],
    }
    try:
        return snapshot_download(**download_kwargs)
    except Exception as download_err:
        _skip(f"Failed to download {model_id} from the HF Hub: {download_err}")
@pytest.mark.skipif(
not is_quant_method_supported("modelopt"),
reason="ModelOpt FP8 is not supported on this GPU type.",
......@@ -91,3 +114,121 @@ def test_modelopt_fp8_checkpoint_setup(vllm_runner):
output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
assert output
print(f"ModelOpt FP8 output: {output}")
@pytest.mark.skipif(
    not is_quant_method_supported("modelopt"),
    reason="ModelOpt FP8 is not supported on this GPU type.",
)
def test_modelopt_fp8_pc_pt_checkpoint_setup(vllm_runner):
    """Load a ModelOpt FP8_PER_CHANNEL_PER_TOKEN checkpoint and inspect layer 0.

    Every projection of the first decoder layer must use
    ``ModelOptFp8PcPtLinearMethod``, hold float8_e4m3fn weights, expose a 1-D
    float32 ``weight_scale`` and have no ``input_scale`` attribute.
    """
    model_path = _snapshot_download_or_skip(
        "CedricHwang/qwen2.5-0.5b-modelopt-fp8-pc-pt"
    )

    with vllm_runner(model_path, quantization="modelopt", enforce_eager=True) as llm:

        def check_model(model):
            first_layer = model.model.layers[0]
            projections = (
                first_layer.self_attn.qkv_proj,
                first_layer.self_attn.o_proj,
                first_layer.mlp.gate_up_proj,
                first_layer.mlp.down_proj,
            )

            from vllm.model_executor.layers.quantization.modelopt import (
                ModelOptFp8PcPtLinearMethod,
            )

            for proj in projections:
                assert isinstance(proj.quant_method, ModelOptFp8PcPtLinearMethod)

            for proj in projections:
                assert proj.weight.dtype == torch.float8_e4m3fn

            # Per-channel scales; activations are dynamically scaled per token.
            for proj in projections:
                assert hasattr(proj, "weight_scale")
                assert proj.weight_scale.dtype == torch.float32
                assert proj.weight_scale.dim() == 1
                assert not hasattr(proj, "input_scale")

        llm.apply_model(check_model)

        # Short generation to confirm the loaded model actually runs
        output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
        assert output
        print(f"ModelOpt FP8_PER_CHANNEL_PER_TOKEN output: {output}")
@pytest.mark.skipif(
    not is_quant_method_supported("modelopt"),
    reason="ModelOpt FP8 is not supported on this GPU type.",
)
def test_modelopt_fp8_pb_wo_checkpoint_setup(vllm_runner):
    """Load a ModelOpt FP8_PB_WO checkpoint and inspect layer 0.

    Every projection of the first decoder layer must use
    ``ModelOptFp8PbWoLinearMethod``, hold float8_e4m3fn weights and expose a
    2-D float32 ``weight_scale``.
    """
    model_path = _snapshot_download_or_skip(
        "CedricHwang/qwen2.5-0.5b-modelopt-fp8-pb-wo"
    )

    with vllm_runner(model_path, quantization="modelopt", enforce_eager=True) as llm:

        def check_model(model):
            first_layer = model.model.layers[0]
            projections = (
                first_layer.self_attn.qkv_proj,
                first_layer.self_attn.o_proj,
                first_layer.mlp.gate_up_proj,
                first_layer.mlp.down_proj,
            )

            from vllm.model_executor.layers.quantization.modelopt import (
                ModelOptFp8PbWoLinearMethod,
            )

            for proj in projections:
                assert isinstance(proj.quant_method, ModelOptFp8PbWoLinearMethod)

            for proj in projections:
                assert proj.weight.dtype == torch.float8_e4m3fn

            # Block scales; should be materialized as a 2D [out_blk, in_blk] tensor.
            for proj in projections:
                assert hasattr(proj, "weight_scale")
                assert proj.weight_scale.dtype == torch.float32
                assert proj.weight_scale.dim() == 2

        llm.apply_model(check_model)

        # Short generation to confirm the loaded model actually runs
        output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
        assert output
        print(f"ModelOpt FP8_PB_WO output: {output}")
......@@ -30,6 +30,10 @@ def test_model_rtn_startup(
max_tokens: int,
) -> None:
with vllm_runner(
model, enforce_eager=True, dtype=dtype, quantization="rtn"
model,
enforce_eager=True,
dtype=dtype,
quantization="rtn",
allow_deprecated_quantization=True,
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)
......@@ -6,11 +6,17 @@ import importlib.util
import pytest
import torch
from vllm.platforms import current_platform
DTYPE = ["bfloat16"]
TORCHAO_AVAILABLE = importlib.util.find_spec("torchao") is not None
@pytest.mark.skipif(
current_platform.is_rocm() and current_platform.is_fp8_fnuz(),
reason="Only fp8_fnuz supported on CDNA3 architecture",
)
@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
def test_pre_quantized_model(vllm_runner):
with vllm_runner(
......
......@@ -38,7 +38,9 @@ MODELS = [
reason="FP8 is not supported on this GPU type.",
)
@pytest.mark.parametrize("model_id", MODELS)
@pytest.mark.parametrize("force_marlin", [False, True])
@pytest.mark.parametrize(
"force_marlin", [False] if current_platform.is_rocm() else [False, True]
)
@pytest.mark.parametrize(
"use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]
)
......@@ -127,7 +129,9 @@ def test_kv_cache_model_load_and_run(
reason="FP8 is not supported on this GPU type.",
)
@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"])
@pytest.mark.parametrize("force_marlin", [False, True])
@pytest.mark.parametrize(
"force_marlin", [False] if current_platform.is_rocm() else [False, True]
)
@pytest.mark.parametrize(
"use_rocm_aiter", [True, False] if current_platform.is_rocm() else [False]
)
......@@ -199,10 +203,10 @@ def test_scaled_fp8_quant(dtype) -> None:
def quantize_ref(tensor, inv_scale):
# Reference FP8 quantizer: multiplies `tensor` (converted to float32) by the
# reciprocal of `inv_scale`, clamps the result to [finfo.min, finfo.max],
# casts it to an FP8 dtype and returns it.
# The reference implementation that fully aligns to
# the kernel being tested.
# NOTE(review): the next two assignments look like the removed/added pair of
# a diff hunk; read sequentially, only the second `finfo` value is used below
# -- confirm which one is intended.
finfo = torch.finfo(torch.float8_e4m3fn)
finfo = torch.finfo(current_platform.fp8_dtype())
scale = inv_scale.reciprocal()
qweight = (tensor.to(torch.float32) * scale).clamp(min=finfo.min, max=finfo.max)
# NOTE(review): likewise a removed/added pair; as written the tensor is cast
# to torch.float8_e4m3fn first and then to current_platform.fp8_dtype() --
# confirm that the intermediate cast is not meant to remain.
qweight = qweight.to(torch.float8_e4m3fn)
qweight = qweight.to(current_platform.fp8_dtype())
return qweight
def per_tensor_dequantize(tensor, inv_scale, dtype):
......@@ -218,7 +222,7 @@ def test_scaled_fp8_quant(dtype) -> None:
ref_y, inv_scale = ops.scaled_fp8_quant(x, None)
ref_y = per_tensor_dequantize(ref_y, inv_scale, dtype)
# Reference dynamic quantizaton
# Reference dynamic quantization
y = quantize_ref(x, inv_scale)
torch.testing.assert_close(ref_y, per_tensor_dequantize(y, inv_scale, dtype))
......@@ -269,6 +273,10 @@ def test_scaled_fp8_quant(dtype) -> None:
)
@pytest.mark.skipif(
current_platform.is_fp8_fnuz(),
reason="FP8 e4m3fn weight reloading is not supported on e4m3fnuz platforms",
)
@pytest.mark.parametrize("method_cls", [Fp8LinearMethod, Fp8MoEMethod])
# FP8 weight reloading does not support online quantization
@pytest.mark.parametrize("is_checkpoint_fp8_serialized", [True]) # skip False
......@@ -279,8 +287,19 @@ def test_scaled_fp8_quant(dtype) -> None:
# this is the case for marlin as well as per-tensor Fp8MoEMethod
@pytest.mark.parametrize("use_marlin", [False]) # skip True
def test_fp8_reloading(
method_cls, is_checkpoint_fp8_serialized, weight_block_size, use_marlin, dist_init
default_vllm_config,
method_cls,
is_checkpoint_fp8_serialized,
weight_block_size,
use_marlin,
dist_init,
monkeypatch,
):
# NOTE(rob): this test fails when using DeepGEMM because the
# shapes are invalid. Previously the test was passing because
# we set fp8_backend to None, which sidestepped the issue.
monkeypatch.setenv("VLLM_USE_DEEP_GEMM", "0")
if is_checkpoint_fp8_serialized is False:
pytest.skip("FP8 weight reloading does not support online quantization")
......@@ -308,6 +327,7 @@ def test_fp8_reloading(
params_dtype=torch.bfloat16,
weight_loader=default_weight_loader,
)
method.use_marlin = use_marlin
else:
layer = FusedMoE(
......@@ -326,8 +346,6 @@ def test_fp8_reloading(
weight_loader=default_weight_loader,
)
method.use_marlin = use_marlin
# capture weights format during loading
original_metadata = [
(name, param.shape, getattr(param, "weight_loader", default_weight_loader))
......
......@@ -6,18 +6,12 @@ Run `pytest tests/quantization/test_ptpc_fp8.py --forked`.
"""
import pytest
import torch
from tests.quantization.utils import is_quant_method_supported
from vllm.model_executor.layers.quantization.fp8 import Fp8KVCacheMethod
from vllm.model_executor.layers.quantization.ptpc_fp8 import PTPCFp8LinearMethod
from vllm.platforms import current_platform
UNSUPPORTED_STR = (
"Currently torch._scaled_mm (hipBLASLt) rowwise gemm only "
"support output dtype of bfloat16. torch.float16 is specified."
)
@pytest.fixture(scope="function", autouse=True)
def enable_pickle(monkeypatch):
......@@ -30,24 +24,17 @@ def enable_pickle(monkeypatch):
reason="PTPC FP8 is not supported on this GPU type.",
)
@pytest.mark.skipif(not current_platform.is_rocm(), reason="This test is for ROCm GPU.")
@pytest.mark.parametrize("dtype", ["auto", "bfloat16", "float16"])
@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8", "fp8_e4m3"])
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"])
def test_ptpc_fp8_rocm(vllm_runner, dtype: str, kv_cache_dtype: str) -> None:
try:
llm = vllm_runner(
"facebook/opt-125m",
dtype=dtype,
quantization="ptpc_fp8",
enforce_eager=True,
kv_cache_dtype=kv_cache_dtype,
)
except AssertionError as e:
if str(e) == UNSUPPORTED_STR:
# If the error message matches, the test passes
return
else:
# If the error message does not match, re-raise the exception
raise
llm = vllm_runner(
"facebook/opt-125m",
dtype=dtype,
quantization="ptpc_fp8",
enforce_eager=True,
kv_cache_dtype=kv_cache_dtype,
allow_deprecated_quantization=True,
)
with llm:
......@@ -60,9 +47,9 @@ def test_ptpc_fp8_rocm(vllm_runner, dtype: str, kv_cache_dtype: str) -> None:
assert attn._k_scale == 1.0
assert attn._v_scale == 1.0
# For GPUs with hardware support, we keep weights in fp8
if current_platform.has_device_capability(94):
# For GPUs with hardware support, we keep weights in fp8
assert fc1.weight.dtype == torch.float8_e4m3fnuz
assert fc1.weight.dtype == current_platform.fp8_dtype()
llm.apply_model(check_model)
......
......@@ -10,6 +10,11 @@ def is_quant_method_supported(quant_method: str) -> bool:
if not (current_platform.is_cuda() or current_platform.is_rocm()):
return False
try:
current_platform.verify_quantization(quant_method)
except ValueError:
return False
capability = current_platform.get_device_capability()
assert capability is not None
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment