Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
7e63ef82
Commit
7e63ef82
authored
Jan 21, 2026
by
zhuwenwen
Browse files
Merge tag 'v0.14.0' into v0.14.0-dev
parents
8cbcac5d
b17039bc
Changes
681
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
789 additions
and
61 deletions
+789
-61
tests/multimodal/test_cache.py
tests/multimodal/test_cache.py
+39
-0
tests/multimodal/test_embedding_shape_validation_unit.py
tests/multimodal/test_embedding_shape_validation_unit.py
+249
-0
tests/multimodal/test_image.py
tests/multimodal/test_image.py
+33
-0
tests/multimodal/test_processing.py
tests/multimodal/test_processing.py
+2
-3
tests/multimodal/test_video.py
tests/multimodal/test_video.py
+209
-0
tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
...dd_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
+2
-7
tests/plugins_tests/test_platform_plugins.py
tests/plugins_tests/test_platform_plugins.py
+1
-1
tests/quantization/test_auto_round.py
tests/quantization/test_auto_round.py
+3
-1
tests/quantization/test_compressed_tensors.py
tests/quantization/test_compressed_tensors.py
+10
-4
tests/quantization/test_configs.py
tests/quantization/test_configs.py
+33
-8
tests/quantization/test_cpu_offload.py
tests/quantization/test_cpu_offload.py
+1
-1
tests/quantization/test_cpu_wna16.py
tests/quantization/test_cpu_wna16.py
+1
-0
tests/quantization/test_experts_int8.py
tests/quantization/test_experts_int8.py
+5
-1
tests/quantization/test_gptq_dynamic.py
tests/quantization/test_gptq_dynamic.py
+6
-1
tests/quantization/test_modelopt.py
tests/quantization/test_modelopt.py
+141
-0
tests/quantization/test_rtn.py
tests/quantization/test_rtn.py
+5
-1
tests/quantization/test_torchao.py
tests/quantization/test_torchao.py
+6
-0
tests/quantization/untest_fp8.py
tests/quantization/untest_fp8.py
+26
-8
tests/quantization/untest_ptpc_fp8.py
tests/quantization/untest_ptpc_fp8.py
+12
-25
tests/quantization/utils.py
tests/quantization/utils.py
+5
-0
No files found.
Too many changes to show.
To preserve performance only
681 of 681+
files are displayed.
Plain diff
Email patch
tests/multimodal/test_cache.py
View file @
7e63ef82
...
...
@@ -24,10 +24,12 @@ from vllm.multimodal.cache import (
)
from
vllm.multimodal.hasher
import
MultiModalHasher
from
vllm.multimodal.inputs
import
(
MultiModalFeatureSpec
,
MultiModalFieldElem
,
MultiModalKwargsItem
,
MultiModalKwargsItems
,
MultiModalSharedField
,
PlaceholderRange
,
)
from
vllm.multimodal.processing
import
PromptInsertion
from
vllm.utils.mem_constants
import
GiB_bytes
,
MiB_bytes
...
...
@@ -518,3 +520,40 @@ def test_cache_eviction_shm_cache():
receiver_cache
=
ShmObjectStoreReceiverCache
(
vllm_config
,
mp
.
Lock
())
_run_test_cache_eviction_shm
(
sender_cache
,
receiver_cache
,
base_item_size
=
MiB_bytes
)
def test_processor_cache_shared_across_loras():
    """Check that the receiver cache is keyed by mm_hash, not by identifier.

    Two feature specs share one ``mm_hash`` but carry different
    LoRA-prefixed identifiers. The first one brings the data; the second
    arrives with ``data=None`` and is expected to have its data filled in
    from the cache.
    """
    receiver_cache = MultiModalReceiverCache(
        ModelConfig(
            model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
            mm_processor_cache_gb=1,
        )
    )

    base_mm_hash = "image_hash_abc123"
    lora_a_identifier = f"12345:{base_mm_hash}"
    lora_b_identifier = f"67890:{base_mm_hash}"

    item_data = MultiModalKwargsItem.dummy("test_image", nbytes=1024)

    def build_feature(identifier, data):
        # Both specs differ only in identifier and in whether data is attached.
        return MultiModalFeatureSpec(
            data=data,
            modality="image",
            identifier=identifier,
            mm_position=PlaceholderRange(offset=0, length=100),
            mm_hash=base_mm_hash,
        )

    # First request (LoRA A) carries the payload; the entry must be stored
    # under the bare mm_hash.
    feature_lora_a = build_feature(lora_a_identifier, item_data)
    receiver_cache.get_and_update_features([feature_lora_a])
    assert base_mm_hash in receiver_cache._cache

    # Second request (LoRA B) carries no payload; it must be served from
    # the entry stored by the first request.
    feature_lora_b = build_feature(lora_b_identifier, None)
    receiver_cache.get_and_update_features([feature_lora_b])
    assert feature_lora_b.data == item_data
tests/multimodal/test_embedding_shape_validation_unit.py
0 → 100644
View file @
7e63ef82
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Unit tests for embedding shape validation.
Simple, fast unit tests that can run without server fixtures.
Run with: pytest tests/multimodal/test_embedding_shape_validation_unit.py -v
"""
import
pytest
import
torch
from
vllm.multimodal.parse
import
(
AudioEmbeddingItems
,
ImageEmbeddingItems
,
)
class TestImageEmbedBasicValidation:
    """Exercise the ndim and hidden-size checks applied by ImageEmbeddingItems."""

    def test_valid_2d_tensor_accepted(self):
        """Baseline: a single 2D tensor is accepted."""
        embeds = torch.randn(10, 768, dtype=torch.float32)

        # Construction is the check here: 2D input must not raise.
        parsed = ImageEmbeddingItems(embeds)
        assert parsed.get_count() == 10

    def test_valid_3d_tensor_accepted(self):
        """Baseline: a single 3D tensor is accepted."""
        embeds = torch.randn(2, 10, 768, dtype=torch.float32)

        # Construction is the check here: 3D input must not raise.
        parsed = ImageEmbeddingItems(embeds)
        assert parsed.get_count() == 2

    def test_valid_list_of_2d_tensors_accepted(self):
        """Baseline: a list of 2D tensors is accepted, one item per tensor."""
        first = torch.randn(10, 768, dtype=torch.float32)
        second = torch.randn(15, 768, dtype=torch.float32)

        # Must not raise.
        parsed = ImageEmbeddingItems([first, second])
        assert parsed.get_count() == 2

    def test_1d_tensor_rejected(self):
        """Security: a 1D tensor has an invalid ndim and must be rejected."""
        bad_embeds = torch.randn(768, dtype=torch.float32)  # 1D

        with pytest.raises(ValueError) as caught:
            ImageEmbeddingItems(bad_embeds)

        message = str(caught.value)
        assert "must be 2D" in message or "3D" in message

    def test_4d_tensor_rejected(self):
        """Security: a 4D tensor has an invalid ndim and must be rejected."""
        bad_embeds = torch.randn(1, 2, 10, 768, dtype=torch.float32)  # 4D

        with pytest.raises(ValueError) as caught:
            ImageEmbeddingItems(bad_embeds)

        message = str(caught.value)
        assert "must be 2D" in message or "3D" in message

    def test_hidden_size_validation_correct_size(self):
        """Embeddings whose last dim equals the expected hidden size pass."""
        hidden = 768
        embeds = torch.randn(10, hidden, dtype=torch.float32)

        # Must not raise.
        parsed = ImageEmbeddingItems(embeds, expected_hidden_size=hidden)
        assert parsed.get_count() == 10

    def test_hidden_size_validation_wrong_size_rejected(self):
        """Embeddings with a mismatched hidden size are rejected."""
        hidden = 768
        mismatched = 4096
        bad_embeds = torch.randn(10, mismatched, dtype=torch.float32)

        with pytest.raises(ValueError) as caught:
            ImageEmbeddingItems(bad_embeds, expected_hidden_size=hidden)

        # The message must name the problem and quote both sizes.
        message = str(caught.value)
        assert "hidden dimension mismatch" in message.lower()
        assert str(mismatched) in message
        assert str(hidden) in message
class TestAudioEmbedBasicValidation:
    """Exercise the ndim and hidden-size checks applied by AudioEmbeddingItems."""

    def test_valid_2d_tensor_accepted(self):
        """Baseline: a single 2D tensor is accepted."""
        embeds = torch.randn(10, 768, dtype=torch.float32)

        # Construction is the check here: 2D input must not raise.
        parsed = AudioEmbeddingItems(embeds)
        assert parsed.get_count() == 10

    def test_valid_3d_tensor_accepted(self):
        """Baseline: a single 3D tensor is accepted."""
        embeds = torch.randn(2, 10, 768, dtype=torch.float32)

        # Construction is the check here: 3D input must not raise.
        parsed = AudioEmbeddingItems(embeds)
        assert parsed.get_count() == 2

    def test_valid_list_of_2d_tensors_accepted(self):
        """Baseline: a list of 2D tensors is accepted, one item per tensor."""
        first = torch.randn(10, 768, dtype=torch.float32)
        second = torch.randn(15, 768, dtype=torch.float32)

        # Must not raise.
        parsed = AudioEmbeddingItems([first, second])
        assert parsed.get_count() == 2

    def test_1d_tensor_rejected(self):
        """Security: a 1D tensor has an invalid ndim and must be rejected."""
        bad_embeds = torch.randn(768, dtype=torch.float32)  # 1D

        with pytest.raises(ValueError) as caught:
            AudioEmbeddingItems(bad_embeds)

        message = str(caught.value)
        assert "must be 2D" in message or "3D" in message

    def test_scalar_rejected(self):
        """Security: a 0-D (scalar) tensor must be rejected."""
        scalar = torch.tensor(1.0)  # 0D (scalar)

        with pytest.raises(ValueError):
            AudioEmbeddingItems(scalar)

    def test_hidden_size_validation_correct_size(self):
        """Embeddings whose last dim equals the expected hidden size pass."""
        hidden = 768
        embeds = torch.randn(10, hidden, dtype=torch.float32)

        # Must not raise.
        parsed = AudioEmbeddingItems(embeds, expected_hidden_size=hidden)
        assert parsed.get_count() == 10

    def test_hidden_size_validation_wrong_size_rejected(self):
        """Embeddings with a mismatched hidden size are rejected."""
        hidden = 768
        mismatched = 4096
        bad_embeds = torch.randn(10, mismatched, dtype=torch.float32)

        with pytest.raises(ValueError) as caught:
            AudioEmbeddingItems(bad_embeds, expected_hidden_size=hidden)

        # The message must name the problem and quote both sizes.
        message = str(caught.value)
        assert "hidden dimension mismatch" in message.lower()
        assert str(mismatched) in message
        assert str(hidden) in message
class TestShapeValidationDoSPrevention:
    """
    Tests for DoS prevention through shape validation.

    Verifies that embeddings with incorrect shapes are rejected early,
    at item construction time, rather than surfacing later during model
    inference.
    """

    def test_prevent_crash_from_wrong_shape_image_embeds(self):
        """
        Prevent crash scenario: wrong hidden size in image embeddings.

        Without validation, this would pass initial checks but crash later
        during model forward pass when dimensions don't match.
        """
        expected_hidden_size = 768  # Typical model hidden size
        wrong_hidden_size = 4096  # Wrong size (e.g., Llama-sized)
        wrong_embedding = torch.randn(100, wrong_hidden_size, dtype=torch.float32)

        # Should be rejected at instantiation time, not during inference
        with pytest.raises(ValueError) as exc_info:
            ImageEmbeddingItems(
                wrong_embedding, expected_hidden_size=expected_hidden_size
            )

        error_msg = str(exc_info.value)
        assert "hidden dimension mismatch" in error_msg.lower()
        assert str(expected_hidden_size) in error_msg  # Expected
        assert str(wrong_hidden_size) in error_msg  # Received

    def test_prevent_crash_from_wrong_shape_audio_embeds(self):
        """
        Prevent crash scenario: wrong hidden size in audio embeddings.

        Mirrors the image variant above, including the checks that the
        error message quotes both the expected and the received size.
        """
        expected_hidden_size = 768
        wrong_hidden_size = 4096
        wrong_embedding = torch.randn(100, wrong_hidden_size, dtype=torch.float32)

        # Should be rejected at instantiation time, not during inference
        with pytest.raises(ValueError) as exc_info:
            AudioEmbeddingItems(
                wrong_embedding, expected_hidden_size=expected_hidden_size
            )

        error_msg = str(exc_info.value)
        assert "hidden dimension mismatch" in error_msg.lower()
        # Keep parity with the image test: both sizes must be reported.
        assert str(expected_hidden_size) in error_msg  # Expected
        assert str(wrong_hidden_size) in error_msg  # Received

    def test_extremely_large_hidden_size_rejected(self):
        """Security: Prevent DoS from extremely large embeddings."""
        expected_hidden_size = 768
        huge_hidden_size = 100000  # Large but not extreme to avoid test OOM
        invalid_tensor = torch.randn(10, huge_hidden_size, dtype=torch.float32)

        with pytest.raises(ValueError) as exc_info:
            ImageEmbeddingItems(
                invalid_tensor, expected_hidden_size=expected_hidden_size
            )

        assert "hidden dimension mismatch" in str(exc_info.value).lower()

    def test_batch_with_mixed_hidden_sizes_rejected(self):
        """All embeddings in a list must have the same hidden size."""
        expected_hidden_size = 768

        # One correct, one wrong
        batch = [
            torch.randn(10, expected_hidden_size, dtype=torch.float32),
            torch.randn(10, expected_hidden_size + 100, dtype=torch.float32),  # Wrong!
        ]

        # Should fail on the second one
        with pytest.raises(ValueError) as exc_info:
            ImageEmbeddingItems(batch, expected_hidden_size=expected_hidden_size)

        assert "hidden dimension mismatch" in str(exc_info.value).lower()
if __name__ == "__main__":
    # pytest.main() returns the exit code of the run. Pass it on to the
    # interpreter so that running this file directly reports test failures
    # through the process exit status instead of always exiting with 0.
    raise SystemExit(pytest.main([__file__, "-v", "--tb=short"]))
tests/multimodal/test_image.py
View file @
7e63ef82
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pickle
from
pathlib
import
Path
import
numpy
as
np
import
pytest
from
PIL
import
Image
,
ImageChops
from
vllm.multimodal.base
import
MediaWithBytes
from
vllm.multimodal.image
import
ImageMediaIO
,
convert_image_mode
pytestmark
=
pytest
.
mark
.
cpu_test
...
...
@@ -157,3 +159,34 @@ def test_rgba_background_color_validation():
ImageMediaIO
(
rgba_background_color
=
(
0
,
0
,
0
))
# Should not raise
ImageMediaIO
(
rgba_background_color
=
[
255
,
255
,
255
])
# Should not raise
ImageMediaIO
(
rgba_background_color
=
(
128
,
128
,
128
))
# Should not raise
def test_media_with_bytes_pickle_roundtrip():
    """Regression test: MediaWithBytes survives a pickle round trip.

    Before the fix, unpickling raised RecursionError.
    See: https://github.com/vllm-project/vllm/issues/30818
    """
    delegated_attrs = ("width", "height", "mode")

    original_image = Image.open(ASSETS_DIR / "image1.png").convert("RGB")
    original_bytes = b"test_bytes_data"
    wrapper = MediaWithBytes(media=original_image, original_bytes=original_bytes)

    # Attribute access on the wrapper must reach the wrapped image.
    for attr in delegated_attrs:
        assert getattr(wrapper, attr) == getattr(original_image, attr)

    # The round trip itself is the regression check (RecursionError before the fix).
    unpickled = pickle.loads(pickle.dumps(wrapper))

    # The restored object carries the same payload...
    assert unpickled.original_bytes == original_bytes
    assert unpickled.media.width == original_image.width
    assert unpickled.media.height == original_image.height

    # ...and still delegates attribute access to the wrapped image.
    for attr in delegated_attrs:
        assert getattr(unpickled, attr) == getattr(original_image, attr)
tests/multimodal/test_processing.py
View file @
7e63ef82
...
...
@@ -1021,9 +1021,8 @@ def test_hf_processor_init_kwargs(
DummyProcessor
,
# type: ignore[arg-type]
**
inference_kwargs
,
)
for
k
,
v
in
expected_kwargs
.
items
():
assert
getattr
(
processor
,
k
)
==
v
assert
processor
.
a
==
expected_kwargs
[
"a"
]
assert
processor
.
b
==
expected_kwargs
[
"b"
]
@
pytest
.
mark
.
parametrize
(
"model_id"
,
[
"Qwen/Qwen2-VL-2B-Instruct"
])
# Dummy
...
...
tests/multimodal/test_video.py
View file @
7e63ef82
...
...
@@ -299,3 +299,212 @@ def test_video_media_io_backend_env_var_fallback(monkeypatch: pytest.MonkeyPatch
frames_missing
,
metadata_missing
=
videoio_missing
.
load_bytes
(
b
"test"
)
np
.
testing
.
assert_array_equal
(
frames_missing
,
FAKE_OUTPUT_2
)
assert
metadata_missing
[
"video_backend"
]
==
"test_video_backend_override_2"
# ============================================================================
# Frame Recovery Tests
# ============================================================================
def test_video_recovery_simulated_failures(monkeypatch: pytest.MonkeyPatch):
    """
    Check that frame recovery yields more frames when grab() fails.

    cv2.VideoCapture is swapped for a wrapper whose grab() reports failure
    on selected call indices (on top of the real corruption at frame 17 of
    corrupted.mp4). The video is then loaded with and without
    frame_recovery, and the recovered load must contain more frames.
    """
    import cv2

    with monkeypatch.context() as m:
        m.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv")

        # corrupted.mp4 has 26 frames; frame 17 is genuinely corrupted.
        with open(ASSETS_DIR / "corrupted.mp4", "rb") as f:
            video_data = f.read()

        # Extra, simulated failures in addition to the real one at frame 17.
        injected_failures = {3, 10}

        # Keep a handle on the real class before it gets patched out.
        real_video_capture = cv2.VideoCapture

        class MockVideoCapture:
            """Delegating wrapper whose grab() fails on chosen call indices."""

            def __init__(self, *args, **kwargs):
                self._cap = real_video_capture(*args, **kwargs)
                self._grab_calls = 0

            def grab(self):
                # Index of this call, counted from zero.
                call_index = self._grab_calls
                self._grab_calls += 1
                if call_index in injected_failures:
                    # Simulated failure: the real capture is not touched.
                    return False
                return self._cap.grab()

            def retrieve(self):
                return self._cap.retrieve()

            def get(self, prop):
                return self._cap.get(prop)

            def isOpened(self):
                return self._cap.isOpened()

            def release(self):
                return self._cap.release()

        m.setattr(cv2, "VideoCapture", MockVideoCapture)

        loader = VIDEO_LOADER_REGISTRY.load("opencv")

        # num_frames=8 samples: [0, 3, 7, 10, 14, 17, 21, 25]
        #   frame 3:  mocked failure,  recovery window [3, 7)   -> frame 4
        #   frame 10: mocked failure,  recovery window [10, 14) -> frame 11
        #   frame 17: real corruption, recovery window [17, 21) -> frame 18

        # Load without recovery first, then with recovery.
        frames_plain, meta_plain = loader.load_bytes(
            video_data, num_frames=8, frame_recovery=False
        )
        frames_recovered, meta_recovered = loader.load_bytes(
            video_data, num_frames=8, frame_recovery=True
        )

        plain_count = frames_plain.shape[0]
        recovered_count = frames_recovered.shape[0]

        # Expected: 5 frames without recovery (3, 10, 17 fail), 8 with.
        assert recovered_count > plain_count, (
            "Recovery should produce more frames. "
            f"Without: {plain_count}, "
            f"With: {recovered_count}"
        )

        # Frame arrays and reported indices must agree in length.
        assert plain_count == len(meta_plain["frames_indices"])
        assert recovered_count == len(meta_recovered["frames_indices"])

        # Recovered indices must stay in temporal order.
        recovered_indices = meta_recovered["frames_indices"]
        assert recovered_indices == sorted(recovered_indices)
def test_video_recovery_with_corrupted_file(monkeypatch: pytest.MonkeyPatch):
    """
    Check frame recovery on a genuinely corrupted video with sparse sampling.

    corrupted.mp4 has real H.264 codec errors on frame 17. With num_frames=8
    the targets are [0, 3, 7, 10, 14, 17, 21, 25]; frames 18-20 are readable,
    so recovery can substitute frame 18 for the failed frame 17.

    Expectations:
    1. Without recovery: frame 17 is skipped (7 frames loaded)
    2. With recovery: frame 18 stands in for frame 17 (8 frames loaded)
    3. Recovery yields MORE frames than no recovery
    4. Metadata agrees with the frames actually loaded
    """
    with monkeypatch.context() as m:
        m.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv")

        with open(ASSETS_DIR / "corrupted.mp4", "rb") as f:
            video_data = f.read()

        loader = VIDEO_LOADER_REGISTRY.load("opencv")

        # num_frames=8 makes frame 17 a target with recovery window [17, 21).
        # Without recovery, frame 17 is simply dropped.
        frames_plain, meta_plain = loader.load_bytes(
            video_data, num_frames=8, frame_recovery=False
        )
        # With recovery, frame 18 should fill the gap.
        frames_recovered, meta_recovered = loader.load_bytes(
            video_data, num_frames=8, frame_recovery=True
        )

        plain_count = frames_plain.shape[0]
        recovered_count = frames_recovered.shape[0]

        # Metadata must be consistent in both modes.
        assert plain_count == len(meta_plain["frames_indices"]), (
            "Frame count must match indices without recovery"
        )
        assert recovered_count == len(meta_recovered["frames_indices"]), (
            "Frame count must match indices with recovery"
        )

        # KEY ASSERTION: 7 frames without recovery vs 8 with recovery.
        assert recovered_count > plain_count, (
            "Recovery should produce more frames with sparse sampling. "
            f"Got {recovered_count} with recovery vs "
            f"{plain_count} without"
        )

        # Every requested frame must be present once recovery is on.
        assert recovered_count == 8, (
            "With recovery, should load all 8 requested frames. "
            f"Got {recovered_count}"
        )

        # The reported total frame count of the file must be right as well.
        expected_total_frames = 26
        assert meta_recovered["total_num_frames"] == expected_total_frames, (
            f"Expected {expected_total_frames} total frames in metadata"
        )
def test_video_recovery_dynamic_backend(monkeypatch: pytest.MonkeyPatch):
    """
    Check that frame_recovery is usable with the dynamic video backend.

    The dynamic backend picks frames from fps/duration instead of loading
    every frame; this test loads corrupted.mp4 through it with and without
    recovery and compares the outcomes.
    """
    with monkeypatch.context() as m:
        m.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv_dynamic")

        with open(ASSETS_DIR / "corrupted.mp4", "rb") as f:
            video_data = f.read()

        loader = VIDEO_LOADER_REGISTRY.load("opencv_dynamic")

        # Recovery disabled.
        frames_plain, meta_plain = loader.load_bytes(
            video_data, fps=2, max_duration=10, frame_recovery=False
        )
        # Recovery enabled.
        frames_recovered, meta_recovered = loader.load_bytes(
            video_data, fps=2, max_duration=10, frame_recovery=True
        )

        plain_count = frames_plain.shape[0]
        recovered_count = frames_recovered.shape[0]

        # Both modes must load something.
        assert plain_count > 0, "Should load some frames without recovery"
        assert recovered_count > 0, "Should load some frames with recovery"

        # The dynamic backend always reports do_sample_frames as False.
        assert "do_sample_frames" in meta_recovered
        assert meta_recovered["do_sample_frames"] is False
        assert recovered_count == len(meta_recovered["frames_indices"])

        # Key assertion: recovery must never yield fewer frames.
        assert recovered_count >= plain_count, (
            "Recovery should produce at least as many frames. "
            f"Got {recovered_count} with recovery vs "
            f"{plain_count} without"
        )
tests/plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py
View file @
7e63ef82
...
...
@@ -7,7 +7,7 @@ import torch
import
torch.nn
as
nn
from
vllm.config
import
VllmConfig
from
vllm.model_executor.layers.pooler
import
DispatchPooler
,
Pooler
from
vllm.model_executor.layers.pooler
import
DispatchPooler
from
vllm.model_executor.models.gemma2
import
Gemma2Model
from
vllm.model_executor.models.utils
import
WeightsMapper
,
maybe_prefix
from
vllm.sequence
import
IntermediateTensors
...
...
@@ -28,12 +28,7 @@ class MyGemma2Embedding(nn.Module):
pooler_config
=
vllm_config
.
model_config
.
pooler_config
assert
pooler_config
is
not
None
self
.
pooler
=
DispatchPooler
(
{
"token_embed"
:
Pooler
.
for_token_embed
(
pooler_config
),
"embed"
:
Pooler
.
for_embed
(
pooler_config
),
}
)
self
.
pooler
=
DispatchPooler
.
for_embedding
(
pooler_config
)
self
.
make_empty_intermediate_tensors
=
(
self
.
model
.
make_empty_intermediate_tensors
...
...
tests/plugins_tests/test_platform_plugins.py
View file @
7e63ef82
...
...
@@ -31,7 +31,7 @@ def test_platform_plugins():
)
# def test_oot_custom_op(monkeypatch: pytest.MonkeyPatch):
# def test_oot_custom_op(
default_vllm_config,
monkeypatch: pytest.MonkeyPatch):
# # simulate workload by running an example
# load_general_plugins()
# from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding
...
...
tests/quantization/test_auto_round.py
View file @
7e63ef82
...
...
@@ -26,7 +26,9 @@ MODELS = [
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
def
test_auto_round
(
vllm_runner
,
model
):
with
vllm_runner
(
model
,
enforce_eager
=
True
)
as
llm
:
with
vllm_runner
(
model
,
enforce_eager
=
True
,
allow_deprecated_quantization
=
True
)
as
llm
:
output
=
llm
.
generate_greedy
([
"The capital of France is"
],
max_tokens
=
8
)
assert
output
print
(
f
"
{
output
[
0
][
1
]
}
"
)
tests/quantization/test_compressed_tensors.py
View file @
7e63ef82
...
...
@@ -86,7 +86,7 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner, model_args):
current_platform
.
is_rocm
()
and
model_path
not
in
ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL
):
pytest
.
skip
(
f
"Skip model
{
model_path
}
as it is not support on ROCm."
)
pytest
.
skip
(
f
"Skip model
{
model_path
}
as it is not support
ed
on ROCm."
)
with
vllm_runner
(
model_path
,
enforce_eager
=
True
)
as
llm
:
...
...
@@ -164,7 +164,7 @@ def test_compressed_tensors_w8a8_logprobs(
current_platform
.
is_rocm
()
and
model_path
not
in
ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL
):
pytest
.
skip
(
f
"Skip model
{
model_path
}
as it is not support on ROCm."
)
pytest
.
skip
(
f
"Skip model
{
model_path
}
as it is not support
ed
on ROCm."
)
if
use_aiter
:
if
model_path
not
in
ROCM_AITER_SUPPORTED_INT8_MODEL
:
...
...
@@ -234,7 +234,7 @@ def test_compressed_tensors_w8a8_dynamic_per_token(
current_platform
.
is_rocm
()
and
model_path
not
in
ROCM_TRITON_SCALED_MM_SUPPORTED_INT8_MODEL
):
pytest
.
skip
(
f
"Skip model
{
model_path
}
as it is not support on ROCm."
)
pytest
.
skip
(
f
"Skip model
{
model_path
}
as it is not support
ed
on ROCm."
)
if
use_aiter
:
if
model_path
not
in
ROCM_AITER_SUPPORTED_INT8_MODEL
:
...
...
@@ -651,6 +651,9 @@ def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4):
assert
output
@
pytest
.
mark
.
skipif
(
not
current_platform
.
is_cuda
(),
reason
=
"This test is skipped on non-CUDA platform."
)
@
pytest
.
mark
.
parametrize
(
"args"
,
[
...
...
@@ -783,7 +786,10 @@ def test_compressed_tensors_fp8_block_enabled(vllm_runner):
input_quant_op
=
qkv_proj
.
scheme
.
w8a8_block_fp8_linear
.
input_quant_op
assert
isinstance
(
input_quant_op
,
QuantFP8
)
assert
input_quant_op
.
_forward_method
==
input_quant_op
.
forward_cuda
assert
input_quant_op
.
_forward_method
in
(
input_quant_op
.
forward_cuda
,
input_quant_op
.
forward_hip
,
)
llm
.
apply_model
(
check_model
)
...
...
tests/quantization/test_configs.py
View file @
7e63ef82
...
...
@@ -11,7 +11,8 @@ import pytest
import
os
from
vllm.config
import
ModelConfig
from
..utils
import
models_path_prefix
from
vllm.platforms
import
current_platform
from
tests.utils
import
models_path_prefix
@
dataclass
...
...
@@ -25,21 +26,45 @@ MODEL_ARG_EXPTYPES = [
# AUTOGPTQ
# compat: autogptq <=0.7.1 is_marlin_format: bool
# Model Serialized in Exllama Format.
# (os.path.join(models_path_prefix, "TheBloke/Llama-2-7B-Chat-GPTQ"), None, "gptq_marlin"),
# (os.path.join(models_path_prefix, "TheBloke/Llama-2-7B-Chat-GPTQ"), "marlin", "gptq_marlin"),
# (os.path.join(models_path_prefix, "TheBloke/Llama-2-7B-Chat-GPTQ"), "gptq", "gptq"),
(
os
.
path
.
join
(
models_path_prefix
,
"TheBloke/Llama-2-7B-Chat-GPTQ"
),
None
,
"gptq_marlin"
if
current_platform
.
is_cuda
()
else
"gptq"
,
),
(
os
.
path
.
join
(
models_path_prefix
,
"TheBloke/Llama-2-7B-Chat-GPTQ"
),
"marlin"
,
"gptq_marlin"
if
current_platform
.
is_cuda
()
else
"ERROR"
,
),
(
os
.
path
.
join
(
models_path_prefix
,
"TheBloke/Llama-2-7B-Chat-GPTQ"
),
"gptq"
,
"gptq"
),
(
os
.
path
.
join
(
models_path_prefix
,
"TheBloke/Llama-2-7B-Chat-GPTQ"
),
"awq"
,
"ERROR"
),
# compat: autogptq >=0.8.0 use checkpoint_format: str
# Model Serialized in Exllama Format.
(
os
.
path
.
join
(
models_path_prefix
,
"LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
),
None
,
"gptq_marlin"
),
(
os
.
path
.
join
(
models_path_prefix
,
"LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
),
"marlin"
,
"gptq_marlin"
),
(
os
.
path
.
join
(
models_path_prefix
,
"LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
),
None
,
"gptq_marlin"
if
current_platform
.
is_cuda
()
else
"gptq"
,
),
(
os
.
path
.
join
(
models_path_prefix
,
"LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
),
"marlin"
,
"gptq_marlin"
if
current_platform
.
is_cuda
()
else
"ERROR"
,
),
(
os
.
path
.
join
(
models_path_prefix
,
"LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
),
"gptq"
,
"gptq"
),
(
os
.
path
.
join
(
models_path_prefix
,
"LnL-AI/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
),
"awq"
,
"ERROR"
),
# AUTOAWQ
# (os.path.join(models_path_prefix, "TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"), None, "awq_marlin"),
(
os
.
path
.
join
(
models_path_prefix
,
"TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"
),
None
,
"awq_marlin"
if
current_platform
.
is_cuda
()
else
"awq"
,
),
(
os
.
path
.
join
(
models_path_prefix
,
"TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"
),
"awq"
,
"awq"
),
# (os.path.join(models_path_prefix, "TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"), "marlin", "awq_marlin"),
(
os
.
path
.
join
(
models_path_prefix
,
"TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"
),
"marlin"
,
"awq_marlin"
if
current_platform
.
is_cuda
()
else
"ERROR"
,
),
(
os
.
path
.
join
(
models_path_prefix
,
"TheBloke/OpenHermes-2.5-Mistral-7B-AWQ"
),
"gptq"
,
"ERROR"
),
]
...
...
tests/quantization/test_cpu_offload.py
View file @
7e63ef82
...
...
@@ -67,7 +67,7 @@ def test_cpu_offload_compressed_tensors(monkeypatch):
monkeypatch
.
setenv
(
"VLLM_TEST_FORCE_LOAD_FORMAT"
,
"auto"
)
# Test wNa16
compare_two_settings
(
os
.
path
.
join
(
models_path_prefix
,
"nm-testing/
tinyllama-oneshot-w4a16-channel-v2
"
),
os
.
path
.
join
(
models_path_prefix
,
"nm-testing/
Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16
"
),
[
"--enforce_eager"
],
[
"--enforce_eager"
,
"--cpu-offload-gb"
,
"1"
],
max_wait_seconds
=
480
,
...
...
tests/quantization/test_cpu_wna16.py
View file @
7e63ef82
...
...
@@ -10,6 +10,7 @@ if not current_platform.is_cpu():
MODELS
=
[
"TheBloke/TinyLlama-1.1B-Chat-v1.0-AWQ"
,
"TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ"
,
# with g_idx
"Qwen/Qwen1.5-0.5B-Chat-GPTQ-Int4"
,
# without g_idx
]
DTYPE
=
[
"bfloat16"
]
...
...
tests/quantization/test_experts_int8.py
View file @
7e63ef82
...
...
@@ -38,6 +38,10 @@ def test_model_experts_int8_startup(
model_info
.
check_transformers_version
(
on_fail
=
"skip"
)
with
vllm_runner
(
model
,
dtype
=
dtype
,
enforce_eager
=
True
,
quantization
=
"experts_int8"
model
,
dtype
=
dtype
,
enforce_eager
=
True
,
quantization
=
"experts_int8"
,
allow_deprecated_quantization
=
True
,
)
as
vllm_model
:
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
tests/quantization/test_gptq_dynamic.py
View file @
7e63ef82
...
...
@@ -15,7 +15,9 @@ from vllm.model_executor.layers.quantization.gptq_marlin import GPTQMarlinLinear
from
vllm.model_executor.layers.quantization.utils.gptq_utils
import
(
get_dynamic_override
,
)
from
..utils
import
models_path_prefix
from
vllm.platforms
import
current_platform
PROMPT
=
"On the surface of Mars, we found"
...
...
@@ -23,7 +25,10 @@ PROMPT = "On the surface of Mars, we found"
# The second layer is quantized using bits=8, group_size=32
# All other layers (layer index >= 2) are not quantized
MODEL_QUANT
=
[
(
os
.
path
.
join
(
models_path_prefix
,
"ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue"
),
True
),
(
os
.
path
.
join
(
models_path_prefix
,
"ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue"
),
current_platform
.
is_cuda
(),
),
(
os
.
path
.
join
(
models_path_prefix
,
"ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse"
),
False
,
...
...
tests/quantization/test_modelopt.py
View file @
7e63ef82
...
...
@@ -6,6 +6,7 @@ Run `pytest tests/quantization/test_modelopt.py`.
"""
import
os
from
typing
import
NoReturn
import
pytest
import
torch
...
...
@@ -19,6 +20,28 @@ def enable_pickle(monkeypatch):
monkeypatch
.
setenv
(
"VLLM_ALLOW_INSECURE_SERIALIZATION"
,
"1"
)
def _skip(msg: str) -> NoReturn:
    """Skip the currently running test, reporting ``msg`` as the reason.

    Args:
        msg: Text passed to ``pytest.skip`` and, as a fallback, used as the
            ``RuntimeError`` message.

    Raises:
        RuntimeError: If ``pytest.skip`` returns instead of raising.
    """
    pytest.skip(msg)
    # Fallback so this function never returns normally, which keeps the
    # ``NoReturn`` annotation truthful for callers and type checkers.
    raise RuntimeError(msg)
def _snapshot_download_or_skip(model_id: str) -> str:
    """Download ``model_id`` from the HF Hub, skipping the test on failure.

    Args:
        model_id: Hub repository id of the model checkpoint to fetch.

    Returns:
        The value returned by ``huggingface_hub.snapshot_download``.

    Any exception raised while importing ``huggingface_hub`` or while
    downloading is routed to ``_skip`` rather than propagated.
    """
    try:
        from huggingface_hub import snapshot_download
    except Exception as import_error:  # pragma: no cover
        _skip(
            f"huggingface_hub is required to download {model_id}: {import_error}"
        )

    try:
        local_snapshot = snapshot_download(
            repo_id=model_id,
            repo_type="model",
            # These checkpoints are already small; download full repo for simplicity.
            allow_patterns=["*"],
        )
    except Exception as download_error:
        _skip(f"Failed to download {model_id} from the HF Hub: {download_error}")
    return local_snapshot
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"modelopt"
),
reason
=
"ModelOpt FP8 is not supported on this GPU type."
,
...
...
@@ -91,3 +114,121 @@ def test_modelopt_fp8_checkpoint_setup(vllm_runner):
output
=
llm
.
generate_greedy
([
"Hello my name is"
],
max_tokens
=
4
)
assert
output
print
(
f
"ModelOpt FP8 output:
{
output
}
"
)
@pytest.mark.skipif(
    not is_quant_method_supported("modelopt"),
    reason="ModelOpt FP8 is not supported on this GPU type.",
)
def test_modelopt_fp8_pc_pt_checkpoint_setup(vllm_runner):
    """Test ModelOpt FP8_PER_CHANNEL_PER_TOKEN checkpoint setup.

    Loads the checkpoint with ``quantization="modelopt"``, inspects the four
    linear projections of the first decoder layer, then runs a short greedy
    generation and checks that it produced output.
    """
    checkpoint_dir = _snapshot_download_or_skip(
        "CedricHwang/qwen2.5-0.5b-modelopt-fp8-pc-pt"
    )

    with vllm_runner(
        checkpoint_dir, quantization="modelopt", enforce_eager=True
    ) as llm:

        def check_model(model):
            first_layer = model.model.layers[0]
            # Order matters only for which assertion fires first on failure:
            # qkv_proj, o_proj, gate_up_proj, down_proj.
            projections = (
                first_layer.self_attn.qkv_proj,
                first_layer.self_attn.o_proj,
                first_layer.mlp.gate_up_proj,
                first_layer.mlp.down_proj,
            )

            from vllm.model_executor.layers.quantization.modelopt import (
                ModelOptFp8PcPtLinearMethod,
            )

            # Every projection must use the per-channel/per-token method.
            for proj in projections:
                assert isinstance(proj.quant_method, ModelOptFp8PcPtLinearMethod)

            # Weights are expected to be stored as FP8 (e4m3fn).
            for proj in projections:
                assert proj.weight.dtype == torch.float8_e4m3fn

            # Per-channel scales; activations are dynamically scaled per token.
            for proj in projections:
                assert hasattr(proj, "weight_scale")
                assert proj.weight_scale.dtype == torch.float32
                assert proj.weight_scale.dim() == 1
                assert not hasattr(proj, "input_scale")

        llm.apply_model(check_model)

        output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
        assert output
        print(f"ModelOpt FP8_PER_CHANNEL_PER_TOKEN output: {output}")
@pytest.mark.skipif(
    not is_quant_method_supported("modelopt"),
    reason="ModelOpt FP8 is not supported on this GPU type.",
)
def test_modelopt_fp8_pb_wo_checkpoint_setup(vllm_runner):
    """Test ModelOpt FP8_PB_WO checkpoint setup.

    Loads the checkpoint with ``quantization="modelopt"``, inspects the four
    linear projections of the first decoder layer, then runs a short greedy
    generation and checks that it produced output.
    """
    checkpoint_dir = _snapshot_download_or_skip(
        "CedricHwang/qwen2.5-0.5b-modelopt-fp8-pb-wo"
    )

    with vllm_runner(
        checkpoint_dir, quantization="modelopt", enforce_eager=True
    ) as llm:

        def check_model(model):
            first_layer = model.model.layers[0]
            # Order matters only for which assertion fires first on failure:
            # qkv_proj, o_proj, gate_up_proj, down_proj.
            projections = (
                first_layer.self_attn.qkv_proj,
                first_layer.self_attn.o_proj,
                first_layer.mlp.gate_up_proj,
                first_layer.mlp.down_proj,
            )

            from vllm.model_executor.layers.quantization.modelopt import (
                ModelOptFp8PbWoLinearMethod,
            )

            # Every projection must use the block-wise weight-only method.
            for proj in projections:
                assert isinstance(proj.quant_method, ModelOptFp8PbWoLinearMethod)

            # Weights are expected to be stored as FP8 (e4m3fn).
            for proj in projections:
                assert proj.weight.dtype == torch.float8_e4m3fn

            # Block scales; should be materialized as a 2D [out_blk, in_blk] tensor.
            for proj in projections:
                assert hasattr(proj, "weight_scale")
                assert proj.weight_scale.dtype == torch.float32
                assert proj.weight_scale.dim() == 2

        llm.apply_model(check_model)

        output = llm.generate_greedy(["Hello my name is"], max_tokens=4)
        assert output
        print(f"ModelOpt FP8_PB_WO output: {output}")
tests/quantization/test_rtn.py
View file @
7e63ef82
...
...
@@ -30,6 +30,10 @@ def test_model_rtn_startup(
max_tokens
:
int
,
)
->
None
:
with
vllm_runner
(
model
,
enforce_eager
=
True
,
dtype
=
dtype
,
quantization
=
"rtn"
model
,
enforce_eager
=
True
,
dtype
=
dtype
,
quantization
=
"rtn"
,
allow_deprecated_quantization
=
True
,
)
as
vllm_model
:
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
tests/quantization/test_torchao.py
View file @
7e63ef82
...
...
@@ -6,11 +6,17 @@ import importlib.util
import
pytest
import
torch
from
vllm.platforms
import
current_platform
DTYPE
=
[
"bfloat16"
]
TORCHAO_AVAILABLE
=
importlib
.
util
.
find_spec
(
"torchao"
)
is
not
None
@
pytest
.
mark
.
skipif
(
current_platform
.
is_rocm
()
and
current_platform
.
is_fp8_fnuz
(),
reason
=
"Only fp8_fnuz supported on CDNA3 architecture"
,
)
@
pytest
.
mark
.
skipif
(
not
TORCHAO_AVAILABLE
,
reason
=
"torchao is not available"
)
def
test_pre_quantized_model
(
vllm_runner
):
with
vllm_runner
(
...
...
tests/quantization/untest_fp8.py
View file @
7e63ef82
...
...
@@ -38,7 +38,9 @@ MODELS = [
reason
=
"FP8 is not supported on this GPU type."
,
)
@
pytest
.
mark
.
parametrize
(
"model_id"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"force_marlin"
,
[
False
,
True
])
@
pytest
.
mark
.
parametrize
(
"force_marlin"
,
[
False
]
if
current_platform
.
is_rocm
()
else
[
False
,
True
]
)
@
pytest
.
mark
.
parametrize
(
"use_rocm_aiter"
,
[
True
,
False
]
if
current_platform
.
is_rocm
()
else
[
False
]
)
...
...
@@ -127,7 +129,9 @@ def test_kv_cache_model_load_and_run(
reason
=
"FP8 is not supported on this GPU type."
,
)
@
pytest
.
mark
.
parametrize
(
"kv_cache_dtype"
,
[
"auto"
,
"fp8"
])
@
pytest
.
mark
.
parametrize
(
"force_marlin"
,
[
False
,
True
])
@
pytest
.
mark
.
parametrize
(
"force_marlin"
,
[
False
]
if
current_platform
.
is_rocm
()
else
[
False
,
True
]
)
@
pytest
.
mark
.
parametrize
(
"use_rocm_aiter"
,
[
True
,
False
]
if
current_platform
.
is_rocm
()
else
[
False
]
)
...
...
@@ -199,10 +203,10 @@ def test_scaled_fp8_quant(dtype) -> None:
def
quantize_ref
(
tensor
,
inv_scale
):
# The reference implementation that fully aligns to
# the kernel being tested.
finfo
=
torch
.
finfo
(
torch
.
float8_e4m3fn
)
finfo
=
torch
.
finfo
(
current_platform
.
fp8_dtype
()
)
scale
=
inv_scale
.
reciprocal
()
qweight
=
(
tensor
.
to
(
torch
.
float32
)
*
scale
).
clamp
(
min
=
finfo
.
min
,
max
=
finfo
.
max
)
qweight
=
qweight
.
to
(
torch
.
float8_e4m3fn
)
qweight
=
qweight
.
to
(
current_platform
.
fp8_dtype
()
)
return
qweight
def
per_tensor_dequantize
(
tensor
,
inv_scale
,
dtype
):
...
...
@@ -218,7 +222,7 @@ def test_scaled_fp8_quant(dtype) -> None:
ref_y
,
inv_scale
=
ops
.
scaled_fp8_quant
(
x
,
None
)
ref_y
=
per_tensor_dequantize
(
ref_y
,
inv_scale
,
dtype
)
# Reference dynamic quantizaton
# Reference dynamic quantizat
i
on
y
=
quantize_ref
(
x
,
inv_scale
)
torch
.
testing
.
assert_close
(
ref_y
,
per_tensor_dequantize
(
y
,
inv_scale
,
dtype
))
...
...
@@ -269,6 +273,10 @@ def test_scaled_fp8_quant(dtype) -> None:
)
@
pytest
.
mark
.
skipif
(
current_platform
.
is_fp8_fnuz
(),
reason
=
"FP8 e4m3fn weight reloading is not supported on e4m3fnuz platforms"
,
)
@
pytest
.
mark
.
parametrize
(
"method_cls"
,
[
Fp8LinearMethod
,
Fp8MoEMethod
])
# FP8 weight reloading does not support online quantization
@
pytest
.
mark
.
parametrize
(
"is_checkpoint_fp8_serialized"
,
[
True
])
# skip False
...
...
@@ -279,8 +287,19 @@ def test_scaled_fp8_quant(dtype) -> None:
# this is the case for marlin as well as per-tensor Fp8MoEMethod
@
pytest
.
mark
.
parametrize
(
"use_marlin"
,
[
False
])
# skip True
def
test_fp8_reloading
(
method_cls
,
is_checkpoint_fp8_serialized
,
weight_block_size
,
use_marlin
,
dist_init
default_vllm_config
,
method_cls
,
is_checkpoint_fp8_serialized
,
weight_block_size
,
use_marlin
,
dist_init
,
monkeypatch
,
):
# NOTE(rob): this test fails when using DeepGEMM because the
# shapes are invalid. Previously the test was passing because
# we set fp8_backend to None, which sidestepped the issue.
monkeypatch
.
setenv
(
"VLLM_USE_DEEP_GEMM"
,
"0"
)
if
is_checkpoint_fp8_serialized
is
False
:
pytest
.
skip
(
"FP8 weight reloading does not support online quantization"
)
...
...
@@ -308,6 +327,7 @@ def test_fp8_reloading(
params_dtype
=
torch
.
bfloat16
,
weight_loader
=
default_weight_loader
,
)
method
.
use_marlin
=
use_marlin
else
:
layer
=
FusedMoE
(
...
...
@@ -326,8 +346,6 @@ def test_fp8_reloading(
weight_loader
=
default_weight_loader
,
)
method
.
use_marlin
=
use_marlin
# capture weights format during loading
original_metadata
=
[
(
name
,
param
.
shape
,
getattr
(
param
,
"weight_loader"
,
default_weight_loader
))
...
...
tests/quantization/untest_ptpc_fp8.py
View file @
7e63ef82
...
...
@@ -6,18 +6,12 @@ Run `pytest tests/quantization/test_ptpc_fp8.py --forked`.
"""
import
pytest
import
torch
from
tests.quantization.utils
import
is_quant_method_supported
from
vllm.model_executor.layers.quantization.fp8
import
Fp8KVCacheMethod
from
vllm.model_executor.layers.quantization.ptpc_fp8
import
PTPCFp8LinearMethod
from
vllm.platforms
import
current_platform
UNSUPPORTED_STR
=
(
"Currently torch._scaled_mm (hipBLASLt) rowwise gemm only "
"support output dtype of bfloat16. torch.float16 is specified."
)
@
pytest
.
fixture
(
scope
=
"function"
,
autouse
=
True
)
def
enable_pickle
(
monkeypatch
):
...
...
@@ -30,24 +24,17 @@ def enable_pickle(monkeypatch):
reason
=
"PTPC FP8 is not supported on this GPU type."
,
)
@
pytest
.
mark
.
skipif
(
not
current_platform
.
is_rocm
(),
reason
=
"This test is for ROCm GPU."
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"
auto"
,
"bfloat16"
,
"
float16"
])
@
pytest
.
mark
.
parametrize
(
"kv_cache_dtype"
,
[
"auto"
,
"fp8"
,
"fp8_e4m3"
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"
b
float16"
])
@
pytest
.
mark
.
parametrize
(
"kv_cache_dtype"
,
[
"auto"
,
"fp8"
])
def
test_ptpc_fp8_rocm
(
vllm_runner
,
dtype
:
str
,
kv_cache_dtype
:
str
)
->
None
:
try
:
llm
=
vllm_runner
(
"facebook/opt-125m"
,
dtype
=
dtype
,
quantization
=
"ptpc_fp8"
,
enforce_eager
=
True
,
kv_cache_dtype
=
kv_cache_dtype
,
)
except
AssertionError
as
e
:
if
str
(
e
)
==
UNSUPPORTED_STR
:
# If the error message matches, the test passes
return
else
:
# If the error message does not match, re-raise the exception
raise
llm
=
vllm_runner
(
"facebook/opt-125m"
,
dtype
=
dtype
,
quantization
=
"ptpc_fp8"
,
enforce_eager
=
True
,
kv_cache_dtype
=
kv_cache_dtype
,
allow_deprecated_quantization
=
True
,
)
with
llm
:
...
...
@@ -60,9 +47,9 @@ def test_ptpc_fp8_rocm(vllm_runner, dtype: str, kv_cache_dtype: str) -> None:
assert
attn
.
_k_scale
==
1.0
assert
attn
.
_v_scale
==
1.0
# For GPUs with hardware support, we keep weights in fp8
if
current_platform
.
has_device_capability
(
94
):
# For GPUs with hardware support, we keep weights in fp8
assert
fc1
.
weight
.
dtype
==
torch
.
float8_e4m3fnuz
assert
fc1
.
weight
.
dtype
==
current_platform
.
fp8_dtype
()
llm
.
apply_model
(
check_model
)
...
...
tests/quantization/utils.py
View file @
7e63ef82
...
...
@@ -10,6 +10,11 @@ def is_quant_method_supported(quant_method: str) -> bool:
if
not
(
current_platform
.
is_cuda
()
or
current_platform
.
is_rocm
()):
return
False
try
:
current_platform
.
verify_quantization
(
quant_method
)
except
ValueError
:
return
False
capability
=
current_platform
.
get_device_capability
()
assert
capability
is
not
None
...
...
Prev
1
…
25
26
27
28
29
30
31
32
33
…
35
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment