Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
38d80967
Commit
38d80967
authored
Sep 12, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.10.2rc2' into v0.10.2rc2-ori
parents
33650733
880c741b
Changes
544
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
356 additions
and
104 deletions
+356
-104
tests/kernels/quantization/test_block_fp8.py
tests/kernels/quantization/test_block_fp8.py
+50
-2
tests/kernels/test_onednn.py
tests/kernels/test_onednn.py
+70
-0
tests/kernels/utils.py
tests/kernels/utils.py
+2
-2
tests/kv_transfer/test_lookup_buffer.py
tests/kv_transfer/test_lookup_buffer.py
+1
-1
tests/kv_transfer/test_send_recv.py
tests/kv_transfer/test_send_recv.py
+1
-1
tests/lora/test_add_lora.py
tests/lora/test_add_lora.py
+2
-2
tests/lora/test_layers.py
tests/lora/test_layers.py
+8
-8
tests/lora/test_lora_allowed_token_ids.py
tests/lora/test_lora_allowed_token_ids.py
+4
-4
tests/lora/test_lora_manager.py
tests/lora/test_lora_manager.py
+1
-1
tests/lora/test_peft_helper.py
tests/lora/test_peft_helper.py
+1
-1
tests/lora/test_worker.py
tests/lora/test_worker.py
+4
-3
tests/model_executor/model_loader/test_registry.py
tests/model_executor/model_loader/test_registry.py
+2
-1
tests/model_executor/test_enabled_custom_ops.py
tests/model_executor/test_enabled_custom_ops.py
+22
-17
tests/models/language/generation/test_bart.py
tests/models/language/generation/test_bart.py
+2
-0
tests/models/language/generation/test_common.py
tests/models/language/generation/test_common.py
+2
-2
tests/models/language/generation/test_hybrid.py
tests/models/language/generation/test_hybrid.py
+33
-57
tests/models/language/generation/test_mistral.py
tests/models/language/generation/test_mistral.py
+2
-2
tests/models/language/generation_ppl_test/__init__.py
tests/models/language/generation_ppl_test/__init__.py
+0
-0
tests/models/language/generation_ppl_test/ppl_utils.py
tests/models/language/generation_ppl_test/ppl_utils.py
+131
-0
tests/models/language/generation_ppl_test/test_gemma.py
tests/models/language/generation_ppl_test/test_gemma.py
+18
-0
No files found.
Too many changes to show.
To preserve performance only
544 of 544+
files are displayed.
Plain diff
Email patch
tests/kernels/quantization/test_block_fp8.py
View file @
38d80967
...
...
@@ -11,8 +11,8 @@ from tests.kernels.quant_utils import (native_per_token_group_quant_fp8,
native_w8a8_block_matmul
)
from
vllm.config
import
VllmConfig
from
vllm.model_executor.layers.quantization.utils.fp8_utils
import
(
get_col_major_tma_aligned_tensor
,
per_token_group_quant_fp8
,
w8a8_block_fp8_matmul
)
cutlass_scaled_mm
,
get_col_major_tma_aligned_tensor
,
per_token_group_quant_fp8
,
w8a8_block_fp8_matmul
)
from
vllm.platforms
import
current_platform
from
vllm.utils
import
has_deep_gemm
from
vllm.utils.deep_gemm
import
fp8_gemm_nt
,
per_block_cast_to_fp8
...
...
@@ -98,6 +98,54 @@ def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
assert
rel_diff
<
0.001
@torch.inference_mode()
def test_w8a8_block_fp8_cutlass_matmul():
    """Compare the CUTLASS block-scaled FP8 matmul with the native reference.

    Covers the simple case where the weight shape is not a multiple of 128
    (N = 576), as in DSV3 kv_a_proj_with_mqa.
    """
    M, N, K = 32, 576, 7168
    block_size = [128, 128]
    out_dtype = torch.bfloat16
    torch.manual_seed(0)

    fp8_dtype = torch.float8_e4m3fn
    fp8_limits = torch.finfo(fp8_dtype)
    fp8_min, fp8_max = fp8_limits.min, fp8_limits.max
    scale_factor = 1e-2

    # Random draws happen in the order A, B, weight scales so the seeded
    # stream yields the same tensors every run.
    A_fp32 = (torch.rand(M, K, dtype=torch.float32) - 0.5) * 2 * fp8_max
    B_fp32 = (torch.rand(N, K, dtype=torch.float32) - 0.5) * 2 * fp8_max
    B_fp8 = B_fp32.clamp(min=fp8_min, max=fp8_max).to(fp8_dtype)

    # Number of quantization tiles along N and K (ceil division).
    block_n, block_k = block_size
    n_tiles = (N + block_n - 1) // block_n
    k_tiles = (K + block_k - 1) // block_k
    Bs = torch.rand(n_tiles, k_tiles, dtype=torch.float32) * scale_factor

    # Hopper requires row-major format for scales
    if current_platform.is_device_capability(90):
        Bs_cutlass = Bs.T.contiguous()
    else:
        Bs_cutlass = Bs

    # Reference path: row-major activation scales.
    A_fp8, As = per_token_group_quant_fp8(A_fp32,
                                          block_k,
                                          column_major_scales=False)
    # CUTLASS uses column-major format for scales
    A_fp8_cutlass, As_cutlass = per_token_group_quant_fp8(
        A_fp32, block_k, column_major_scales=True)

    ref_out = native_w8a8_block_matmul(A_fp8, B_fp8, As, Bs, block_size,
                                       out_dtype)
    out = cutlass_scaled_mm(A_fp8_cutlass, B_fp8, As_cutlass, Bs_cutlass,
                            block_size, out_dtype)

    # Mean absolute error relative to the mean magnitude of the reference.
    out_f32 = out.to(torch.float32)
    ref_f32 = ref_out.to(torch.float32)
    abs_err = torch.mean(torch.abs(out_f32 - ref_f32))
    ref_mag = torch.mean(torch.abs(ref_f32))
    rel_diff = abs_err / ref_mag
    assert rel_diff < 0.001
@
pytest
.
mark
.
parametrize
(
"M,N,K,block_size,out_dtype,seed"
,
itertools
.
product
(
M
,
N
,
K
,
BLOCK_SIZE
,
OUT_DTYPES
,
SEEDS
))
...
...
tests/kernels/test_onednn.py
View file @
38d80967
...
...
@@ -111,6 +111,49 @@ def onednn_int8_gemm_test_helper(primitive_cache_size: int,
torch
.
testing
.
assert_close
(
out
,
baseline
,
rtol
=
1e-1
,
atol
=
1e0
)
def onednn_gemm_test_helper(primitive_cache_size: int,
                            m: int,
                            n: int,
                            k: int,
                            use_bias: bool,
                            use_stride: bool,
                            dtype: torch.dtype = torch.bfloat16,
                            device: str = "cpu"):
    """Check ops.onednn_mm against torch.nn.functional.linear.

    Args:
        primitive_cache_size: Forwarded to ops.create_onednn_mm.
        m: Number of rows of the activation matrix.
        n: Number of rows of the weight matrix (output features).
        k: Shared inner dimension.
        use_bias: When True, also pass a random bias, then re-run the same
            handler without a bias.
        use_stride: When True, the activation is a column slice of a wider
            (m, 2 * k) tensor rather than a freshly allocated (m, k) one.
        dtype: dtype of the generated tensors.
        device: Device the tensors are created on.
    """
    # Activation first, then weight, then bias: keep the draw order stable.
    act_shape = (m, 2 * k) if use_stride else (m, k)
    a = torch.rand(act_shape, dtype=dtype, device=device) * 1.5
    if use_stride:
        a = a[:, :k]

    b = torch.rand((n, k), dtype=dtype, device=device) * 1.5

    bias = None
    bias_f32 = None
    if use_bias:
        bias = torch.rand((n, ), device=device, dtype=dtype) * 5
        bias_f32 = bias.float()

    handler = ops.create_onednn_mm(
        b.t(),
        primitive_cache_size,
    )

    # The reference is computed in float32 and cast back to the input dtype.
    a_f32 = a.float()
    b_f32 = b.float()
    linear = torch.nn.functional.linear

    out = ops.onednn_mm(handler, a, bias)
    baseline = linear(a_f32, b_f32, bias_f32).to(dtype=a.dtype)
    torch.testing.assert_close(out, baseline)

    if not use_bias:
        return

    # To test runtime bias setting
    out = ops.onednn_mm(handler, a, None)
    baseline = linear(a_f32, b_f32, None).to(dtype=a.dtype)
    torch.testing.assert_close(out, baseline)
@
pytest
.
mark
.
parametrize
(
"n,k"
,
NK_FACTORS
)
@
pytest
.
mark
.
parametrize
(
"m_list"
,
M_FACTORS
)
@
pytest
.
mark
.
parametrize
(
"per_tensor_a_scale"
,
[
True
,
False
])
...
...
@@ -142,3 +185,30 @@ def test_onednn_int8_scaled_gemm(
use_azp
=
use_azp
,
out_dtype
=
output_type
,
)
@pytest.mark.parametrize("n,k", NK_FACTORS)
@pytest.mark.parametrize("m_list", M_FACTORS)
@pytest.mark.parametrize("use_bias", [True, False])
@pytest.mark.parametrize("use_stride", [True, False])
@pytest.mark.parametrize("dtype", DTYPE)
@pytest.mark.parametrize("primitive_cache_size", CACHE_SIZES)
def test_onednn_gemm(
    n: int,
    k: int,
    m_list: tuple[int],
    use_bias: bool,
    use_stride: bool,
    dtype: torch.dtype,
    primitive_cache_size: int,
):
    """Run onednn_gemm_test_helper once for every row count in m_list."""
    # Everything except the row count is shared by all runs of one case.
    shared_kwargs = dict(
        primitive_cache_size=primitive_cache_size,
        n=n,
        k=k,
        use_bias=use_bias,
        use_stride=use_stride,
        dtype=dtype,
    )
    for num_rows in m_list:
        onednn_gemm_test_helper(m=num_rows, **shared_kwargs)
tests/kernels/utils.py
View file @
38d80967
...
...
@@ -1236,7 +1236,7 @@ def baseline_scaled_mm(a: torch.Tensor,
bias
:
Optional
[
torch
.
Tensor
]
=
None
)
->
torch
.
Tensor
:
# We treat N-dimensional group scaling as extended numpy-style broadcasting
# in numpy simply stretches dimensions with an extent of 1 to match
the
# in numpy simply stretches dimensions with an extent of 1 to match
# the target shape by repeating the data along that dimension (broadcasting)
# , we extend these semantics to say if the extent of a dimension in the
# source shape is not 1 and does not match the target shape we repeat each
...
...
@@ -1247,7 +1247,7 @@ def baseline_scaled_mm(a: torch.Tensor,
# then we would expand a to:
# a = [[1, 1, 2, 2],
# [3, 3, 4, 4]]
# NOTE this function
this function
does not explicitly broadcast dimensions
# NOTE this function does not explicitly broadcast dimensions
# with an extent of 1, since this can be done implicitly by pytorch
def
group_broadcast
(
t
,
shape
):
for
i
,
s
in
enumerate
(
shape
):
...
...
tests/kv_transfer/test_lookup_buffer.py
View file @
38d80967
...
...
@@ -128,7 +128,7 @@ if __name__ == "__main__":
print
(
f
"initialized! My rank is
{
my_rank
}
"
)
config
=
KVTransferConfig
(
kv_connector
=
'P
y
NcclConnector'
,
kv_connector
=
'P
2p
NcclConnector'
,
kv_buffer_device
=
'cuda'
,
kv_buffer_size
=
1e9
,
kv_rank
=
my_rank
,
...
...
tests/kv_transfer/test_send_recv.py
View file @
38d80967
...
...
@@ -137,7 +137,7 @@ if __name__ == "__main__":
)
config
=
KVTransferConfig
(
kv_connector
=
'P
y
NcclConnector'
,
kv_connector
=
'P
2p
NcclConnector'
,
kv_buffer_device
=
'cuda'
,
kv_buffer_size
=
1e9
,
kv_rank
=
my_rank
,
...
...
tests/lora/test_add_lora.py
View file @
38d80967
...
...
@@ -59,10 +59,10 @@ async def requests_processing_time(llm,
@
pytest
.
mark
.
asyncio
async
def
test_add_lora
(
chatglm3_lora_files
):
"""
The add_lora function is used to pre
-
load some LoRA adapters into the
The add_lora function is used to preload some LoRA adapters into the
engine in anticipation of future requests using these adapters. To test
this functionality, we use the async engine to process some requests - We
do it twice, once with add_lora() pre
-
loading and once without.
do it twice, once with add_lora() preloading and once without.
We measure the request processing time in both cases and expect the time
to be lesser in the case with add_lora() calls.
...
...
tests/lora/test_layers.py
View file @
38d80967
...
...
@@ -11,21 +11,21 @@ import pytest
import
torch
import
torch.nn.functional
as
F
from
vllm.config
import
LoRAConfig
from
vllm.lora.fully_sharded_layers
import
(
ColumnParallelLinearWithShardedLoRA
,
MergedColumnParallelLinearWithShardedLoRA
,
MergedQKVParallelLinearWithShardedLoRA
,
QKVParallelLinearWithShardedLoRA
,
RowParallelLinearWithShardedLoRA
)
from
vllm.config.lora
import
LoRAConfig
# yapf conflicts with isort for this block
# yapf: disable
from
vllm.lora.layers
import
(
BaseLayerWithLoRA
,
ColumnParallelLinearWithLoRA
,
ColumnParallelLinearWithShardedLoRA
,
LogitsProcessorWithLoRA
,
LoRAMapping
,
MergedColumnParallelLinearWithLoRA
,
MergedColumnParallelLinearWithShardedLoRA
,
MergedQKVParallelLinearWithLoRA
,
MergedQKVParallelLinearWithShardedLoRA
,
QKVParallelLinearWithLoRA
,
QKVParallelLinearWithShardedLoRA
,
ReplicatedLinearWithLoRA
,
RowParallelLinearWithLoRA
,
RowParallelLinearWithShardedLoRA
,
VocabParallelEmbeddingWithLoRA
)
# yapf: enable
from
vllm.lora.models
import
LoRALayerWeights
,
PackedLoRALayerWeights
...
...
@@ -60,9 +60,9 @@ DEVICES = ([
# prefill stage(True) or decode stage(False)
STAGES
=
[
True
,
False
]
NUM_RANDOM_SEEDS
=
6
NUM_RANDOM_SEEDS
=
2
VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS
=
128
VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS
=
2
@
pytest
.
fixture
(
autouse
=
True
)
...
...
tests/lora/test_lora_allowed_token_ids.py
View file @
38d80967
...
...
@@ -3,8 +3,8 @@
import
pytest
from
vllm.config
import
(
CacheConfig
,
DeviceConfig
,
LoRAConfig
,
ModelConfig
,
Vllm
Config
)
from
vllm.config
import
CacheConfig
,
DeviceConfig
,
ModelConfig
,
VllmConfig
from
vllm.config.lora
import
LoRA
Config
from
vllm.lora.request
import
LoRARequest
from
vllm.sampling_params
import
SamplingParams
from
vllm.transformers_utils.tokenizer_group
import
init_tokenizer_from_configs
...
...
@@ -18,7 +18,7 @@ def test_allowed_token_ids_with_lora_vocab(llama_2_7b_base_huggingface_id,
adapters that define additional tokens.
"""
# Setup a base model compatible with the sql_lora_files adapter and
# Set
up a base model compatible with the sql_lora_files adapter and
# a known number of tokens in the base model.
model_config
=
ModelConfig
(
model
=
llama_2_7b_base_huggingface_id
,
...
...
@@ -84,7 +84,7 @@ def test_allowed_token_ids_with_lora_adapter_no_vocab(
adapters that do not define additional tokens.
"""
# Setup a base model compatible with the qwen25vl_lora_files adapter and
# Set
up a base model compatible with the qwen25vl_lora_files adapter and
# a known number of tokens in the base model.
model_config
=
ModelConfig
(
model
=
qwen25vl_base_huggingface_id
,
...
...
tests/lora/test_lora_manager.py
View file @
38d80967
...
...
@@ -8,7 +8,7 @@ import torch
from
safetensors.torch
import
load_file
from
torch
import
nn
from
vllm.config
import
LoRAConfig
from
vllm.config
.lora
import
LoRAConfig
from
vllm.lora.layers
import
(
ColumnParallelLinearWithLoRA
,
MergedColumnParallelLinearWithLoRA
,
RowParallelLinearWithLoRA
)
...
...
tests/lora/test_peft_helper.py
View file @
38d80967
...
...
@@ -7,7 +7,7 @@ import shutil
import
pytest
from
vllm.config
import
LoRAConfig
from
vllm.config
.lora
import
LoRAConfig
from
vllm.lora.peft_helper
import
PEFTHelper
ERROR_CASES
=
[
...
...
tests/lora/test_worker.py
View file @
38d80967
...
...
@@ -6,9 +6,10 @@ import random
import
tempfile
from
unittest.mock
import
patch
from
vllm.config
import
(
CacheConfig
,
DeviceConfig
,
LoadConfig
,
LoRAConfig
,
ModelConfig
,
ParallelConfig
,
SchedulerConfig
,
VllmConfig
)
from
vllm.config
import
(
CacheConfig
,
DeviceConfig
,
ModelConfig
,
ParallelConfig
,
SchedulerConfig
,
VllmConfig
)
from
vllm.config.load
import
LoadConfig
from
vllm.config.lora
import
LoRAConfig
from
vllm.lora.models
import
LoRAMapping
from
vllm.lora.request
import
LoRARequest
from
vllm.v1.worker.gpu_worker
import
Worker
...
...
tests/model_executor/model_loader/test_registry.py
View file @
38d80967
...
...
@@ -4,7 +4,8 @@
import
pytest
from
torch
import
nn
from
vllm.config
import
LoadConfig
,
ModelConfig
from
vllm.config
import
ModelConfig
from
vllm.config.load
import
LoadConfig
from
vllm.model_executor.model_loader
import
(
get_model_loader
,
register_model_loader
)
from
vllm.model_executor.model_loader.base_loader
import
BaseModelLoader
...
...
tests/model_executor/test_enabled_custom_ops.py
View file @
38d80967
...
...
@@ -13,13 +13,15 @@ from vllm.model_executor.layers.fused_moe.fused_moe import (dispatch_topk_func,
vllm_topk_softmax
)
from
vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe
import
(
is_rocm_aiter_moe_enabled
)
from
vllm.model_executor.layers.layernorm
import
(
RMSNorm
,
dispatch_
cuda
_rmsnorm_func
,
fused_add_rms_norm
,
rms_norm
,
rocm_aiter_
fused_add_rms_norm
,
rocm_aiter_
rms_norm
)
from
vllm.model_executor.layers.layernorm
import
(
RMSNorm
,
dispatch_
rocm
_rmsnorm_func
,
fused_add_rms_norm
,
rms_norm
)
from
vllm.model_executor.layers.quantization.utils.fp8_utils
import
(
cutlass_scaled_mm
,
dispatch_w8a8_blockscale_func
,
w8a8_block_fp8_matmul
)
from
vllm.platforms
import
current_platform
RMS_NORM_SUPPORTED_DTYPES
=
[
torch
.
float16
,
torch
.
bfloat16
]
# Registered subclass for test
@
CustomOp
.
register
(
"relu3"
)
...
...
@@ -149,24 +151,27 @@ def test_topk_dispatch(use_rocm_aiter: str, monkeypatch):
@
pytest
.
mark
.
parametrize
(
"add_residual"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
torch
.
float32
,
torch
.
float16
,
torch
.
bfloat16
])
@
pytest
.
mark
.
parametrize
(
"use_rocm_aiter"
,
[
"0"
,
"1"
])
@
pytest
.
mark
.
parametrize
(
"use_rocm_aiter_norm"
,
[
"0"
,
"1"
])
@
pytest
.
mark
.
skipif
(
not
current_platform
.
is_rocm
(),
reason
=
"AITER is a feature exclusive for ROCm"
)
def
test_rms_norm_dispatch
(
add_residual
:
bool
,
use_rocm_aiter
:
str
,
use_rocm_aiter_norm
:
str
,
monkeypatch
):
def
test_rms_norm_dispatch
(
add_residual
:
bool
,
dtype
:
torch
.
dtype
,
use_rocm_aiter
:
str
,
use_rocm_aiter_norm
:
str
,
monkeypatch
):
monkeypatch
.
setenv
(
"VLLM_ROCM_USE_AITER"
,
use_rocm_aiter
)
monkeypatch
.
setenv
(
"VLLM_ROCM_USE_AITER_RMSNORM"
,
use_rocm_aiter_norm
)
rms_norm_func
=
dispatch_cuda_rmsnorm_func
(
add_residual
)
if
not
add_residual
:
if
current_platform
.
is_rocm
()
and
int
(
use_rocm_aiter
)
and
int
(
use_rocm_aiter_norm
):
assert
rms_norm_func
==
rocm_aiter_rms_norm
else
:
assert
rms_norm_func
==
rms_norm
elif
current_platform
.
is_rocm
()
and
int
(
use_rocm_aiter
)
and
int
(
use_rocm_aiter_norm
):
assert
rms_norm_func
==
rocm_aiter_fused_add_rms_norm
else
:
rms_norm_func
=
dispatch_rocm_rmsnorm_func
(
add_residual
,
dtype
)
should_use_rocm_aiter
=
current_platform
.
is_rocm
()
and
int
(
use_rocm_aiter
)
\
and
int
(
use_rocm_aiter_norm
)
and
dtype
in
RMS_NORM_SUPPORTED_DTYPES
if
add_residual
and
should_use_rocm_aiter
:
assert
rms_norm_func
==
torch
.
ops
.
vllm
.
rocm_aiter_rmsnorm2d_fwd_with_add
elif
should_use_rocm_aiter
:
assert
rms_norm_func
==
torch
.
ops
.
vllm
.
rocm_aiter_rms_norm
elif
add_residual
:
assert
rms_norm_func
==
fused_add_rms_norm
else
:
assert
rms_norm_func
==
rms_norm
tests/models/language/generation/test_bart.py
View file @
38d80967
...
...
@@ -178,6 +178,7 @@ def run_test(
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
64
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
@
pytest
.
mark
.
parametrize
(
"decoder_prompt_type"
,
list
(
DecoderPromptType
))
@
pytest
.
mark
.
skip
(
reason
=
"bart not supported in V1"
)
def
test_models
(
hf_runner
,
vllm_runner
,
example_encoder_decoder_prompts
,
model
,
dtype
,
max_tokens
,
num_logprobs
,
decoder_prompt_type
)
->
None
:
...
...
@@ -201,6 +202,7 @@ def test_models(hf_runner, vllm_runner, example_encoder_decoder_prompts, model,
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
64
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
@
pytest
.
mark
.
parametrize
(
"decoder_prompt_type"
,
[
DecoderPromptType
.
CUSTOM
])
@
pytest
.
mark
.
skip
(
reason
=
"bart not supported in V1"
)
def
test_models_distributed
(
hf_runner
,
vllm_runner
,
example_encoder_decoder_prompts
,
distributed_executor_backend
,
model
,
dtype
,
...
...
tests/models/language/generation/test_common.py
View file @
38d80967
...
...
@@ -13,7 +13,7 @@ from ...registry import HF_EXAMPLE_MODELS
from
...utils
import
check_logprobs_close
# These have unsupported head_dim for FA. We do not
#
not
have a clean way to fall back, so we fail with
# have a clean way to fall back, so we fail with
# a clear msg when it happens.
# https://github.com/vllm-project/vllm/issues/14524
REQUIRES_V0
=
[
"microsoft/phi-2"
,
"stabilityai/stablelm-3b-4e1t"
]
...
...
@@ -93,7 +93,7 @@ AITER_MODEL_LIST = [
"allenai/OLMoE-1B-7B-0924-Instruct"
,
marks
=
[
pytest
.
mark
.
cpu_model
],
),
pytest
.
param
(
"swiss-ai/Apertus-8B"
),
# apertus
pytest
.
param
(
"swiss-ai/Apertus-8B
-2509
"
),
# apertus
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
32
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
...
...
tests/models/language/generation/test_hybrid.py
View file @
38d80967
...
...
@@ -25,8 +25,7 @@ SSM_MODELS = [
HYBRID_MODELS
=
[
"ai21labs/Jamba-tiny-dev"
,
# skipping until vLLM implementation issues are resolved
# "pfnet/plamo-2-1b",
"pfnet/plamo-2-1b"
,
"Zyphra/Zamba2-1.2B-instruct"
,
"hmellor/tiny-random-BambaForCausalLM"
,
"ibm-granite/granite-4.0-tiny-preview"
,
...
...
@@ -34,20 +33,10 @@ HYBRID_MODELS = [
"LiquidAI/LFM2-1.2B"
,
]
HF_UNSUPPORTED_MODELS
=
[
# The HF transformers implementation of
# Mamba2 is buggy for Codestral as it doesn't handle n_groups, so the test
# doesn't compare vLLM output with HF output.
# See https://github.com/huggingface/transformers/pull/35943
"yujiepan/mamba2-codestral-v0.1-tiny-random"
,
# transformers 4.55 is still producing garbage for this model
# TODO(tdoublep): follow-up on transformers side
"ibm-granite/granite-4.0-tiny-preview"
]
V1_SUPPORTED_MODELS
=
[
"state-spaces/mamba-130m-hf"
,
"ai21labs/Jamba-tiny-dev"
,
"pfnet/plamo-2-1b"
,
"yujiepan/mamba2-codestral-v0.1-tiny-random"
,
"Zyphra/Zamba2-1.2B-instruct"
,
"hmellor/tiny-random-BambaForCausalLM"
,
...
...
@@ -58,6 +47,7 @@ V1_SUPPORTED_MODELS = [
FULL_CUDA_GRAPH_MODELS
=
[
"ai21labs/Jamba-tiny-dev"
,
"pfnet/plamo-2-1b"
,
"Zyphra/Zamba2-1.2B-instruct"
,
]
...
...
@@ -65,6 +55,11 @@ V0_UNSUPPORTED_MODELS = [
"LiquidAI/LFM2-1.2B"
,
]
FP32_STATE_MODELS
=
[
"state-spaces/mamba-130m-hf"
,
"Zyphra/Zamba2-1.2B-instruct"
,
]
# Avoid OOM
MAX_NUM_SEQS
=
4
...
...
@@ -85,20 +80,13 @@ def test_models(
try
:
model_info
=
HF_EXAMPLE_MODELS
.
find_hf_info
(
model
)
model_info
.
check_available_online
(
on_fail
=
"skip"
)
hf_version_check
=
model_info
.
check_transformers_version
(
on_fail
=
"return"
)
model_info
.
check_transformers_version
(
on_fail
=
"skip"
)
except
ValueError
:
hf_version_check
=
None
if
hf_version_check
is
not
None
:
print
(
f
"Skipping transformers comparison because:
{
hf_version_check
}
"
)
pass
with
hf_runner
(
model
)
as
hf_model
:
if
model
not
in
HF_UNSUPPORTED_MODELS
and
hf_version_check
is
None
:
hf_outputs
=
hf_model
.
generate_greedy_logprobs_limit
(
example_prompts
,
max_tokens
,
num_logprobs
)
else
:
hf_outputs
=
None
hf_outputs
=
hf_model
.
generate_greedy_logprobs_limit
(
example_prompts
,
max_tokens
,
num_logprobs
)
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"0"
)
...
...
@@ -116,7 +104,7 @@ def test_models(
else
:
vllm_v1_outputs
=
None
if
hf_outputs
is
not
None
and
vllm_v0_outputs
is
not
None
:
if
vllm_v0_outputs
is
not
None
:
check_logprobs_close
(
outputs_0_lst
=
hf_outputs
,
outputs_1_lst
=
vllm_v0_outputs
,
...
...
@@ -125,12 +113,10 @@ def test_models(
)
if
model
in
V1_SUPPORTED_MODELS
:
ref_outputs
=
hf_outputs
if
hf_outputs
is
not
None
else
vllm_v0_outputs
assert
ref_outputs
is
not
None
check_logprobs_close
(
outputs_0_lst
=
re
f_outputs
,
outputs_0_lst
=
h
f_outputs
,
outputs_1_lst
=
vllm_v1_outputs
,
name_0
=
"hf"
if
hf_outputs
is
not
None
else
"vllm-v0"
,
name_0
=
"hf"
,
name_1
=
"vllm-v1"
,
)
...
...
@@ -315,7 +301,7 @@ def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks(
finished_requests_ids is larger than the maximum mamba block capacity.
This could generally happen due to the fact that hybrid does support
statelessness mechanism where it can cleanup new incoming requests in
statelessness mechanism where it can clean
up new incoming requests in
a single step.
"""
try
:
...
...
@@ -336,7 +322,7 @@ def test_state_cleanup(
This test is for verifying that the Hybrid state is cleaned up between
steps.
If its not cleaned, an error would be expected.
If it
'
s not cleaned, an error would be expected.
"""
try
:
with
vllm_runner
(
model
,
max_num_seqs
=
MAX_NUM_SEQS
)
as
vllm_model
:
...
...
@@ -397,11 +383,8 @@ def test_full_cuda_graph(
pass
with
hf_runner
(
model
)
as
hf_model
:
if
model
not
in
HF_UNSUPPORTED_MODELS
:
hf_outputs
=
hf_model
.
generate_greedy_logprobs_limit
(
example_prompts
,
max_tokens
,
num_logprobs
)
else
:
hf_outputs
=
None
hf_outputs
=
hf_model
.
generate_greedy_logprobs_limit
(
example_prompts
,
max_tokens
,
num_logprobs
)
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"0"
)
...
...
@@ -416,7 +399,7 @@ def test_full_cuda_graph(
vllm_v1_outputs
=
vllm_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
num_logprobs
)
if
hf_outputs
is
not
None
and
vllm_v0_outputs
is
not
None
:
if
vllm_v0_outputs
is
not
None
:
check_logprobs_close
(
outputs_0_lst
=
hf_outputs
,
outputs_1_lst
=
vllm_v0_outputs
,
...
...
@@ -424,17 +407,15 @@ def test_full_cuda_graph(
name_1
=
"vllm-v0"
,
)
ref_outputs
=
hf_outputs
if
hf_outputs
is
not
None
else
vllm_v0_outputs
assert
ref_outputs
is
not
None
check_logprobs_close
(
outputs_0_lst
=
re
f_outputs
,
outputs_0_lst
=
h
f_outputs
,
outputs_1_lst
=
vllm_v1_outputs
,
name_0
=
"hf"
if
hf_outputs
is
not
None
else
"vllm-v0"
,
name_0
=
"hf"
,
name_1
=
"vllm-v1"
,
)
@
pytest
.
mark
.
parametrize
(
"model"
,
[
"Zyphra/Zamba2-1.2B-instruct"
]
)
@
pytest
.
mark
.
parametrize
(
"model"
,
FP32_STATE_MODELS
)
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
64
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
def
test_fp32_state
(
...
...
@@ -455,11 +436,8 @@ def test_fp32_state(
pass
with
hf_runner
(
model
)
as
hf_model
:
if
model
not
in
HF_UNSUPPORTED_MODELS
:
hf_outputs
=
hf_model
.
generate_greedy_logprobs_limit
(
example_prompts
,
max_tokens
,
num_logprobs
)
else
:
hf_outputs
=
None
hf_outputs
=
hf_model
.
generate_greedy_logprobs_limit
(
example_prompts
,
max_tokens
,
num_logprobs
)
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_V1"
,
"0"
)
...
...
@@ -475,18 +453,16 @@ def test_fp32_state(
vllm_v1_outputs
=
vllm_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
num_logprobs
)
if
hf_outputs
is
not
None
:
check_logprobs_close
(
outputs_0_lst
=
hf_outputs
,
outputs_1_lst
=
vllm_v0_outputs
,
name_0
=
"hf"
,
name_1
=
"vllm-v0"
,
)
check_logprobs_close
(
outputs_0_lst
=
hf_outputs
,
outputs_1_lst
=
vllm_v0_outputs
,
name_0
=
"hf"
,
name_1
=
"vllm-v0"
,
)
ref_outputs
=
hf_outputs
if
hf_outputs
is
not
None
else
vllm_v0_outputs
check_logprobs_close
(
outputs_0_lst
=
re
f_outputs
,
outputs_0_lst
=
h
f_outputs
,
outputs_1_lst
=
vllm_v1_outputs
,
name_0
=
"hf"
if
hf_outputs
is
not
None
else
"vllm-v0"
,
name_0
=
"hf"
,
name_1
=
"vllm-v1"
,
)
tests/models/language/generation/test_mistral.py
View file @
38d80967
...
...
@@ -20,7 +20,7 @@ MISTRAL_FORMAT_MODELS = [
"mistralai/Mistral-7B-Instruct-v0.3"
,
# uses the v3-Tekken tokenizer
"mistralai/Ministral-8B-Instruct-2410"
,
# Mistral-Nemo is to big for CI, but passes locally
# Mistral-Nemo is to
o
big for CI, but passes locally
# "mistralai/Mistral-Nemo-Instruct-2407"
]
...
...
@@ -273,7 +273,7 @@ def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None:
def
test_mistral_function_call_nested_json
():
"""Ensure that the function-name regex captures the entire outer
-
most
"""Ensure that the function-name regex captures the entire outermost
JSON block, including nested braces."""
# Create a minimal stub tokenizer that provides the few attributes the
...
...
tests/models/language/generation_ppl_test/__init__.py
0 → 100644
View file @
38d80967
tests/models/language/generation_ppl_test/ppl_utils.py
0 → 100644
View file @
38d80967
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Adapted from https://huggingface.co/docs/transformers/perplexity
from
typing
import
Optional
,
cast
import
pytest
import
torch
from
datasets
import
load_dataset
from
tests.models.utils
import
(
GenerateModelInfo
,
TokensTextLogprobsPromptLogprobs
)
from
vllm.logprobs
import
Logprob
# See #24485
PPL_TOL
=
0.01
MAX_LENGTH
=
1024
@torch.inference_mode
def wikitext_ppl_test(hf_runner,
                      vllm_runner,
                      model_info: GenerateModelInfo,
                      max_length=MAX_LENGTH,
                      vllm_extra_kwargs=None,
                      atol=PPL_TOL):
    """Compare vLLM perplexity on the wikitext-2 test split with Transformers.

    Args:
        hf_runner: Context-manager factory for the Transformers model.
        vllm_runner: Context-manager factory for the vLLM model.
        model_info: Model under test; supplies the name, dtypes, optional
            hf_overrides, optional expected architecture and an optional
            precomputed Transformers perplexity (hf_ppl).
        max_length: Upper bound on the chunk length in tokens; further
            capped by the model's max_model_len - 1.
        vllm_extra_kwargs: Extra keyword arguments for vllm_runner. The
            mapping is copied, never modified in place.
        atol: Maximum allowed relative excess of the vLLM perplexity over
            the Transformers perplexity.

    Raises:
        AssertionError: If the architecture does not match or the vLLM
            perplexity exceeds the reference by atol or more.
    """

    # A model family has many models with the same architecture,
    # and we don't need to test each one.
    if not model_info.enable_test:
        pytest.skip("Skipping test.")

    dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")

    # Work on a copy so the caller's dict is not polluted with the
    # "dtype" / "hf_overrides" entries added below.
    vllm_extra_kwargs = dict(vllm_extra_kwargs or {})

    # Allow vllm to test using the given dtype, such as float32
    vllm_extra_kwargs["dtype"] = model_info.dtype

    # Allow vllm to test using hf_overrides
    if model_info.hf_overrides is not None:
        vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides

    with vllm_runner(model_info.name,
                     gpu_memory_utilization=0.7,
                     max_model_len=max_length,
                     max_num_seqs=1,
                     enforce_eager=True,
                     **vllm_extra_kwargs) as vllm_model:
        # Use max_num_seqs=1 to avoid OOM,
        # and batch different requests together.

        model_config = vllm_model.llm.llm_engine.model_config

        # Confirm whether vllm is using the correct architecture
        if model_info.architecture:
            assert (model_info.architecture in model_config.architectures)

        max_length = min(model_config.max_model_len - 1, max_length)
        stride = max_length

        tokenizer = vllm_model.llm.get_tokenizer()
        tokens = tokenizer.encode("\n\n".join(dataset["text"]))
        total_tokens = len(tokens)

        # Non-overlapping windows of at most max_length tokens.
        chunks = []
        for begin_loc in range(0, total_tokens, stride):
            end_loc = min(begin_loc + max_length, total_tokens)
            # A one-token tail has no next-token target: it adds nothing to
            # the vLLM sum, and on the Transformers side the loss would be a
            # mean over zero labels (presumably NaN), so skip it.
            if end_loc - begin_loc > 1:
                chunks.append(tokens[begin_loc:end_loc])

        vllm_outputs = vllm_model.generate_greedy_logprobs(
            prompts=chunks,
            max_tokens=1,
            num_logprobs=None,
            num_prompt_logprobs=0,
            use_tqdm=False)

        nll_sum = torch.tensor(0., dtype=torch.float32, device="cpu")
        n_scored = 0
        for output in vllm_outputs:
            output = cast(TokensTextLogprobsPromptLogprobs, output)
            # Index 3 holds the per-position prompt logprobs.
            token_datas = cast(list[Optional[dict[int, Logprob]]], output[3])

            # The first prompt token has no preceding context to score it.
            assert token_datas[0] is None
            token_log_probs = []
            for token_data in token_datas[1:]:
                assert token_data is not None
                # num_prompt_logprobs=0 is expected to leave exactly one
                # entry per position.
                assert len(token_data) == 1
                token_log_prob = list(token_data.values())[0].logprob
                token_log_probs.append(token_log_prob)

            neg_log_likelihood = -torch.tensor(
                token_log_probs, dtype=torch.float32, device="cpu").sum()
            nll_sum += neg_log_likelihood
            n_scored += len(token_log_probs)

        vllm_ppl = float(torch.exp(nll_sum / n_scored))
        vllm_dtype = model_config.dtype

    # Accelerate ppl test by setting Transformers ppl score to a constant
    if model_info.hf_ppl is None:
        with hf_runner(
                model_info.name,
                dtype=model_info.hf_dtype,
        ) as hf_model:
            nll_sum = torch.tensor(0., dtype=torch.float32, device="cpu")
            n_scored = 0
            for chunk in chunks:
                inputs = hf_model.wrap_device(
                    {"input_ids": torch.tensor([chunk])})
                input_ids = inputs["input_ids"]
                hf_output = hf_model.model(input_ids, labels=input_ids)
                neg_log_likelihood = hf_output.loss

                neg_log_likelihood = neg_log_likelihood.to(
                    torch.float32).cpu()

                # The loss is treated as a mean over len(chunk) - 1 targets;
                # scale it back to a sum before accumulating.
                num_loss_tokens = len(chunk) - 1
                nll_sum += neg_log_likelihood * num_loss_tokens
                n_scored += num_loss_tokens

            hf_ppl = float(torch.exp(nll_sum / n_scored))
            hf_dtype = next(hf_model.model.parameters()).dtype
    else:
        hf_ppl = model_info.hf_ppl
        hf_dtype = "Constant"

    differ = (vllm_ppl - hf_ppl) / hf_ppl
    print("Model:", model_info.name)
    print("VLLM:", vllm_dtype, vllm_ppl)
    print("Transformers:", hf_dtype, hf_ppl)
    print("Difference (%):", differ * 100)

    # For perplexity, smaller is better.
    # We are not concerned that the vllm PPL is less than Transformers,
    # so we only perform one-sided testing.
    assert differ < atol
tests/models/language/generation_ppl_test/test_gemma.py
0 → 100644
View file @
38d80967
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
from
tests.models.utils
import
GenerateModelInfo
from
.ppl_utils
import
wikitext_ppl_test
MODELS
=
[
GenerateModelInfo
(
"google/gemma-2b"
),
GenerateModelInfo
(
"google/gemma-2-2b"
),
GenerateModelInfo
(
"google/gemma-3-4b-it"
),
]
@pytest.mark.parametrize("model_info", MODELS)
def test_ppl(hf_runner, vllm_runner, model_info: GenerateModelInfo):
    """Run the wikitext perplexity comparison for one Gemma model entry."""
    wikitext_ppl_test(
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        model_info=model_info,
    )
Prev
1
…
9
10
11
12
13
14
15
16
17
…
28
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment