Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
7a985548
Commit
7a985548
authored
May 22, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.9.0' into v0.9.0-ori
parents
45d3785c
dc1440cf
Changes
486
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
323 additions
and
189 deletions
+323
-189
tests/lora/test_utils.py
tests/lora/test_utils.py
+57
-12
tests/lora/test_worker.py
tests/lora/test_worker.py
+11
-5
tests/model_executor/test_enabled_custom_ops.py
tests/model_executor/test_enabled_custom_ops.py
+33
-22
tests/model_executor/test_guided_processors.py
tests/model_executor/test_guided_processors.py
+9
-6
tests/model_executor/weight_utils.py
tests/model_executor/weight_utils.py
+3
-3
tests/models/embedding/utils.py
tests/models/embedding/utils.py
+0
-66
tests/models/encoder_decoder/vision_language/__init__.py
tests/models/encoder_decoder/vision_language/__init__.py
+0
-0
tests/models/encoder_decoder/vision_language/test_broadcast.py
.../models/encoder_decoder/vision_language/test_broadcast.py
+0
-37
tests/models/language/__init__.py
tests/models/language/__init__.py
+0
-0
tests/models/language/generation/__init__.py
tests/models/language/generation/__init__.py
+0
-0
tests/models/language/generation/test_bart.py
tests/models/language/generation/test_bart.py
+0
-4
tests/models/language/generation/test_common.py
tests/models/language/generation/test_common.py
+36
-8
tests/models/language/generation/test_granite.py
tests/models/language/generation/test_granite.py
+0
-4
tests/models/language/generation/test_granitemoehybrid.py
tests/models/language/generation/test_granitemoehybrid.py
+41
-0
tests/models/language/generation/test_hybrid.py
tests/models/language/generation/test_hybrid.py
+14
-9
tests/models/language/generation/test_mistral.py
tests/models/language/generation/test_mistral.py
+0
-4
tests/models/language/generation/test_phimoe.py
tests/models/language/generation/test_phimoe.py
+0
-4
tests/models/language/pooling/__init__.py
tests/models/language/pooling/__init__.py
+0
-0
tests/models/language/pooling/mteb_utils.py
tests/models/language/pooling/mteb_utils.py
+118
-0
tests/models/language/pooling/test_classification.py
tests/models/language/pooling/test_classification.py
+1
-5
No files found.
Too many changes to show.
To preserve performance only
486 of 486+
files are displayed.
Plain diff
Email patch
tests/lora/test_utils.py
View file @
7a985548
# SPDX-License-Identifier: Apache-2.0
from
collections
import
OrderedDict
from
typing
import
NamedTuple
,
Optional
from
unittest.mock
import
patch
import
pytest
...
...
@@ -9,52 +10,96 @@ from torch import nn
from
vllm.lora.utils
import
(
get_adapter_absolute_path
,
parse_fine_tuned_lora_name
,
replace_submodule
)
from
vllm.model_executor.models.utils
import
WeightsMapper
class LoRANameParserTestConfig(NamedTuple):
    """One case for the ``parse_fine_tuned_lora_name`` tests.

    Field order is significant: the test loop unpacks each case positionally.
    """

    # Raw LoRA tensor name handed to the parser.
    name: str
    # Module name the parser is expected to return.
    module_name: str
    # Expected value of the parser's "is LoRA A" flag.
    is_lora_a: bool
    # Expected value of the parser's "is bias" flag.
    is_bias: bool
    # Optional mapper passed to the parser as its second argument.
    weights_mapper: Optional[WeightsMapper] = None
def
test_parse_fine_tuned_lora_name_valid
():
fixture
=
{
(
"base_model.model.lm_head.lora_A.weight"
,
"lm_head"
,
True
,
False
),
(
"base_model.model.lm_head.lora_B.weight"
,
"lm_head"
,
False
,
False
),
(
fixture
=
[
LoRANameParserTestConfig
(
"base_model.model.lm_head.lora_A.weight"
,
"lm_head"
,
True
,
False
),
LoRANameParserTestConfig
(
"base_model.model.lm_head.lora_B.weight"
,
"lm_head"
,
False
,
False
),
LoRANameParserTestConfig
(
"base_model.model.model.embed_tokens.lora_embedding_A"
,
"model.embed_tokens"
,
True
,
False
,
),
(
LoRANameParserTestConfig
(
"base_model.model.model.embed_tokens.lora_embedding_B"
,
"model.embed_tokens"
,
False
,
False
,
),
(
LoRANameParserTestConfig
(
"base_model.model.model.layers.9.mlp.down_proj.lora_A.weight"
,
"model.layers.9.mlp.down_proj"
,
True
,
False
,
),
(
LoRANameParserTestConfig
(
"base_model.model.model.layers.9.mlp.down_proj.lora_B.weight"
,
"model.layers.9.mlp.down_proj"
,
False
,
False
,
),
(
LoRANameParserTestConfig
(
"language_model.layers.9.mlp.down_proj.lora_A.weight"
,
"language_model.layers.9.mlp.down_proj"
,
True
,
False
,
),
(
LoRANameParserTestConfig
(
"language_model.layers.9.mlp.down_proj.lora_B.weight"
,
"language_model.layers.9.mlp.down_proj"
,
False
,
False
,
),
}
for
name
,
module_name
,
is_lora_a
,
is_bias
in
fixture
:
# Test with WeightsMapper
LoRANameParserTestConfig
(
"base_model.model.model.layers.9.mlp.down_proj.lora_A.weight"
,
"language_model.model.layers.9.mlp.down_proj"
,
True
,
False
,
weights_mapper
=
WeightsMapper
(
orig_to_new_prefix
=
{
"model."
:
"language_model.model."
}),
),
LoRANameParserTestConfig
(
"base_model.model.model.layers.9.mlp.down_proj.lora_B.weight"
,
"language_model.model.layers.9.mlp.down_proj"
,
False
,
False
,
weights_mapper
=
WeightsMapper
(
orig_to_new_prefix
=
{
"model."
:
"language_model.model."
}),
),
LoRANameParserTestConfig
(
"model.layers.9.mlp.down_proj.lora_A.weight"
,
"language_model.model.layers.9.mlp.down_proj"
,
True
,
False
,
weights_mapper
=
WeightsMapper
(
orig_to_new_prefix
=
{
"model."
:
"language_model.model."
}),
),
LoRANameParserTestConfig
(
"model.layers.9.mlp.down_proj.lora_B.weight"
,
"language_model.model.layers.9.mlp.down_proj"
,
False
,
False
,
weights_mapper
=
WeightsMapper
(
orig_to_new_prefix
=
{
"model."
:
"language_model.model."
}),
),
]
for
name
,
module_name
,
is_lora_a
,
is_bias
,
weights_mapper
in
fixture
:
assert
(
module_name
,
is_lora_a
,
is_bias
)
==
parse_fine_tuned_lora_name
(
name
)
is_bias
)
==
parse_fine_tuned_lora_name
(
name
,
weights_mapper
)
def
test_parse_fine_tuned_lora_name_invalid
():
...
...
tests/lora/test_worker.py
View file @
7a985548
...
...
@@ -58,13 +58,19 @@ def test_worker_apply_lora(sql_lora_files):
download_dir
=
None
,
load_format
=
"dummy"
,
),
parallel_config
=
ParallelConfig
(
1
,
1
,
False
),
parallel_config
=
ParallelConfig
(
pipeline_parallel_size
=
1
,
tensor_parallel_size
=
1
,
data_parallel_size
=
1
,
),
scheduler_config
=
SchedulerConfig
(
"generate"
,
32
,
32
,
32
),
device_config
=
DeviceConfig
(
"cuda"
),
cache_config
=
CacheConfig
(
block_size
=
16
,
gpu_memory_utilization
=
1.
,
swap_space
=
0
,
cache_dtype
=
"auto"
),
cache_config
=
CacheConfig
(
block_size
=
16
,
gpu_memory_utilization
=
1.0
,
swap_space
=
0
,
cache_dtype
=
"auto"
,
),
lora_config
=
LoRAConfig
(
max_lora_rank
=
8
,
max_cpu_loras
=
32
,
max_loras
=
32
),
)
...
...
tests/model_executor/test_enabled_custom_ops.py
View file @
7a985548
# SPDX-License-Identifier: Apache-2.0
import
pytest
import
torch
from
vllm.config
import
CompilationConfig
,
VllmConfig
,
set_current_vllm_config
from
vllm.model_executor.custom_op
import
CustomOp
from
vllm.model_executor.layers.activation
import
(
GeluAndMul
,
ReLUSquaredActivation
,
SiluAndMul
)
from
vllm.model_executor.layers.fused_moe.fused_moe
import
(
dispatch_fused_experts_func
,
dispatch_topk_func
,
torch_vllm_inplace_fused_experts
,
torch_vllm_outplace_fused_experts
,
vllm_topk_softmax
)
from
vllm.model_executor.layers.fused_moe.fused_moe
import
(
dispatch_topk_func
,
vllm_topk_softmax
)
from
vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe
import
(
is_rocm_aiter_moe_enabled
)
from
vllm.model_executor.layers.layernorm
import
(
RMSNorm
,
dispatch_cuda_rmsnorm_func
,
fused_add_rms_norm
,
rms_norm
,
rocm_aiter_fused_add_rms_norm
,
rocm_aiter_rms_norm
)
from
vllm.model_executor.layers.quantization.utils.fp8_utils
import
(
cutlass_scaled_mm
,
dispatch_w8a8_blockscale_func
,
w8a8_block_fp8_matmul
)
from
vllm.platforms
import
current_platform
...
...
@@ -98,35 +99,45 @@ def test_enabled_ops_invalid(env: str):
RMSNorm
(
1024
).
enabled
()
@
pytest
.
mark
.
skipif
(
not
current_platform
.
is_rocm
()
or
not
current_platform
.
is_fp8_fnuz
(),
reason
=
"AITER is a feature exclusive for ROCm and FP8_FNUZ"
)
@
pytest
.
mark
.
parametrize
(
"use_cutlass"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"use_rocm_aiter"
,
[
"0"
,
"1"
])
def
test_topk_dispatch
(
use_rocm_aiter
:
str
,
monkeypatch
):
@
pytest
.
mark
.
parametrize
(
"use_rocm_aiter_gemm_w8a8_blockscale"
,
[
"0"
,
"1"
])
def
test_w8a8_blockscale_dispatch
(
use_cutlass
:
bool
,
use_rocm_aiter
:
str
,
use_rocm_aiter_gemm_w8a8_blockscale
:
str
,
monkeypatch
):
monkeypatch
.
setenv
(
"VLLM_ROCM_USE_AITER"
,
use_rocm_aiter
)
topk_func
=
dispatch_topk_func
()
is_rocm_aiter_moe_enabled
.
cache_clear
()
if
current_platform
.
is_rocm
()
and
int
(
use_rocm_aiter
):
from
vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe
import
(
rocm_aiter_topk_softmax
)
assert
topk_func
==
rocm_aiter_topk_softmax
monkeypatch
.
setenv
(
"VLLM_ROCM_USE_AITER_LINEAR"
,
use_rocm_aiter_gemm_w8a8_blockscale
)
use_aiter_and_is_supported
=
(
bool
(
int
(
use_rocm_aiter
))
and
bool
(
int
(
use_rocm_aiter_gemm_w8a8_blockscale
)))
block_scale_func
=
dispatch_w8a8_blockscale_func
(
use_cutlass
,
use_aiter_and_is_supported
=
use_aiter_and_is_supported
)
if
use_cutlass
:
assert
block_scale_func
==
cutlass_scaled_mm
elif
current_platform
.
is_rocm
()
and
int
(
use_rocm_aiter
)
and
int
(
use_rocm_aiter_gemm_w8a8_blockscale
):
assert
block_scale_func
==
(
torch
.
ops
.
vllm
.
rocm_aiter_gemm_w8a8_blockscale
)
else
:
assert
topk_func
==
vllm_topk_softmax
assert
block_scale_func
==
w8a8_block_fp8_matmul
@
pytest
.
mark
.
parametrize
(
"use_rocm_aiter"
,
[
"0"
,
"1"
])
@
pytest
.
mark
.
parametrize
(
"inplace"
,
[
True
,
False
])
def
test_fused_experts_dispatch
(
use_rocm_aiter
:
str
,
inplace
:
bool
,
monkeypatch
):
def
test_topk_dispatch
(
use_rocm_aiter
:
str
,
monkeypatch
):
monkeypatch
.
setenv
(
"VLLM_ROCM_USE_AITER"
,
use_rocm_aiter
)
topk_func
=
dispatch_topk_func
()
is_rocm_aiter_moe_enabled
.
cache_clear
()
fused_experts_func
=
dispatch_fused_experts_func
(
inplace
)
if
current_platform
.
is_rocm
()
and
int
(
use_rocm_aiter
):
from
vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe
import
(
rocm_aiter_fused_experts
)
assert
fused_experts_func
==
rocm_aiter_fused_experts
elif
inplace
:
assert
fused_experts_func
==
torch_vllm_inplace_fused_experts
rocm_aiter_topk_softmax
)
assert
topk_func
==
rocm_aiter_topk_softmax
else
:
assert
fused_experts_func
==
torch_vllm_outplace_fused_experts
assert
topk_func
==
vllm_topk_softmax
@
pytest
.
mark
.
parametrize
(
"add_residual"
,
[
True
,
False
])
...
...
tests/model_executor/test_guided_processors.py
View file @
7a985548
...
...
@@ -202,12 +202,15 @@ def test_multiple_guided_options_not_allowed(sample_json_schema, sample_regex):
def
test_guided_decoding_backend_options
():
"""Test backend-specific options"""
params
=
GuidedDecodingParams
(
backend
=
"xgrammar:option-1,option-2,option-3"
)
assert
params
.
backend_options
()
==
[
"option-1"
,
"option-2"
,
"option-3"
]
no_fallback
=
GuidedDecodingParams
(
backend
=
"xgrammar:option-1,no-fallback"
)
assert
no_fallback
.
no_fallback
()
with
pytest
.
warns
(
DeprecationWarning
):
guided_decoding_params
=
GuidedDecodingParams
(
backend
=
"xgrammar:no-fallback,disable-any-whitespace,no-additional-properties"
)
assert
guided_decoding_params
.
backend
==
"xgrammar"
assert
guided_decoding_params
.
disable_fallback
assert
guided_decoding_params
.
disable_any_whitespace
assert
guided_decoding_params
.
disable_additional_properties
def
test_pickle_xgrammar_tokenizer_data
():
...
...
tests/model_executor/weight_utils.py
View file @
7a985548
...
...
@@ -20,11 +20,11 @@ def test_hf_transfer_auto_activation():
try
:
# enable hf hub transfer if available
import
hf_transfer
# type: ignore # noqa
HF_TRANFER_ACTIVE
=
True
HF_TRAN
S
FER_ACTIVE
=
True
except
ImportError
:
HF_TRANFER_ACTIVE
=
False
HF_TRAN
S
FER_ACTIVE
=
False
assert
(
huggingface_hub
.
constants
.
HF_HUB_ENABLE_HF_TRANSFER
==
HF_TRANFER_ACTIVE
)
HF_TRAN
S
FER_ACTIVE
)
def
test_download_weights_from_hf
():
...
...
tests/models/embedding/utils.py
deleted
100644 → 0
View file @
45d3785c
# SPDX-License-Identifier: Apache-2.0
from
collections.abc
import
Sequence
from
typing
import
NamedTuple
,
Optional
import
torch
import
torch.nn.functional
as
F
def check_embeddings_close(
    *,
    embeddings_0_lst: Sequence[list[float]],
    embeddings_1_lst: Sequence[list[float]],
    name_0: str,
    name_1: str,
    tol: float = 1e-3,
) -> None:
    """Assert that two lists of embeddings agree pairwise.

    Each pair must have the same length and a cosine similarity of at
    least ``1 - tol``.

    Args:
        embeddings_0_lst: First list of embeddings, one per prompt.
        embeddings_1_lst: Second list of embeddings, same length as the first.
        name_0: Label for the first list in failure messages.
        name_1: Label for the second list in failure messages.
        tol: Largest allowed shortfall of the cosine similarity from 1.

    Raises:
        AssertionError: If the lists differ in length, a pair differs in
            length, or a pair is not similar enough.
    """
    assert len(embeddings_0_lst) == len(embeddings_1_lst)

    paired = zip(embeddings_0_lst, embeddings_1_lst)
    for prompt_idx, (embeddings_0, embeddings_1) in enumerate(paired):
        assert len(embeddings_0) == len(embeddings_1), (
            f"Length mismatch: {len(embeddings_0)} vs. {len(embeddings_1)}")

        vec_0 = torch.tensor(embeddings_0)
        vec_1 = torch.tensor(embeddings_1)
        sim = F.cosine_similarity(vec_0, vec_1, dim=0)

        # Only the first 16 values of each embedding are reported.
        header = f"Test{prompt_idx}:"
        line_0 = f"\n{name_0}:\t{embeddings_0[:16]!r}"
        line_1 = f"\n{name_1}:\t{embeddings_1[:16]!r}"
        fail_msg = header + line_0 + line_1

        assert sim >= 1 - tol, fail_msg
def matryoshka_fy(tensor, dimensions):
    """Truncate embeddings to ``dimensions`` entries and L2-normalize them.

    The input is converted with ``torch.tensor``, cut along its last axis,
    then normalized along ``dim=1``.
    """
    truncated = torch.tensor(tensor)[..., :dimensions]
    return F.normalize(truncated, p=2, dim=1)
class EmbedModelInfo(NamedTuple):
    """Description of one embedding model used by the tests."""

    # Model name.
    name: str
    # Whether the model is flagged as a Matryoshka model.
    is_matryoshka: bool
    # Dimension list for Matryoshka models; None when not given.
    matryoshka_dimensions: Optional[list[int]] = None
    # Architecture name; empty string when not given.
    architecture: str = ""
    # Enable flag for the model's test; on by default.
    enable_test: bool = True
def correctness_test(hf_model,
                     inputs,
                     vllm_outputs: Sequence[list[float]],
                     dimensions: Optional[int] = None):
    """Compare vLLM embeddings with those produced by ``hf_model.encode``.

    Args:
        hf_model: Reference model; only its ``encode`` method is used.
        inputs: Passed to ``hf_model.encode`` unchanged.
        vllm_outputs: Embeddings to check against the reference.
        dimensions: When truthy, the reference embeddings are first passed
            through ``matryoshka_fy`` with this value.

    Raises:
        AssertionError: Propagated from ``check_embeddings_close``, which
            is called with a tolerance of 1e-2.
    """
    reference = hf_model.encode(inputs)
    if dimensions:
        reference = matryoshka_fy(reference, dimensions)

    comparison = dict(
        embeddings_0_lst=reference,
        embeddings_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
        tol=1e-2,
    )
    check_embeddings_close(**comparison)
tests/models/encoder_decoder/vision_language/__init__.py
deleted
100644 → 0
View file @
45d3785c
tests/models/encoder_decoder/vision_language/test_broadcast.py
deleted
100644 → 0
View file @
45d3785c
# SPDX-License-Identifier: Apache-2.0
import
pytest
from
....utils
import
multi_gpu_test
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
@pytest.mark.parametrize("model", [
    "meta-llama/Llama-3.2-11B-Vision-Instruct",
])
def test_models(hf_runner, vllm_runner, image_assets,
                distributed_executor_backend, model) -> None:
    """Run the mllama ``run_test`` with tensor parallel size 2.

    Parametrized over the "ray" and "mp" executor backends. Any model whose
    name does not start with the supported Llama 3.2 Vision name raises
    ``NotImplementedError``.
    """
    # Reject unsupported models before importing the model-specific helpers.
    if not model.startswith("meta-llama/Llama-3.2-11B-Vision-Instruct"):
        raise NotImplementedError(f"Unsupported model: {model}")

    from .test_mllama import models, run_test

    run_test(
        hf_runner,
        vllm_runner,
        image_assets,
        model=models[0],
        size_factors=[0.25, 0.5, 1.0],
        dtype="half",
        max_tokens=5,
        num_logprobs=5,
        tensor_parallel_size=2,
        distributed_executor_backend=distributed_executor_backend,
    )
tests/models/
decoder_only
/__init__.py
→
tests/models/
language
/__init__.py
View file @
7a985548
File moved
tests/models/
decoder_only/audio_language
/__init__.py
→
tests/models/
language/generation
/__init__.py
View file @
7a985548
File moved
tests/models/
encoder_decoder/language
/test_bart.py
→
tests/models/
language/generation
/test_bart.py
View file @
7a985548
# SPDX-License-Identifier: Apache-2.0
"""Compare the outputs of HF and vLLM for BART models using greedy sampling.
Run `pytest tests/models/encoder_decoder/language/test_bart.py`.
"""
from
typing
import
Optional
import
pytest
...
...
tests/models/
decoder_only/language/test_models
.py
→
tests/models/
language/generation/test_common
.py
View file @
7a985548
# SPDX-License-Identifier: Apache-2.0
"""Compare the outputs of HF and vLLM when using greedy sampling.
Run `pytest tests/models/test_models.py`.
"""
import
os
from
typing
import
Optional
import
pytest
import
torch
...
...
@@ -29,7 +27,8 @@ AITER_MODEL_LIST = [
"openbmb/MiniCPM3-4B"
,
"Qwen/Qwen-7B-Chat"
,
"Qwen/Qwen2.5-0.5B-Instruct"
,
"ehristoforu/Falcon3-MoE-2x7B-Insruct"
,
"TitanML/tiny-mixtral"
,
"Qwen/Qwen3-8B"
,
]
...
...
@@ -80,12 +79,14 @@ AITER_MODEL_LIST = [
"Qwen/Qwen2.5-0.5B-Instruct"
,
# qwen2
marks
=
[
pytest
.
mark
.
core_model
],
),
pytest
.
param
(
"Qwen/Qwen3-8B"
,
# qwen (text-only)
),
pytest
.
param
(
"stabilityai/stablelm-3b-4e1t"
),
# stablelm
pytest
.
param
(
"bigcode/starcoder2-3b"
),
# starcoder2
pytest
.
param
(
"ehristoforu/Falcon3-MoE-2x7B-Insruct"
,
# mixtral
marks
=
[
pytest
.
mark
.
cpu_model
,
large_gpu_mark
(
min_gb
=
48
)],
"TitanML/tiny-mixtral"
,
# mixtral
marks
=
[
pytest
.
mark
.
cpu_model
],
)
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
32
])
...
...
@@ -112,19 +113,38 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str,
# in parts of the operators
pytest
.
skip
(
f
"Skipping '
{
model
}
' model test with AITER kernel."
)
use_prompt_embeds
=
os
.
getenv
(
"VLLM_USE_V1"
)
==
"0"
with
hf_runner
(
model
)
as
hf_model
:
hf_outputs
=
hf_model
.
generate_greedy_logprobs_limit
(
example_prompts
,
max_tokens
,
num_logprobs
)
prompt_embeds
:
Optional
[
list
[
torch
.
Tensor
]]
=
([]
if
use_prompt_embeds
else
None
)
prompt_token_ids
=
[]
for
prompt
in
example_prompts
:
token_ids
=
hf_model
.
tokenizer
(
prompt
,
return_tensors
=
"pt"
).
input_ids
.
to
(
hf_model
.
model
.
device
)
prompt_token_ids
.
append
(
token_ids
)
if
prompt_embeds
is
not
None
:
prompt_embeds
.
append
(
hf_model
.
model
.
get_input_embeddings
()(
token_ids
).
squeeze
(
0
))
with
vllm_runner
(
model
,
tokenizer_name
=
model_info
.
tokenizer
or
model
,
tokenizer_mode
=
model_info
.
tokenizer_mode
,
trust_remote_code
=
model_info
.
trust_remote_code
,
max_num_seqs
=
2
,
enable_prompt_embeds
=
use_prompt_embeds
,
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy_logprobs
(
example_prompts
,
max_tokens
,
num_logprobs
)
if
prompt_embeds
is
not
None
:
vllm_outputs_from_embeds
=
vllm_model
.
generate_greedy_logprobs
(
prompt_embeds
,
max_tokens
,
num_logprobs
)
check_logprobs_close
(
outputs_0_lst
=
hf_outputs
,
...
...
@@ -132,6 +152,14 @@ def test_models(hf_runner, vllm_runner, example_prompts, model: str,
name_0
=
"hf"
,
name_1
=
"vllm"
,
)
if
prompt_embeds
is
not
None
:
check_logprobs_close
(
outputs_0_lst
=
vllm_outputs
,
outputs_1_lst
=
vllm_outputs_from_embeds
,
name_0
=
"vllm"
,
name_1
=
"vllm_from_embeds"
,
)
if
use_rocm_aiter
:
# this is to ensure that vllm engine
# has deallocated the memory before running the next
...
...
tests/models/
decoder_only/language
/test_granite.py
→
tests/models/
language/generation
/test_granite.py
View file @
7a985548
# SPDX-License-Identifier: Apache-2.0
"""Compare the outputs of HF and vLLM for Granite models using greedy sampling.
Run `pytest tests/models/test_granite.py`.
"""
import
pytest
from
...utils
import
check_logprobs_close
...
...
tests/models/language/generation/test_granitemoehybrid.py
0 → 100644
View file @
7a985548
# SPDX-License-Identifier: Apache-2.0
import
pytest
from
...utils
import
check_logprobs_close
# Path of the checkpoints
MODELS
=
[
"ibm-granite/granite-4.0-tiny-preview"
,
]
@pytest.mark.skip(
    reason="Granite 4.0 is not yet available in huggingface transformers")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["float16", "bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("num_logprobs", [5])
def test_model_equivalence_to_hf_greedy(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
    num_logprobs: int,
):
    """Check that vLLM greedy logprobs stay close to the HF reference.

    The vLLM run happens first and its runner is closed before the HF
    runner is opened.
    """
    # Both runners receive the same positional generation arguments.
    generation_args = (example_prompts, max_tokens, num_logprobs)

    with vllm_runner(model, dtype=dtype) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy_logprobs(*generation_args)

    with hf_runner(model, dtype=dtype) as hf_model:
        hf_outputs = hf_model.generate_greedy_logprobs_limit(*generation_args)

    check_logprobs_close(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )
tests/models/
decoder_only/language
/test_hybrid.py
→
tests/models/
language/generation
/test_hybrid.py
View file @
7a985548
...
...
@@ -23,12 +23,15 @@ SSM_MODELS = [
HYBRID_MODELS
=
[
"ai21labs/Jamba-tiny-dev"
,
# NOTE: ibm-granite/granite-4.0-tiny-preview is currently skipped because
# it is not yet available in huggingface transformers
# "ibm-granite/granite-4.0-tiny-preview",
# NOTE: Running Plamo2 with the transformers implementation requires installing
# the causal-conv1d package, which is not listed as a test dependency as it's
# not compatible with pip-compile.
"pfnet/plamo-2-1b"
,
"Zyphra/Zamba2-1.2B-instruct"
,
"
ibm-ai-platform/Bamba-9B
"
,
"
hmellor/bamba-tiny-random
"
,
]
# Avoid OOM
...
...
@@ -289,23 +292,25 @@ def test_multistep_correctness(
@
multi_gpu_test
(
num_gpus
=
2
)
@
pytest
.
mark
.
parametrize
(
"model"
,
[
SSM_MODELS
[
0
],
HYBRID_MODELS
[
0
]])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
64
])
def
test_hybrid_distributed_produces_identical_generation
(
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
def
test_distributed_correctness
(
vllm_runner
,
example_prompts
,
model
:
str
,
max_tokens
:
int
,
num_logprobs
:
int
,
)
->
None
:
with
vllm_runner
(
model
,
tensor_parallel_size
=
2
,
with
vllm_runner
(
model
,
tensor_parallel_size
=
1
,
max_num_seqs
=
2
)
as
vllm_model
:
vllm_outputs_tp_
2
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_token
s
)
vllm_outputs_tp_
1
=
vllm_model
.
generate_greedy
_logprobs
(
example_prompts
,
max_tokens
,
num_logprob
s
)
with
vllm_runner
(
model
,
tensor_parallel_size
=
1
,
with
vllm_runner
(
model
,
tensor_parallel_size
=
2
,
max_num_seqs
=
2
)
as
vllm_model
:
vllm_outputs_tp_
1
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_token
s
)
vllm_outputs_tp_
2
=
vllm_model
.
generate_greedy
_logprobs
(
example_prompts
,
max_tokens
,
num_logprob
s
)
check_
outputs_equal
(
check_
logprobs_close
(
outputs_0_lst
=
vllm_outputs_tp_1
,
outputs_1_lst
=
vllm_outputs_tp_2
,
name_0
=
"vllm_tp_1"
,
...
...
tests/models/
decoder_only/language
/test_mistral.py
→
tests/models/
language/generation
/test_mistral.py
View file @
7a985548
# SPDX-License-Identifier: Apache-2.0
"""Compare the outputs of HF and vLLM for Mistral models using greedy sampling.
Run `pytest tests/models/test_mistral.py`.
"""
import
copy
import
json
...
...
tests/models/
decoder_only/language
/test_phimoe.py
→
tests/models/
language/generation
/test_phimoe.py
View file @
7a985548
# SPDX-License-Identifier: Apache-2.0
"""Compare the outputs of HF and vLLM for moe models using greedy sampling.
Run `pytest tests/models/test_phimoe.py`.
"""
import
pytest
import
torch
...
...
tests/models/
decoder_only/
language/__init__.py
→
tests/models/language
/pooling
/__init__.py
View file @
7a985548
File moved
tests/models/language/pooling/mteb_utils.py
0 → 100644
View file @
7a985548
# SPDX-License-Identifier: Apache-2.0
import
math
from
collections.abc
import
Sequence
import
mteb
import
numpy
as
np
import
pytest
from
tests.models.utils
import
EmbedModelInfo
from
vllm.model_executor.model_loader.utils
import
set_default_torch_dtype
# Most models on the STS12 task (See #17175):
# - Model implementation and minor changes in tensor dtype
# results in differences less than 1e-4
# - Different model results in differences more than 1e-3
# 1e-4 is a good tolerance threshold
MTEB_EMBED_TASKS
=
[
"STS12"
]
MTEB_EMBED_TOL
=
1e-4
class VllmMtebEncoder(mteb.Encoder):
    """MTEB encoder adapter that delegates to a vLLM model's ``encode``."""

    def __init__(self, vllm_model):
        super().__init__()
        self.model = vllm_model
        # Seeded generator, so the shuffle order is the same on every run.
        self.rng = np.random.default_rng(seed=42)

    def encode(
        self,
        sentences: Sequence[str],
        *args,
        **kwargs,
    ) -> np.ndarray:
        """Embed ``sentences`` and return the rows in the caller's order.

        Extra positional and keyword arguments are accepted but unused.
        """
        # The sentences are submitted in a shuffled order, hoping to
        # surface potential scheduling issues.
        shuffle_idx = self.rng.permutation(len(sentences))
        shuffled = [sentences[pos] for pos in shuffle_idx]
        raw_outputs = self.model.encode(shuffled, use_tqdm=False)
        # argsort of a permutation is its inverse, which undoes the shuffle.
        restore_idx = np.argsort(shuffle_idx)
        return np.array(raw_outputs)[restore_idx]
class OpenAIClientMtebEncoder(mteb.Encoder):
    """MTEB encoder adapter that obtains embeddings via ``client.embeddings``."""

    def __init__(self, model_name: str, client):
        super().__init__()
        self.model_name = model_name
        self.client = client
        # Seeded generator, so the shuffle order is the same on every run.
        self.rng = np.random.default_rng(seed=42)

    def encode(self, sentences: Sequence[str], *args, **kwargs) -> np.ndarray:
        """Embed ``sentences`` and return the rows in the caller's order.

        Extra positional and keyword arguments are accepted but unused.
        """
        # The sentences are submitted in a shuffled order, hoping to
        # surface potential scheduling issues.
        shuffle_idx = self.rng.permutation(len(sentences))
        shuffled = [sentences[pos] for pos in shuffle_idx]

        response = self.client.embeddings.create(model=self.model_name,
                                                 input=shuffled)
        vectors = np.array([item.embedding for item in response.data])
        # argsort of a permutation is its inverse, which undoes the shuffle.
        return vectors[np.argsort(shuffle_idx)]
def run_mteb_embed_task(encoder, tasks):
    """Evaluate ``encoder`` on the named MTEB tasks and return one score.

    The returned value is the ``"main_score"`` of the first ``"test"``
    entry of the first result.
    """
    selected_tasks = mteb.get_tasks(tasks=tasks)
    runner = mteb.MTEB(tasks=selected_tasks)
    results = runner.run(encoder, verbosity=0, output_folder=None)

    first_test_entry = results[0].scores["test"][0]
    return first_test_entry["main_score"]
def run_mteb_embed_task_st(model_name, tasks):
    """Load ``model_name`` as a SentenceTransformer and score it on ``tasks``."""
    # Imported here rather than at module level, as in the original.
    from sentence_transformers import SentenceTransformer

    return run_mteb_embed_task(SentenceTransformer(model_name), tasks)
def mteb_test_embed_models(hf_runner,
                           vllm_runner,
                           model_info: EmbedModelInfo,
                           vllm_extra_kwargs=None):
    """Check that vLLM and SentenceTransformers reach the same MTEB score.

    Args:
        hf_runner: Context-manager factory for the reference model; called
            with ``is_sentence_transformer=True``.
        vllm_runner: Context-manager factory for the vLLM model; called
            with ``task="embed"``.
        model_info: Model under test. Its ``enable_test``, ``name``,
            ``dtype`` and ``architecture`` attributes are read.
        vllm_extra_kwargs: Optional extra keyword arguments forwarded to
            ``vllm_runner``.

    Raises:
        AssertionError: If ``model_info.architecture`` is set but not among
            the architectures vLLM reports, or if the two main scores differ
            by more than ``MTEB_EMBED_TOL`` (relative tolerance).
    """
    if not model_info.enable_test:
        # A model family has many models with the same architecture,
        # and we don't need to test each one.
        pytest.skip("Skipping test.")

    vllm_extra_kwargs = vllm_extra_kwargs or {}

    with vllm_runner(model_info.name,
                     task="embed",
                     max_model_len=None,
                     dtype=model_info.dtype,
                     **vllm_extra_kwargs) as vllm_model:
        model_config = vllm_model.model.llm_engine.model_config

        if model_info.architecture:
            assert model_info.architecture in model_config.architectures

        vllm_main_score = run_mteb_embed_task(VllmMtebEncoder(vllm_model),
                                              MTEB_EMBED_TASKS)
        vllm_dtype = model_config.dtype
        # Use the checkpoint's declared dtype for the reference model,
        # falling back to vLLM's dtype when the HF config has none.
        model_dtype = getattr(model_config.hf_config, "torch_dtype",
                              vllm_dtype)

    # The two context managers are separated by a comma so that BOTH are
    # entered. The previous `A and B` form evaluated to `B` alone, so the
    # default-dtype context was created but never entered.
    # NOTE(review): assumes model_dtype is a value set_default_torch_dtype
    # accepts - confirm for configs whose torch_dtype is None or a string.
    with set_default_torch_dtype(model_dtype), hf_runner(
            model_info.name,
            is_sentence_transformer=True,
            dtype=model_dtype) as hf_model:
        st_main_score = run_mteb_embed_task(hf_model, MTEB_EMBED_TASKS)

    print("VLLM:", vllm_dtype, vllm_main_score)
    print("SentenceTransformer:", model_dtype, st_main_score)
    print("Difference:", st_main_score - vllm_main_score)

    assert math.isclose(st_main_score, vllm_main_score, rel_tol=MTEB_EMBED_TOL)
tests/models/
embedding/language/test_cls_models
.py
→
tests/models/
language/pooling/test_classification
.py
View file @
7a985548
# SPDX-License-Identifier: Apache-2.0
"""Compare the classification outputs of HF and vLLM models.
Run `pytest tests/models/test_cls_models.py`.
"""
import
pytest
import
torch
from
transformers
import
AutoModelForSequenceClassification
...
...
@@ -19,7 +15,7 @@ from vllm.platforms import current_platform
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
]
if
current_platform
.
is_rocm
()
else
[
"float"
])
def
test_
classification_
models
(
def
test_models
(
hf_runner
,
vllm_runner
,
example_prompts
,
...
...
Prev
1
…
15
16
17
18
19
20
21
22
23
…
25
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment