Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
bd98842c
Unverified
Commit
bd98842c
authored
Sep 10, 2025
by
wang.yuqi
Committed by
GitHub
Sep 10, 2025
Browse files
[CI] Add PPL test for generation models (#24485)
Signed-off-by:
wang.yuqi
<
noooop@126.com
>
parent
d6069887
Changes
9
Hide whitespace changes
Inline
Side-by-side
Showing
9 changed files
with
211 additions
and
7 deletions
+211
-7
.buildkite/test-pipeline.yaml
.buildkite/test-pipeline.yaml
+10
-0
tests/models/language/generation_ppl_test/__init__.py
tests/models/language/generation_ppl_test/__init__.py
+0
-0
tests/models/language/generation_ppl_test/ppl_utils.py
tests/models/language/generation_ppl_test/ppl_utils.py
+131
-0
tests/models/language/generation_ppl_test/test_gemma.py
tests/models/language/generation_ppl_test/test_gemma.py
+18
-0
tests/models/language/generation_ppl_test/test_gpt.py
tests/models/language/generation_ppl_test/test_gpt.py
+14
-0
tests/models/language/generation_ppl_test/test_qwen.py
tests/models/language/generation_ppl_test/test_qwen.py
+21
-0
tests/models/language/pooling/embed_utils.py
tests/models/language/pooling/embed_utils.py
+1
-1
tests/models/language/pooling/mteb_utils.py
tests/models/language/pooling/mteb_utils.py
+7
-4
tests/models/utils.py
tests/models/utils.py
+9
-2
No files found.
.buildkite/test-pipeline.yaml
View file @
bd98842c
...
@@ -604,6 +604,16 @@ steps:
...
@@ -604,6 +604,16 @@ steps:
-
pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
-
pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
-
pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
-
pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
-
label
:
Language Models Test (PPL)
timeout_in_minutes
:
110
mirror_hardwares
:
[
amdexperimental
]
optional
:
true
source_file_dependencies
:
-
vllm/
-
tests/models/language/generation_ppl_test
commands
:
-
pytest -v -s models/language/generation_ppl_test
-
label
:
Language Models Test (Extended Pooling)
# 36min
-
label
:
Language Models Test (Extended Pooling)
# 36min
timeout_in_minutes
:
50
timeout_in_minutes
:
50
mirror_hardwares
:
[
amdexperimental
]
mirror_hardwares
:
[
amdexperimental
]
...
...
tests/models/language/generation_ppl_test/__init__.py
0 → 100644
View file @
bd98842c
tests/models/language/generation_ppl_test/ppl_utils.py
0 → 100644
View file @
bd98842c
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Adapted from https://huggingface.co/docs/transformers/perplexity
from
typing
import
Optional
,
cast
import
pytest
import
torch
from
datasets
import
load_dataset
from
tests.models.utils
import
(
GenerateModelInfo
,
TokensTextLogprobsPromptLogprobs
)
from
vllm.logprobs
import
Logprob
# See #24485
# Default tolerance used by wikitext_ppl_test (its `atol` parameter). It is
# compared against the relative difference (vllm_ppl - hf_ppl) / hf_ppl,
# so 0.01 means vLLM's perplexity may exceed the reference by up to 1%.
PPL_TOL = 0.01
# Default `max_length` of wikitext_ppl_test: upper bound on the evaluation
# chunk length, also passed to the vLLM runner as `max_model_len`.
MAX_LENGTH = 1024
@torch.inference_mode
def wikitext_ppl_test(hf_runner,
                      vllm_runner,
                      model_info: GenerateModelInfo,
                      max_length=MAX_LENGTH,
                      vllm_extra_kwargs=None,
                      atol=PPL_TOL):
    """Compare vLLM's WikiText-2 perplexity against a Transformers reference.

    The ``wikitext-2-raw-v1`` test split is tokenized with the vLLM
    tokenizer and cut into non-overlapping chunks. vLLM scores every chunk
    through prompt logprobs; the reference perplexity is either recomputed
    with Transformers on the same chunks or taken from ``model_info.hf_ppl``.

    Args:
        hf_runner: Context-manager factory yielding the Transformers model.
        vllm_runner: Context-manager factory yielding the vLLM model.
        model_info: Model under test (name, dtypes, overrides, optional
            precomputed ``hf_ppl``).
        max_length: Upper bound on the chunk length; also passed to vLLM as
            ``max_model_len``.
        vllm_extra_kwargs: Extra keyword arguments forwarded to
            ``vllm_runner``. The mapping is copied, never modified.
        atol: Maximum allowed value of ``(vllm_ppl - hf_ppl) / hf_ppl``.
            Despite the name, this is a relative, one-sided tolerance.

    Raises:
        AssertionError: If the architecture check fails, the prompt
            logprobs have an unexpected layout, nothing can be scored, or
            the vLLM perplexity exceeds the reference by ``atol`` or more.
    """
    # A model family has many models with the same architecture,
    # and we don't need to test each one.
    if not model_info.enable_test:
        pytest.skip("Skipping test.")

    dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")

    # Work on a copy so the caller's dict is never mutated.
    vllm_extra_kwargs = dict(vllm_extra_kwargs or {})

    # Allow vllm to test using the given dtype, such as float32
    vllm_extra_kwargs["dtype"] = model_info.dtype

    # Allow vllm to test using hf_overrides
    if model_info.hf_overrides is not None:
        vllm_extra_kwargs["hf_overrides"] = model_info.hf_overrides

    # max_num_seqs=1 processes one chunk at a time to avoid OOM.
    with vllm_runner(model_info.name,
                     gpu_memory_utilization=0.7,
                     max_model_len=max_length,
                     max_num_seqs=1,
                     enforce_eager=True,
                     **vllm_extra_kwargs) as vllm_model:
        model_config = vllm_model.llm.llm_engine.model_config

        # Confirm whether vllm is using the correct architecture
        if model_info.architecture:
            assert model_info.architecture in model_config.architectures

        # Leave room for the single token generated per request.
        max_length = min(model_config.max_model_len - 1, max_length)

        tokenizer = vllm_model.llm.get_tokenizer()
        tokens = tokenizer.encode("\n\n".join(dataset["text"]))

        # Stride equals the window size, so chunks do not overlap.
        chunks = [
            tokens[begin_loc:begin_loc + max_length]
            for begin_loc in range(0, len(tokens), max_length)
        ]
        # A one-token chunk has nothing to predict: vLLM would contribute
        # zero logprobs and the Transformers loss would be NaN, which would
        # poison the running sum. Drop such chunks for both backends.
        chunks = [chunk for chunk in chunks if len(chunk) >= 2]
        assert chunks, "No tokens available for perplexity evaluation."

        vllm_outputs = vllm_model.generate_greedy_logprobs(
            prompts=chunks,
            max_tokens=1,
            num_logprobs=None,
            num_prompt_logprobs=0,
            use_tqdm=False)

        vllm_nll_sum = torch.tensor(0., dtype=torch.float32, device="cpu")
        vllm_n_tokens = 0
        for output in vllm_outputs:
            output = cast(TokensTextLogprobsPromptLogprobs, output)
            token_datas = cast(list[Optional[dict[int, Logprob]]], output[3])

            # The first prompt token has no preceding context to score it.
            assert token_datas[0] is None
            token_log_probs = []
            for token_data in token_datas[1:]:
                assert token_data is not None
                # num_prompt_logprobs=0: only the actual token is reported.
                assert len(token_data) == 1
                token_log_probs.append(
                    next(iter(token_data.values())).logprob)

            vllm_nll_sum += -torch.tensor(
                token_log_probs, dtype=torch.float32, device="cpu").sum()
            vllm_n_tokens += len(token_log_probs)

        vllm_ppl = float(torch.exp(vllm_nll_sum / vllm_n_tokens))
        vllm_dtype = model_config.dtype

    # Accelerate ppl test by setting Transformers ppl score to a constant
    if model_info.hf_ppl is None:
        with hf_runner(
                model_info.name,
                dtype=model_info.hf_dtype,
        ) as hf_model:
            hf_nll_sum = torch.tensor(0., dtype=torch.float32, device="cpu")
            hf_n_tokens = 0
            for chunk in chunks:
                inputs = hf_model.wrap_device(
                    {"input_ids": torch.tensor([chunk])})
                input_ids = inputs["input_ids"]
                hf_outputs = hf_model.model(input_ids, labels=input_ids)
                mean_nll = hf_outputs.loss.to(torch.float32).cpu()

                # The loss is averaged over the len(chunk) - 1 predicted
                # tokens; scale it back to a sum before accumulating.
                num_loss_tokens = len(chunk) - 1
                hf_nll_sum += mean_nll * num_loss_tokens
                hf_n_tokens += num_loss_tokens

            hf_ppl = float(torch.exp(hf_nll_sum / hf_n_tokens))
            hf_dtype = next(hf_model.model.parameters()).dtype
    else:
        hf_ppl = model_info.hf_ppl
        hf_dtype = "Constant"

    differ = (vllm_ppl - hf_ppl) / hf_ppl
    print("Model:", model_info.name)
    print("VLLM:", vllm_dtype, vllm_ppl)
    print("Transformers:", hf_dtype, hf_ppl)
    print("Difference (%):", differ * 100)

    # PPL the smaller, the better
    # We are not concerned that the vllm PPL is less than Transformers,
    # so we only perform one-sided testing.
    assert differ < atol
tests/models/language/generation_ppl_test/test_gemma.py
0 → 100644
View file @
bd98842c
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
from
tests.models.utils
import
GenerateModelInfo
from
.ppl_utils
import
wikitext_ppl_test
# Gemma checkpoints covered by the perplexity test in this module.
MODELS = [
    GenerateModelInfo(name="google/gemma-2b"),
    GenerateModelInfo(name="google/gemma-2-2b"),
    GenerateModelInfo(name="google/gemma-3-4b-it"),
]
@pytest.mark.parametrize("model_info", MODELS)
def test_ppl(hf_runner, vllm_runner, model_info: GenerateModelInfo):
    """Run the WikiText perplexity comparison for one Gemma checkpoint."""
    wikitext_ppl_test(
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        model_info=model_info,
    )
tests/models/language/generation_ppl_test/test_gpt.py
0 → 100644
View file @
bd98842c
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
from
tests.models.utils
import
GenerateModelInfo
from
.ppl_utils
import
wikitext_ppl_test
# GPT-2 checkpoint covered by the perplexity test in this module.
MODELS = [
    GenerateModelInfo(name="openai-community/gpt2-large"),
]
@pytest.mark.parametrize("model_info", MODELS)
def test_ppl(hf_runner, vllm_runner, model_info: GenerateModelInfo):
    """Run the WikiText perplexity comparison for one GPT-2 checkpoint."""
    wikitext_ppl_test(
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        model_info=model_info,
    )
tests/models/language/generation_ppl_test/test_qwen.py
0 → 100644
View file @
bd98842c
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
from
tests.models.utils
import
GenerateModelInfo
from
.ppl_utils
import
wikitext_ppl_test
# Qwen3 checkpoints covered by the perplexity test in this module.
MODELS = [
    GenerateModelInfo(name="Qwen/Qwen3-0.6B"),
    GenerateModelInfo(name="Qwen/Qwen3-0.6B-FP8"),
    # Not enabled: "Qwen/Qwen3-0.6B-GPTQ-Int8".
    # transformers: loading a GPTQ quantized model requires
    # optimum, gptqmodel.
]
@pytest.mark.parametrize("model_info", MODELS)
def test_ppl(hf_runner, vllm_runner, model_info: GenerateModelInfo):
    """Run the WikiText perplexity comparison for one Qwen3 checkpoint."""
    wikitext_ppl_test(
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        model_info=model_info,
    )
tests/models/language/pooling/embed_utils.py
View file @
bd98842c
...
@@ -59,7 +59,7 @@ def correctness_test_embed_models(hf_runner,
...
@@ -59,7 +59,7 @@ def correctness_test_embed_models(hf_runner,
with
hf_runner
(
with
hf_runner
(
model_info
.
name
,
model_info
.
name
,
dtype
=
"float32"
,
dtype
=
model_info
.
hf_dtype
,
is_sentence_transformer
=
True
,
is_sentence_transformer
=
True
,
)
as
hf_model
:
)
as
hf_model
:
...
...
tests/models/language/pooling/mteb_utils.py
View file @
bd98842c
...
@@ -213,7 +213,7 @@ def mteb_test_embed_models(hf_runner,
...
@@ -213,7 +213,7 @@ def mteb_test_embed_models(hf_runner,
if
model_info
.
mteb_score
is
None
:
if
model_info
.
mteb_score
is
None
:
with
hf_runner
(
model_info
.
name
,
with
hf_runner
(
model_info
.
name
,
is_sentence_transformer
=
True
,
is_sentence_transformer
=
True
,
dtype
=
"float32"
)
as
hf_model
:
dtype
=
model_info
.
hf_dtype
)
as
hf_model
:
# e.g. setting default parameters for the encode method of hf_runner
# e.g. setting default parameters for the encode method of hf_runner
if
hf_model_callback
is
not
None
:
if
hf_model_callback
is
not
None
:
...
@@ -278,9 +278,12 @@ def run_mteb_rerank(cross_encoder, tasks, languages):
...
@@ -278,9 +278,12 @@ def run_mteb_rerank(cross_encoder, tasks, languages):
return
main_score
return
main_score
def
mteb_test_rerank_models_hf
(
hf_runner
,
model_name
,
hf_model_callback
=
None
):
def
mteb_test_rerank_models_hf
(
hf_runner
,
model_name
,
hf_dtype
=
"float32"
,
hf_model_callback
=
None
):
with
hf_runner
(
model_name
,
is_cross_encoder
=
True
,
with
hf_runner
(
model_name
,
is_cross_encoder
=
True
,
dtype
=
"float32"
)
as
hf_model
:
dtype
=
hf_dtype
)
as
hf_model
:
original_predict
=
hf_model
.
predict
original_predict
=
hf_model
.
predict
...
@@ -357,7 +360,7 @@ def mteb_test_rerank_models(hf_runner,
...
@@ -357,7 +360,7 @@ def mteb_test_rerank_models(hf_runner,
# SentenceTransformers mteb score to a constant
# SentenceTransformers mteb score to a constant
if
model_info
.
mteb_score
is
None
:
if
model_info
.
mteb_score
is
None
:
st_main_score
,
st_dtype
=
mteb_test_rerank_models_hf
(
st_main_score
,
st_dtype
=
mteb_test_rerank_models_hf
(
hf_runner
,
model_info
.
name
,
hf_model_callback
)
hf_runner
,
model_info
.
name
,
model_info
.
hf_dtype
,
hf_model_callback
)
else
:
else
:
st_main_score
=
model_info
.
mteb_score
st_main_score
=
model_info
.
mteb_score
st_dtype
=
"Constant"
st_dtype
=
"Constant"
...
...
tests/models/utils.py
View file @
bd98842c
...
@@ -347,14 +347,15 @@ class ModelInfo:
...
@@ -347,14 +347,15 @@ class ModelInfo:
name
:
str
name
:
str
architecture
:
str
=
""
architecture
:
str
=
""
dtype
:
str
=
"auto"
dtype
:
str
=
"auto"
hf_dtype
:
str
=
"float32"
hf_overrides
:
Optional
[
dict
[
str
,
Any
]]
=
None
hf_overrides
:
Optional
[
dict
[
str
,
Any
]]
=
None
default_pooling_type
:
str
=
""
default_pooling_type
:
str
=
""
mteb_score
:
Optional
[
float
]
=
None
enable_test
:
bool
=
True
enable_test
:
bool
=
True
@
dataclass
@
dataclass
class
EmbedModelInfo
(
ModelInfo
):
class
EmbedModelInfo
(
ModelInfo
):
mteb_score
:
Optional
[
float
]
=
None
is_matryoshka
:
bool
=
False
is_matryoshka
:
bool
=
False
matryoshka_dimensions
:
Optional
[
list
[
int
]]
=
None
matryoshka_dimensions
:
Optional
[
list
[
int
]]
=
None
...
@@ -371,7 +372,7 @@ class LASTPoolingEmbedModelInfo(EmbedModelInfo):
...
@@ -371,7 +372,7 @@ class LASTPoolingEmbedModelInfo(EmbedModelInfo):
@
dataclass
@
dataclass
class
RerankModelInfo
(
ModelInfo
):
class
RerankModelInfo
(
ModelInfo
):
pass
mteb_score
:
Optional
[
float
]
=
None
@
dataclass
@
dataclass
...
@@ -384,6 +385,12 @@ class LASTPoolingRerankModelInfo(RerankModelInfo):
...
@@ -384,6 +385,12 @@ class LASTPoolingRerankModelInfo(RerankModelInfo):
default_pooling_type
:
str
=
"LAST"
default_pooling_type
:
str
=
"LAST"
@dataclass
class GenerateModelInfo(ModelInfo):
    """Model description used by the generation perplexity (PPL) tests."""

    # dtype handed to hf_runner when the Transformers reference is computed.
    # Overrides the ModelInfo default ("float32") with "auto".
    hf_dtype: str = "auto"
    # Precomputed Transformers perplexity. When set, wikitext_ppl_test uses
    # it as the reference instead of running the Transformers model.
    hf_ppl: Optional[float] = None
def
dummy_hf_overrides
(
def
dummy_hf_overrides
(
hf_config
:
PretrainedConfig
,
hf_config
:
PretrainedConfig
,
*
,
*
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment