Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
afb4429b
Unverified
Commit
afb4429b
authored
May 01, 2025
by
Cyrus Leung
Committed by
GitHub
Apr 30, 2025
Browse files
[CI/Build] Reorganize models tests (#17459)
Signed-off-by:
DarkLight1337
<
tlleungac@connect.ust.hk
>
parent
aa4502e7
Changes
65
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
72 additions
and
6 deletions
+72
-6
tests/models/quantization/test_modelopt.py
tests/models/quantization/test_modelopt.py
+0
-1
tests/models/quantization/test_nvfp4.py
tests/models/quantization/test_nvfp4.py
+0
-1
tests/models/utils.py
tests/models/utils.py
+65
-1
vllm/config.py
vllm/config.py
+3
-1
vllm/model_executor/models/llama.py
vllm/model_executor/models/llama.py
+4
-2
No files found.
tests/models/
decoder_only/language
/test_modelopt.py
→
tests/models/
quantization
/test_modelopt.py
View file @
afb4429b
...
...
@@ -40,7 +40,6 @@ EXPECTED_STRS_MAP = {
@
pytest
.
mark
.
skip
(
reason
=
"Prevent unstable test based on golden strings from breaking the build."
)
@
pytest
.
mark
.
quant_model
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"fp8"
),
reason
=
"fp8 is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"model_name"
,
MODELS
)
...
...
tests/models/
decoder_only/language
/test_nvfp4.py
→
tests/models/
quantization
/test_nvfp4.py
View file @
afb4429b
...
...
@@ -41,7 +41,6 @@ EXPECTED_STRS_MAP = {
reason
=
"Prevent unstable test based on golden strings from breaking the build "
" and test input model being too large and hanging the system."
)
@
pytest
.
mark
.
quant_model
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"nvfp4"
),
reason
=
"nvfp4 is not supported on this GPU type."
)
@
pytest
.
mark
.
parametrize
(
"model_name"
,
MODELS
)
...
...
tests/models/utils.py
View file @
afb4429b
...
...
@@ -2,9 +2,10 @@
import
warnings
from
collections.abc
import
Sequence
from
typing
import
Any
,
Optional
,
Union
from
typing
import
TYPE_CHECKING
,
Any
,
NamedTuple
,
Optional
,
Union
import
torch
import
torch.nn.functional
as
F
from
vllm.config
import
ModelConfig
,
TaskOption
from
vllm.inputs
import
InputContext
...
...
@@ -12,6 +13,9 @@ from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs
from
.registry
import
HF_EXAMPLE_MODELS
if
TYPE_CHECKING
:
from
..conftest
import
HfRunner
TokensText
=
tuple
[
list
[
int
],
str
]
...
...
@@ -291,3 +295,63 @@ def build_model_context(
**
model_config_kwargs
,
)
return
InputContext
(
model_config
)
def check_embeddings_close(
    *,
    embeddings_0_lst: Sequence[list[float]],
    embeddings_1_lst: Sequence[list[float]],
    name_0: str,
    name_1: str,
    tol: float = 1e-3,
) -> None:
    """Assert that two batches of embeddings agree pairwise.

    The two batches are walked in lockstep; for every pair the cosine
    similarity of the two vectors must be at least ``1 - tol``.

    Args:
        embeddings_0_lst: First batch, one embedding vector per prompt.
        embeddings_1_lst: Second batch, same length as the first.
        name_0: Label used for the first batch in the failure message.
        name_1: Label used for the second batch in the failure message.
        tol: Largest tolerated shortfall of the cosine similarity from 1.

    Raises:
        AssertionError: If the batches differ in length, if a pair of
            vectors differs in length, or if a pair's cosine similarity is
            below ``1 - tol``.
    """
    assert len(embeddings_0_lst) == len(embeddings_1_lst), (
        f"Batch size mismatch: {len(embeddings_0_lst)} vs. "
        f"{len(embeddings_1_lst)}")

    for prompt_idx, (embeddings_0, embeddings_1) in enumerate(
            zip(embeddings_0_lst, embeddings_1_lst)):
        assert len(embeddings_0) == len(embeddings_1), (
            f"Length mismatch: {len(embeddings_0)} vs. {len(embeddings_1)}")

        # ``as_tensor`` converts plain lists exactly like ``torch.tensor`` but
        # passes existing tensors through untouched, so rows that are already
        # tensors do not trigger PyTorch's copy-construct UserWarning.
        sim = F.cosine_similarity(torch.as_tensor(embeddings_0),
                                  torch.as_tensor(embeddings_1),
                                  dim=0)

        # The message lives inside the assert so that it is only formatted
        # when the check fails. Only the first 16 values of each vector are
        # shown to keep the output readable.
        assert sim >= 1 - tol, (f"Test{prompt_idx}:"
                                f"\n{name_0}:\t{embeddings_0[:16]!r}"
                                f"\n{name_1}:\t{embeddings_1[:16]!r}")
def matryoshka_fy(tensor: torch.Tensor, dimensions: int) -> torch.Tensor:
    """Truncate embeddings to their leading components and re-normalize.

    Args:
        tensor: Embeddings whose last axis is the embedding dimension.
            Anything ``torch.as_tensor`` accepts works (tensor, nested
            list, ...), with any number of leading axes.
        dimensions: Number of leading components to keep on the last axis.

    Returns:
        A new tensor holding the first ``dimensions`` components of every
        embedding, L2-normalized along the last axis. The input is not
        modified.
    """
    # ``as_tensor`` avoids the copy (and the copy-construct UserWarning) that
    # ``torch.tensor`` produces when the input already is a tensor.
    tensor = torch.as_tensor(tensor)
    tensor = tensor[..., :dimensions]
    # Normalize along the axis that was just truncated. For the usual
    # (batch, dim) input this is the same as ``dim=1``; it also handles a
    # single 1-D embedding, which ``dim=1`` rejects.
    # ``F.normalize`` returns a fresh tensor, so the caller's data is untouched.
    return F.normalize(tensor, p=2, dim=-1)
class EmbedModelInfo(NamedTuple):
    """Immutable record describing one embedding model entry.

    Only ``name`` and ``is_matryoshka`` are required; the remaining fields
    have defaults.
    """

    # Model identifier (presumably the name used to load the model;
    # confirm against callers).
    name: str
    # Flag marking the entry as a Matryoshka-style model.
    is_matryoshka: bool
    # Optional list of dimensions tied to ``is_matryoshka``; defaults to None.
    matryoshka_dimensions: Optional[list[int]] = None
    # Architecture label; empty string when not given.
    architecture: str = ""
    # NOTE(review): looks like a switch to include/exclude the entry from
    # test runs; confirm against the tests that consume it.
    enable_test: bool = True
def run_embedding_correctness_test(
    hf_model: "HfRunner",
    inputs: list[str],
    vllm_outputs: Sequence[list[float]],
    dimensions: Optional[int] = None,
):
    """Check vLLM embeddings against the ones ``hf_model`` produces.

    Args:
        hf_model: Runner whose ``encode(inputs)`` result serves as the
            reference.
        inputs: Prompts passed to ``hf_model.encode``.
        vllm_outputs: Embeddings to validate, one per prompt.
        dimensions: When truthy, the reference embeddings are passed through
            ``matryoshka_fy`` with this value before comparison.

    Raises:
        AssertionError: Propagated from ``check_embeddings_close`` when the
            two sets of embeddings do not agree within a tolerance of 1e-2.
    """
    encoded = hf_model.encode(inputs)
    # A falsy ``dimensions`` (None or 0) leaves the reference untouched.
    reference = matryoshka_fy(encoded, dimensions) if dimensions else encoded

    check_embeddings_close(
        embeddings_0_lst=reference,
        embeddings_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
        tol=1e-2,
    )
vllm/config.py
View file @
afb4429b
...
...
@@ -1043,8 +1043,10 @@ class ModelConfig:
if
self
.
is_attention_free
:
return
0
if
hasattr
(
self
.
hf_text_config
,
"head_dim"
):
# NOTE: Some configs may set head_dim=None in the config
if
getattr
(
self
.
hf_text_config
,
"head_dim"
,
None
)
is
not
None
:
return
self
.
hf_text_config
.
head_dim
# FIXME(woosuk): This may not be true for all models.
return
(
self
.
hf_text_config
.
hidden_size
//
self
.
hf_text_config
.
num_attention_heads
)
...
...
vllm/model_executor/models/llama.py
View file @
afb4429b
...
...
@@ -127,8 +127,10 @@ class LlamaAttention(nn.Module):
assert
tp_size
%
self
.
total_num_kv_heads
==
0
self
.
num_kv_heads
=
max
(
1
,
self
.
total_num_kv_heads
//
tp_size
)
# MistralConfig has an optional head_dim introduced by Mistral-Nemo
self
.
head_dim
=
getattr
(
config
,
"head_dim"
,
self
.
hidden_size
//
self
.
total_num_heads
)
head_dim
=
getattr
(
config
,
"head_dim"
,
None
)
if
head_dim
is
None
:
head_dim
=
self
.
hidden_size
//
self
.
total_num_heads
self
.
head_dim
=
head_dim
# Phi models introduced a partial_rotary_factor parameter in the config
self
.
partial_rotary_factor
=
getattr
(
config
,
"partial_rotary_factor"
,
1
)
...
...
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment