"docs/vscode:/vscode.git/clone" did not exist on "65b1cbb1381bf2301a2441fd988bbe88b4b7865e"
Unverified Commit afb4429b authored by Cyrus Leung's avatar Cyrus Leung Committed by GitHub
Browse files

[CI/Build] Reorganize models tests (#17459)


Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
parent aa4502e7
...@@ -40,7 +40,6 @@ EXPECTED_STRS_MAP = { ...@@ -40,7 +40,6 @@ EXPECTED_STRS_MAP = {
@pytest.mark.skip( @pytest.mark.skip(
reason= reason=
"Prevent unstable test based on golden strings from breaking the build.") "Prevent unstable test based on golden strings from breaking the build.")
@pytest.mark.quant_model
@pytest.mark.skipif(not is_quant_method_supported("fp8"), @pytest.mark.skipif(not is_quant_method_supported("fp8"),
reason="fp8 is not supported on this GPU type.") reason="fp8 is not supported on this GPU type.")
@pytest.mark.parametrize("model_name", MODELS) @pytest.mark.parametrize("model_name", MODELS)
......
...@@ -41,7 +41,6 @@ EXPECTED_STRS_MAP = { ...@@ -41,7 +41,6 @@ EXPECTED_STRS_MAP = {
reason= reason=
"Prevent unstable test based on golden strings from breaking the build " "Prevent unstable test based on golden strings from breaking the build "
" and test input model being too large and hanging the system.") " and test input model being too large and hanging the system.")
@pytest.mark.quant_model
@pytest.mark.skipif(not is_quant_method_supported("nvfp4"), @pytest.mark.skipif(not is_quant_method_supported("nvfp4"),
reason="nvfp4 is not supported on this GPU type.") reason="nvfp4 is not supported on this GPU type.")
@pytest.mark.parametrize("model_name", MODELS) @pytest.mark.parametrize("model_name", MODELS)
......
...@@ -2,9 +2,10 @@ ...@@ -2,9 +2,10 @@
import warnings import warnings
from collections.abc import Sequence from collections.abc import Sequence
from typing import Any, Optional, Union from typing import TYPE_CHECKING, Any, NamedTuple, Optional, Union
import torch import torch
import torch.nn.functional as F
from vllm.config import ModelConfig, TaskOption from vllm.config import ModelConfig, TaskOption
from vllm.inputs import InputContext from vllm.inputs import InputContext
...@@ -12,6 +13,9 @@ from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs ...@@ -12,6 +13,9 @@ from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs
from .registry import HF_EXAMPLE_MODELS from .registry import HF_EXAMPLE_MODELS
if TYPE_CHECKING:
from ..conftest import HfRunner
TokensText = tuple[list[int], str] TokensText = tuple[list[int], str]
...@@ -291,3 +295,63 @@ def build_model_context( ...@@ -291,3 +295,63 @@ def build_model_context(
**model_config_kwargs, **model_config_kwargs,
) )
return InputContext(model_config) return InputContext(model_config)
def check_embeddings_close(
    *,
    embeddings_0_lst: Sequence[list[float]],
    embeddings_1_lst: Sequence[list[float]],
    name_0: str,
    name_1: str,
    tol: float = 1e-3,
) -> None:
    """Assert that two lists of embeddings match pairwise by cosine similarity.

    Args:
        embeddings_0_lst: Embeddings from the first source, one per prompt.
        embeddings_1_lst: Embeddings from the second source, in the same
            order as ``embeddings_0_lst``.
        name_0: Label for the first source, used in the failure message.
        name_1: Label for the second source, used in the failure message.
        tol: Maximum allowed shortfall of the cosine similarity from 1.

    Raises:
        AssertionError: If the two lists differ in length, if a pair of
            embeddings differs in length, or if the cosine similarity of
            a pair is below ``1 - tol``.
    """
    assert len(embeddings_0_lst) == len(embeddings_1_lst), (
        f"Number of embeddings differs: {len(embeddings_0_lst)} vs. "
        f"{len(embeddings_1_lst)}")

    for prompt_idx, (embeddings_0, embeddings_1) in enumerate(
            zip(embeddings_0_lst, embeddings_1_lst)):
        assert len(embeddings_0) == len(embeddings_1), (
            f"Length mismatch: {len(embeddings_0)} vs. {len(embeddings_1)}")

        # as_tensor accepts both plain lists and existing tensors; unlike
        # torch.tensor it does not copy (or warn) when given a tensor.
        sim = F.cosine_similarity(torch.as_tensor(embeddings_0),
                                  torch.as_tensor(embeddings_1),
                                  dim=0)

        # Only the first 16 components are shown to keep the report short.
        fail_msg = (f"Test{prompt_idx}:"
                    f"\n{name_0}:\t{embeddings_0[:16]!r}"
                    f"\n{name_1}:\t{embeddings_1[:16]!r}")

        assert sim >= 1 - tol, fail_msg
def matryoshka_fy(tensor: torch.Tensor, dimensions: int) -> torch.Tensor:
    """Truncate embeddings to their leading components and L2-normalize them.

    Args:
        tensor: Embeddings as a tensor, or any value ``torch.as_tensor``
            accepts (e.g. nested lists of floats). The embedding
            components are taken to lie along the last axis.
        dimensions: Number of leading components to keep along the last
            axis.

    Returns:
        A tensor whose last axis has at most ``dimensions`` entries, with
        unit L2 norm along that axis.
    """
    # as_tensor avoids the copy (and warning) torch.tensor makes when the
    # input is already a tensor.
    tensor = torch.as_tensor(tensor)
    truncated = tensor[..., :dimensions]
    # Normalize along the axis that was sliced; for the usual
    # (batch, dim) input this is the same as dim=1.
    return F.normalize(truncated, p=2, dim=-1)
class EmbedModelInfo(NamedTuple):
    """Immutable record describing an embedding model used by the tests."""

    # Model identifier (presumably a Hugging Face model name; confirm
    # against the callers).
    name: str
    # Whether the model is flagged as a Matryoshka embedding model.
    is_matryoshka: bool
    # Optional list of embedding sizes associated with the model;
    # None when not specified.
    matryoshka_dimensions: Optional[list[int]] = None
    # Architecture name; empty string when not specified.
    architecture: str = ""
    # Flag callers can use to turn the test for this model on or off.
    enable_test: bool = True
def run_embedding_correctness_test(
    hf_model: "HfRunner",
    inputs: list[str],
    vllm_outputs: Sequence[list[float]],
    dimensions: Optional[int] = None,
) -> None:
    """Compare vLLM embeddings against those produced by ``hf_model``.

    Args:
        hf_model: Reference runner; its ``encode`` method is called on
            ``inputs`` to obtain the reference embeddings.
        inputs: Prompts to encode with the reference model.
        vllm_outputs: Embeddings to validate, one per prompt, in the same
            order as ``inputs``.
        dimensions: When truthy, the reference embeddings are passed
            through ``matryoshka_fy`` with this value before comparison.
            ``None`` (and 0) leave them unchanged.

    Raises:
        AssertionError: Propagated from ``check_embeddings_close`` when
            the two sets of embeddings do not match within ``tol=1e-2``.
    """
    hf_outputs = hf_model.encode(inputs)
    if dimensions:
        # Bring the reference output to the requested embedding size.
        hf_outputs = matryoshka_fy(hf_outputs, dimensions)

    check_embeddings_close(
        embeddings_0_lst=hf_outputs,
        embeddings_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
        tol=1e-2,
    )
...@@ -1043,8 +1043,10 @@ class ModelConfig: ...@@ -1043,8 +1043,10 @@ class ModelConfig:
if self.is_attention_free: if self.is_attention_free:
return 0 return 0
if hasattr(self.hf_text_config, "head_dim"): # NOTE: Some configs may set head_dim=None in the config
if getattr(self.hf_text_config, "head_dim", None) is not None:
return self.hf_text_config.head_dim return self.hf_text_config.head_dim
# FIXME(woosuk): This may not be true for all models. # FIXME(woosuk): This may not be true for all models.
return (self.hf_text_config.hidden_size // return (self.hf_text_config.hidden_size //
self.hf_text_config.num_attention_heads) self.hf_text_config.num_attention_heads)
......
...@@ -127,8 +127,10 @@ class LlamaAttention(nn.Module): ...@@ -127,8 +127,10 @@ class LlamaAttention(nn.Module):
assert tp_size % self.total_num_kv_heads == 0 assert tp_size % self.total_num_kv_heads == 0
self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
# MistralConfig has an optional head_dim introduced by Mistral-Nemo # MistralConfig has an optional head_dim introduced by Mistral-Nemo
self.head_dim = getattr(config, "head_dim", head_dim = getattr(config, "head_dim", None)
self.hidden_size // self.total_num_heads) if head_dim is None:
head_dim = self.hidden_size // self.total_num_heads
self.head_dim = head_dim
# Phi models introduced a partial_rotary_factor parameter in the config # Phi models introduced a partial_rotary_factor parameter in the config
self.partial_rotary_factor = getattr(config, "partial_rotary_factor", self.partial_rotary_factor = getattr(config, "partial_rotary_factor",
1) 1)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment