[CI/Build] Reorganize models tests (#17459)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>

[CI/Build] Reorganize models tests (#17459)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
afb4429b · Cyrus Leung · GitHub · aa4502e7 · afb4429b · afb4429b
Unverified Commit afb4429b authored May 01, 2025 by Cyrus Leung Committed by GitHub Apr 30, 2025
5 changed files
--- a/tests/models/decoder_only/language/test_modelopt.py
+++ b/tests/models/decoder_only/language/test_modelopt.py
@@ -40,7 +40,6 @@ EXPECTED_STRS_MAP = {
 @pytest.mark.skip(
    reason=
    "Prevent unstable test based on golden strings from breaking the build.")
-@pytest.mark.quant_model
 @pytest.mark.skipif(not is_quant_method_supported("fp8"),
                    reason="fp8 is not supported on this GPU type.")
 @pytest.mark.parametrize("model_name", MODELS)

--- a/tests/models/decoder_only/language/test_nvfp4.py
+++ b/tests/models/decoder_only/language/test_nvfp4.py
@@ -41,7 +41,6 @@ EXPECTED_STRS_MAP = {
    reason=
    "Prevent unstable test based on golden strings from breaking the build "
    " and test input model being too large and hanging the system.")
-@pytest.mark.quant_model
 @pytest.mark.skipif(not is_quant_method_supported("nvfp4"),
                    reason="nvfp4 is not supported on this GPU type.")
 @pytest.mark.parametrize("model_name", MODELS)

--- a/tests/models/utils.py
+++ b/tests/models/utils.py
@@ -2,9 +2,10 @@
 import warnings
 from collections.abc import Sequence
-from typing import Any, Optional, Union
+from typing import TYPE_CHECKING, Any, NamedTuple, Optional, Union
 import torch
+import torch.nn.functional as F
 from vllm.config import ModelConfig, TaskOption
 from vllm.inputs import InputContext
@@ -12,6 +13,9 @@ from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs
 from .registry import HF_EXAMPLE_MODELS
+if TYPE_CHECKING:
+    from ..conftest import HfRunner
 TokensText = tuple[list[int], str]
@@ -291,3 +295,63 @@ def build_model_context(
        **model_config_kwargs,
    )
    return InputContext(model_config)
+def check_embeddings_close(
+    *,
+    embeddings_0_lst: Sequence[list[float]],
+    embeddings_1_lst: Sequence[list[float]],
+    name_0: str,
+    name_1: str,
+    tol: float = 1e-3,
+) -> None:
+    """Assert that two batches of embeddings agree pairwise.
+
+    All arguments are keyword-only.
+
+    Args:
+        embeddings_0_lst: First batch of embeddings, one entry per prompt.
+        embeddings_1_lst: Second batch of embeddings, compared
+            position-by-position against ``embeddings_0_lst``.
+        name_0: Label used for the first batch in the failure message.
+        name_1: Label used for the second batch in the failure message.
+        tol: Allowed shortfall from a cosine similarity of 1.
+
+    Raises:
+        AssertionError: If the batches differ in length, if a pair of
+            embeddings differs in length, or if the cosine similarity of
+            a pair is below ``1 - tol``.
+    """
+    # Both batches must contain the same number of embeddings.
+    assert len(embeddings_0_lst) == len(embeddings_1_lst)
+    for prompt_idx, (embeddings_0, embeddings_1) in enumerate(
+            zip(embeddings_0_lst, embeddings_1_lst)):
+        assert len(embeddings_0) == len(embeddings_1), (
+            f"Length mismatch: {len(embeddings_0)} vs. {len(embeddings_1)}")
+        # Cosine similarity is taken along dim 0 of the two tensors built
+        # from this pair of embeddings.
+        sim = F.cosine_similarity(torch.tensor(embeddings_0),
+                                  torch.tensor(embeddings_1),
+                                  dim=0)
+        # Only the first 16 entries of each embedding are shown on failure.
+        fail_msg = (f"Test{prompt_idx}:"
+                    f"\n{name_0}:\t{embeddings_0[:16]!r}"
+                    f"\n{name_1}:\t{embeddings_1[:16]!r}")
+        assert sim >= 1 - tol, fail_msg
+def matryoshka_fy(tensor: torch.Tensor, dimensions: int):
+    """Truncate embeddings to ``dimensions`` entries and L2-normalize them.
+
+    Args:
+        tensor: Input embeddings; passed through ``torch.tensor`` first.
+        dimensions: Number of leading entries to keep along the last axis.
+
+    Returns:
+        The truncated tensor, normalized with the L2 norm along dim 1.
+    """
+    # NOTE(review): the annotation says torch.Tensor, but the value is
+    # re-wrapped with torch.tensor(); the actual input type depends on the
+    # caller (it receives the result of hf_model.encode) -- confirm.
+    tensor = torch.tensor(tensor)
+    # Keep only the first `dimensions` entries of the last axis.
+    tensor = tensor[..., :dimensions]
+    # Normalizing along dim=1 requires the input to have at least two axes.
+    tensor = F.normalize(tensor, p=2, dim=1)
+    return tensor
+class EmbedModelInfo(NamedTuple):
+    """Immutable record describing an embedding model for the tests.
+
+    The field meanings below are inferred from their names; the consumers
+    of this record are not visible here -- TODO confirm against callers.
+    """
+    # presumably the model identifier -- verify against callers
+    name: str
+    # presumably whether the model supports Matryoshka embeddings
+    is_matryoshka: bool
+    # presumably the supported truncated sizes; defaults to None
+    matryoshka_dimensions: Optional[list[int]] = None
+    # defaults to an empty string
+    architecture: str = ""
+    # defaults to True; presumably lets a model be excluded from test runs
+    enable_test: bool = True
+def run_embedding_correctness_test(
+    hf_model: "HfRunner",
+    inputs: list[str],
+    vllm_outputs: Sequence[list[float]],
+    dimensions: Optional[int] = None,
+):
+    """Compare embeddings from ``hf_model`` against ``vllm_outputs``.
+
+    Args:
+        hf_model: Runner whose ``encode`` method produces the reference
+            embeddings for ``inputs``.
+        inputs: Prompts passed to ``hf_model.encode``.
+        vllm_outputs: Embeddings to check against the reference.
+        dimensions: If truthy, the reference embeddings are passed through
+            ``matryoshka_fy`` with this size before the comparison.
+
+    Raises:
+        AssertionError: Propagated from ``check_embeddings_close`` when the
+            two sets of embeddings do not match within ``tol=1e-2``.
+    """
+    hf_outputs = hf_model.encode(inputs)
+    # A truthiness check: both None and 0 skip the truncation step.
+    if dimensions:
+        hf_outputs = matryoshka_fy(hf_outputs, dimensions)
+    check_embeddings_close(
+        embeddings_0_lst=hf_outputs,
+        embeddings_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+        tol=1e-2,
+    )
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1043,8 +1043,10 @@ class ModelConfig:
        if self.is_attention_free:
            return 0
-        if hasattr(self.hf_text_config, "head_dim"):
+        # NOTE: Some configs may set head_dim=None in the config
+        if getattr(self.hf_text_config, "head_dim", None) is not None:
            return self.hf_text_config.head_dim
        # FIXME(woosuk): This may not be true for all models.
        return (self.hf_text_config.hidden_size //
                self.hf_text_config.num_attention_heads)

--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -127,8 +127,10 @@ class LlamaAttention(nn.Module):
            assert tp_size % self.total_num_kv_heads == 0
        self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
        # MistralConfig has an optional head_dim introduced by Mistral-Nemo
-        self.head_dim = getattr(config, "head_dim",
+        head_dim = getattr(config, "head_dim", None)
-                                self.hidden_size // self.total_num_heads)
+        if head_dim is None:
+            head_dim = self.hidden_size // self.total_num_heads
+        self.head_dim = head_dim
        # Phi models introduced a partial_rotary_factor parameter in the config
        self.partial_rotary_factor = getattr(config, "partial_rotary_factor",
                                             1)