[CI/Build] Sync multimodal tests (#23181)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>

[CI/Build] Sync multimodal tests (#23181)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
de7b67a0 · Cyrus Leung · GitHub · f7290232 · de7b67a0 · de7b67a0
Unverified Commit de7b67a0 authored Aug 20, 2025 by Cyrus Leung Committed by GitHub Aug 20, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 18 additions and 16 deletions

tests/models/multimodal/processing/test_common.py tests/models/multimodal/processing/test_common.py +7 -3

tests/models/registry.py tests/models/registry.py +11 -13

No files found.
--- a/tests/models/multimodal/processing/test_common.py
+++ b/tests/models/multimodal/processing/test_common.py
@@ -275,16 +275,17 @@ def _test_processing_correctness_one(
    "google/gemma-3n-E2B-it",
    "zai-org/glm-4v-9b",
    "zai-org/GLM-4.1V-9B-Thinking",
+    "zai-org/GLM-4.5V",
    "ibm-granite/granite-speech-3.3-2b",
    "h2oai/h2ovl-mississippi-800m",
+    "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
+    "HuggingFaceM4/Idefics3-8B-Llama3",
    "internlm/Intern-S1",
    "OpenGVLab/InternVL2-1B",
    "OpenGVLab/InternVL3-1B",
-    "HuggingFaceM4/Idefics3-8B-Llama3",
+    "Kwai-Keye/Keye-VL-8B-Preview",
-    "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
    "moonshotai/Kimi-VL-A3B-Instruct",
    "meta-llama/Llama-4-Scout-17B-16E-Instruct",
-    "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
    "llava-hf/llava-1.5-7b-hf",
    "llava-hf/llava-v1.6-mistral-7b-hf",
    "llava-hf/LLaVA-NeXT-Video-7B-hf",
@@ -315,10 +316,13 @@ def _test_processing_correctness_one(
    "Qwen/Qwen2-Audio-7B-Instruct",
    "Qwen/Qwen2.5-Omni-3B",
    "Skywork/Skywork-R1V-38B",
+    "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
+    "stepfun-ai/step3",
    "fixie-ai/ultravox-v0_5-llama-3_2-1b",
    "openai/whisper-large-v3",
    "omni-research/Tarsier-7b",
    "omni-research/Tarsier2-Recap-7b",
+    "mistralai/Voxtral-Mini-3B-2507",
 ])
 @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
 @pytest.mark.parametrize("num_batches", [32])

--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -215,9 +215,6 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
    "HunYuanDenseV1ForCausalLM":_HfExamplesInfo("tencent/Hunyuan-7B-Instruct-0124",
                                                trust_remote_code=True,
                                                is_available_online=False),
-    "HCXVisionForCausalLM": _HfExamplesInfo(
-        "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",
-        trust_remote_code=True),
    "InternLMForCausalLM": _HfExamplesInfo("internlm/internlm-chat-7b",
                                           trust_remote_code=True),
    "InternLM2ForCausalLM": _HfExamplesInfo("internlm/internlm2-chat-7b",
@@ -298,8 +295,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
    "StableLmForCausalLM": _HfExamplesInfo("stabilityai/stablelm-3b-4e1t"),
    "Starcoder2ForCausalLM": _HfExamplesInfo("bigcode/starcoder2-3b"),
    "Step3TextForCausalLM": _HfExamplesInfo("stepfun-ai/step3",
-                                            trust_remote_code=True,
+                                            trust_remote_code=True),
-                                            is_available_online=False),
    "SolarForCausalLM": _HfExamplesInfo("upstage/solar-pro-preview-instruct",
                                        trust_remote_code=True),
    "TeleChat2ForCausalLM": _HfExamplesInfo("Tele-AI/TeleChat2-3B",
@@ -405,22 +401,24 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                                        hf_overrides={"architectures": ["GLM4VForCausalLM"]}),  # noqa: E501
    "Glm4vForConditionalGeneration": _HfExamplesInfo("zai-org/GLM-4.1V-9B-Thinking"),  # noqa: E501
    "Glm4vMoeForConditionalGeneration": _HfExamplesInfo("zai-org/GLM-4.5V",
-                                          is_available_online=False),   # noqa: E501
+                                                        min_transformers_version="4.56"),  # noqa: E501
    "H2OVLChatModel": _HfExamplesInfo("h2oai/h2ovl-mississippi-800m",
                                      trust_remote_code=True,
                                      extras={"2b": "h2oai/h2ovl-mississippi-2b"},  # noqa: E501
                                      max_transformers_version="4.48",  # noqa: E501
                                      transformers_version_reason="HF model is not compatible."),  # noqa: E501
+    "HCXVisionForCausalLM": _HfExamplesInfo("naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B",  # noqa: E501
+                                            trust_remote_code=True),
    "Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3",  # noqa: E501
                                                        {"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"},    # noqa: E501
                                                        min_transformers_version="4.55.1",
                                                        transformers_version_reason="HF model broken in 4.55.0"),  # noqa: E501
+    "InternS1ForConditionalGeneration": _HfExamplesInfo("internlm/Intern-S1",
+                                                        trust_remote_code=True),  # noqa: E501
    "InternVLChatModel": _HfExamplesInfo("OpenGVLab/InternVL2-1B",
                                         extras={"2B": "OpenGVLab/InternVL2-2B",
                                                 "3.0": "OpenGVLab/InternVL3-1B"},  # noqa: E501
                                         trust_remote_code=True),
-    "InternS1ForConditionalGeneration": _HfExamplesInfo("internlm/Intern-S1",
-                                         trust_remote_code=True),
    "KeyeForConditionalGeneration": _HfExamplesInfo("Kwai-Keye/Keye-VL-8B-Preview", # noqa: E501
                                                    trust_remote_code=True),
    "KimiVLForConditionalGeneration": _HfExamplesInfo("moonshotai/Kimi-VL-A3B-Instruct",  # noqa: E501
@@ -464,9 +462,10 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                            transformers_version_reason="HF model is not compatible",  # noqa: E501
                            extras={"1.6-llama": "AIDC-AI/Ovis1.6-Llama3.2-3B",
                                    "1.6-gemma": "AIDC-AI/Ovis1.6-Gemma2-9B"}),  # noqa: E501
-    "Ovis2_5": _HfExamplesInfo("AIDC-AI/Ovis2.5-2B", trust_remote_code=True,
+    "Ovis2_5": _HfExamplesInfo("AIDC-AI/Ovis2.5-2B",
-                            max_transformers_version="4.53",
+                               trust_remote_code=True,
-                            transformers_version_reason="HF model is not compatible"),  # noqa: E501
+                               max_transformers_version="4.53",
+                               transformers_version_reason="HF model is not compatible"),  # noqa: E501
    "PaliGemmaForConditionalGeneration": _HfExamplesInfo("google/paligemma-3b-mix-224",  # noqa: E501
                                                         extras={"v2": "google/paligemma2-3b-ft-docci-448"}),  # noqa: E501
    "Phi3VForCausalLM": _HfExamplesInfo("microsoft/Phi-3-vision-128k-instruct",
@@ -496,8 +495,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
                                                       min_transformers_version="4.55.1",
                                                       transformers_version_reason="HF model broken in 4.55.0"),  # noqa: E501
    "Step3VLForConditionalGeneration": _HfExamplesInfo("stepfun-ai/step3",
-                                                        trust_remote_code=True,
+                                                        trust_remote_code=True),
-                                                        is_available_online=False),
    "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b",  # noqa: E501
                                     trust_remote_code=True),
    "TarsierForConditionalGeneration": _HfExamplesInfo("omni-research/Tarsier-7b"),  # noqa: E501