Support glm4.1v and glm4.5v (#8798)

Signed-off-by: Xinyuan Tong <justinning0323@outlook.com>
Signed-off-by: Xinyuan Tong <xinyuantong.cs@gmail.com>
Co-authored-by: Xinyuan Tong <justinning0323@outlook.com>
Co-authored-by: Xinyuan Tong <115166877+JustinTong0323@users.noreply.github.com>
Co-authored-by: Xinyuan Tong <xinyuantong.cs@gmail.com>
Co-authored-by: zRzRzRzRzRzRzR <2448370773@qq.com>
Co-authored-by: Minglei Zhu <mingleizhu1122@gmail.com>
Co-authored-by: Chang Su <csu272@usc.edu>

Support glm4.1v and glm4.5v (#8798)
Signed-off-by: Xinyuan Tong <justinning0323@outlook.com>
Signed-off-by: Xinyuan Tong <xinyuantong.cs@gmail.com>
Co-authored-by: Xinyuan Tong <justinning0323@outlook.com>
Co-authored-by: Xinyuan Tong <115166877+JustinTong0323@users.noreply.github.com>
Co-authored-by: Xinyuan Tong <xinyuantong.cs@gmail.com>
Co-authored-by: zRzRzRzRzRzRzR <2448370773@qq.com>
Co-authored-by: Minglei Zhu <mingleizhu1122@gmail.com>
Co-authored-by: Chang Su <csu272@usc.edu>
f29aba8c · Binyao Jiang · GitHub · faa25df1 · f29aba8c
Unverified Commit f29aba8c authored Aug 09, 2025 by Binyao Jiang Committed by GitHub Aug 09, 2025
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 19 additions and 8 deletions

test/srt/test_vision_openai_server_common.py +19 -8

No files found.
--- a/test/srt/test_vision_openai_server_common.py
+++ b/test/srt/test_vision_openai_server_common.py
@@ -96,8 +96,13 @@ class TestOpenAIVisionServer(CustomTestCase):
        ), f"text: {text}, should contain cab, taxi, SUV, vehicle or car"
        # MiniCPMO fails to recognize `iron`, but `hanging`
        assert (
-            "iron" in text or "hang" in text or "cloth" in text or "holding" in text
-        ), f"text: {text}, should contain iron, hang, cloth or holding"
+            "iron" in text
+            or "hang" in text
+            or "cloth" in text
+            or "coat" in text
+            or "holding" in text
+            or "outfit" in text
+        ), f"text: {text}, should contain iron, hang, cloth, coat or holding or outfit"
        assert response.id
        assert response.created
        assert response.usage.prompt_tokens > 0
@@ -193,11 +198,15 @@ class TestOpenAIVisionServer(CustomTestCase):
        print(f"Multi images response:\n{text}")
        print("-" * 30)
        assert (
-            "man" in text or "cab" in text or "SUV" in text or "taxi" in text
-        ), f"text: {text}, should contain man, cab, SUV or taxi"
+            "man" in text
+            or "cab" in text
+            or "SUV" in text
+            or "taxi" in text
+            or "car" in text
+        ), f"text: {text}, should contain man, cab, SUV, taxi or car"
        assert (
-            "logo" in text or '"S"' in text or "SG" in text
-        ), f"text: {text}, should contain logo, S or SG"
+            "logo" in text or '"S"' in text or "SG" in text or "graphic" in text
+        ), f"text: {text}, should contain logo, S or SG or graphic"
        assert response.id
        assert response.created
        assert response.usage.prompt_tokens > 0
@@ -320,11 +329,12 @@ class TestOpenAIVisionServer(CustomTestCase):
            or "individual" in video_response
            or "speaker" in video_response
            or "Steve" in video_response
+            or "hand" in video_response
        ), f"""
        ====================== video_response =====================
        {video_response}
        ===========================================================
-        should contain 'man' or 'person' or 'individual' or 'speaker'
+        should contain 'man' or 'person' or 'individual' or 'speaker' or 'hand'
        """
        assert (
            "present" in video_response
@@ -375,7 +385,8 @@ class TestOpenAIVisionServer(CustomTestCase):
            or "person" in video_response
            or "individual" in video_response
            or "speaker" in video_response
-        ), f"video_response: {video_response}, should either have 'man' in video_response, or 'person' in video_response, or 'individual' in video_response or 'speaker' in video_response"
+            or "hand" in video_response
+        ), f"video_response: {video_response}, should either have 'man' in video_response, or 'person' in video_response, or 'individual' in video_response, or 'speaker' in video_response or 'hand' in video_response"
        assert (
            "present" in video_response
            or "examine" in video_response