Unverified Commit 581e7dcb authored by Binyao Jiang, committed by GitHub

GLM-4.5 Model Support Follow-up (#8445)

parent 484d0e02
@@ -156,8 +156,7 @@ class Glm4MoeDetector(BaseFormatDetector):
             tools,
             individual_call_start_token=self.bot_token,
             individual_call_end_token=self.eot_token,
-            # GLM4Moe is not compatible with multiple tool_calls under tool_choice condition: it will output unlimited tool_calls...
-            # tool_call_separator="\\n",
+            tool_call_separator="\\n",
             function_format="xml",
             call_rule_fmt='"{name}" "\\n" {arguments_rule} "\\n"',
             key_value_rule_fmt='"<arg_key>{key}</arg_key>" "\\n" "<arg_value>" {valrule} "</arg_value>"',
...
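For context, the re-enabled `tool_call_separator` joins consecutive tool-call blocks with a newline when the grammar permits more than one call, and the `call_rule_fmt`/`key_value_rule_fmt` strings describe GLM-4.5's XML-style tool-call syntax. A minimal sketch of the text these rules constrain the model to produce; the function name and arguments are invented, and the exact whitespace between key/value pairs is an assumption:

```python
# Hedged sketch: how the grammar fragments above compose into the text the
# constrained decoder may emit. Token spellings come from the detector's
# bot/eot tokens ("<tool_call>"/"</tool_call>") and the format strings in the
# hunk; the concrete function and argument values are invented for illustration.
def render_glm45_tool_call(name: str, args: dict) -> str:
    body = "\n".join(
        f"<arg_key>{key}</arg_key>\n<arg_value>{value}</arg_value>"
        for key, value in args.items()
    )
    return f"<tool_call>{name}\n{body}\n</tool_call>"

print(render_glm45_tool_call("get_weather", {"city": "Beijing", "date": "2024-06-28"}))
# <tool_call>get_weather
# <arg_key>city</arg_key>
# <arg_value>Beijing</arg_value>
# <arg_key>date</arg_key>
# <arg_value>2024-06-28</arg_value>
# </tool_call>
```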
@@ -148,4 +148,5 @@ class Qwen3CoderDetector(BaseFormatDetector):
             function_format="xml",
             call_rule_fmt='"<function={name}>\\n" {arguments_rule} "\\n</function>"',
             key_value_rule_fmt='"<parameter={key}>\\n" {valrule} "\\n</parameter>"',
+            key_value_separator="\\n",
         )
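The newly added `key_value_separator="\\n"` is what joins consecutive `<parameter=...>` blocks in the generated grammar; without it, adjacent parameters would be concatenated with no delimiter. A hedged sketch of the Qwen3-Coder call text the three format strings describe, with invented example values:

```python
# Hedged sketch of the Qwen3-Coder XML tool-call layout implied by
# call_rule_fmt / key_value_rule_fmt / key_value_separator above.
# The function name and parameters are invented for illustration.
def render_qwen3_coder_call(name: str, params: dict) -> str:
    body = "\n".join(  # "\n" mirrors the newly added key_value_separator
        f"<parameter={key}>\n{value}\n</parameter>" for key, value in params.items()
    )
    return f"<function={name}>\n{body}\n</function>"

print(render_qwen3_coder_call("get_weather", {"city": "Beijing"}))
# <function=get_weather>
# <parameter=city>
# Beijing
# </parameter>
# </function>
```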
@@ -189,7 +189,7 @@ class TestEnableThinking(CustomTestCase):
         )


-## Skip for ci test
+# Skip for ci test
 # class TestGLM45EnableThinking(TestEnableThinking):
 #     @classmethod
 #     def setUpClass(cls):
...
@@ -913,7 +913,7 @@ class TestOpenAIPythonicFunctionCalling(CustomTestCase):
         )


-## Skip for ci test
+# Skip for ci test
 # class TestGLM45ServerFunctionCalling(TestOpenAIServerFunctionCalling):
 #     @classmethod
 #     def setUpClass(cls):
...
@@ -135,7 +135,7 @@ class TestToolChoiceLlama32(CustomTestCase):
         return [
             {
                 "role": "user",
-                "content": "Answer the following questions as best you can:\n\nYou will be given a trace of thinking process in the following format.\n\nQuestion: the input question you must answer\nTOOL: think about what to do, and choose a tool to use ONLY IF there are defined tools\nOBSERVATION: the result of the tool call or the observation of the current task, NEVER include this in your response, this information will be provided\n... (this TOOL/OBSERVATION can repeat N times)\nANSWER: If you know the answer to the original question, require for more information, \nif the previous conversation history already contains the answer, \nor you don't know the answer and there are no defined tools or all available tools are not helpful, respond with the answer without mentioning anything else.\nYou may use light Markdown formatting to improve clarity (e.g. lists, **bold**, *italics*), but keep it minimal and unobtrusive.\n\nYour task is to respond with the next step to take, based on the traces, \nor answer the question if you have enough information.\n\nQuestion: what is the weather in top 5 populated cities in the US?\n\nTraces:\n\n\nThese are some additional instructions that you should follow:",
+                "content": "Answer the following questions as best you can:\n\nYou will be given a trace of thinking process in the following format.\n\nQuestion: the input question you must answer\nTOOL: think about what to do, and choose a tool to use ONLY IF there are defined tools\nOBSERVATION: the result of the tool call or the observation of the current task, NEVER include this in your response, this information will be provided\n... (this TOOL/OBSERVATION can repeat N times)\nANSWER: If you know the answer to the original question, require for more information, \nif the previous conversation history already contains the answer, \nor you don't know the answer and there are no defined tools or all available tools are not helpful, respond with the answer without mentioning anything else.\nYou may use light Markdown formatting to improve clarity (e.g. lists, **bold**, *italics*), but keep it minimal and unobtrusive.\n\nYour task is to respond with the next step to take, based on the traces, \nor answer the question if you have enough information.\n\nQuestion: what is the weather in top 5 populated cities in the US in celsius?\n\nTraces:\n\n\nThese are some additional instructions that you should follow:",
             }
         ]
@@ -203,7 +203,7 @@ class TestToolChoiceLlama32(CustomTestCase):
         response = self.client.chat.completions.create(
             model=self.model_name,
             messages=messages,
-            max_tokens=400,
+            max_tokens=2048,
             tools=tools,
             tool_choice="auto",
             stream=False,
@@ -220,7 +220,7 @@ class TestToolChoiceLlama32(CustomTestCase):
         response = self.client.chat.completions.create(
             model=self.model_name,
             messages=messages,
-            max_tokens=400,
+            max_tokens=2048,
             tools=tools,
             tool_choice="auto",
             stream=True,
@@ -248,7 +248,7 @@ class TestToolChoiceLlama32(CustomTestCase):
         response = self.client.chat.completions.create(
             model=self.model_name,
             messages=messages,
-            max_tokens=400,
+            max_tokens=2048,
             temperature=0.2,
             tools=tools,
             tool_choice="required",
@@ -268,7 +268,7 @@ class TestToolChoiceLlama32(CustomTestCase):
         response = self.client.chat.completions.create(
             model=self.model_name,
             messages=messages,
-            max_tokens=400,
+            max_tokens=2048,
             tools=tools,
             tool_choice="required",
             stream=True,
@@ -294,7 +294,7 @@ class TestToolChoiceLlama32(CustomTestCase):
         response = self.client.chat.completions.create(
             model=self.model_name,
             messages=messages,
-            max_tokens=200,
+            max_tokens=2048,
             tools=tools,
             tool_choice=tool_choice,
             stream=False,
@@ -318,7 +318,7 @@ class TestToolChoiceLlama32(CustomTestCase):
         response = self.client.chat.completions.create(
             model=self.model_name,
             messages=messages,
-            max_tokens=200,
+            max_tokens=2048,
             tools=tools,
             tool_choice=tool_choice,
             stream=True,
@@ -351,7 +351,7 @@ class TestToolChoiceLlama32(CustomTestCase):
         response = self.client.chat.completions.create(
             model=self.model_name,
             messages=messages,
-            max_tokens=400,
+            max_tokens=2048,
             temperature=0.2,
             tools=tools,
             tool_choice="auto",
@@ -392,7 +392,7 @@ class TestToolChoiceLlama32(CustomTestCase):
         response = self.client.chat.completions.create(
             model=self.model_name,
             messages=messages,
-            max_tokens=400,
+            max_tokens=2048,
             temperature=0.2,
             tools=tools,
             tool_choice="required",
@@ -450,7 +450,7 @@ class TestToolChoiceLlama32(CustomTestCase):
         response = self.client.chat.completions.create(
             model=self.model_name,
             messages=messages,
-            max_tokens=200,
+            max_tokens=2048,
             tools=tools,
             tool_choice=tool_choice,
             stream=False,
@@ -517,5 +517,34 @@ class TestToolChoiceMistral(TestToolChoiceLlama32):
         cls.tokenizer = get_tokenizer(cls.model)


+# Skip for ci test
+# class TestToolChoiceGLM45(TestToolChoiceLlama32):
+#     @classmethod
+#     def setUpClass(cls):
+#         # Replace with the model name needed for testing; if not required, reuse DEFAULT_SMALL_MODEL_NAME_FOR_TEST
+#         cls.model = "THUDM/GLM-4.5"
+#         cls.base_url = DEFAULT_URL_FOR_TEST
+#         cls.api_key = "sk-123456"
+#         # Start the local OpenAI Server. If necessary, you can add other parameters such as --enable-tools.
+#         cls.process = popen_launch_server(
+#             cls.model,
+#             cls.base_url,
+#             timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+#             api_key=cls.api_key,
+#             other_args=[
+#                 # If your server needs extra parameters to test function calling, please add them here.
+#                 "--tool-call-parser",
+#                 "glm45",
+#                 "--reasoning-parser",
+#                 "glm45",
+#                 "--tp-size",
+#                 "8"
+#             ],
+#         )
+#         cls.base_url += "/v1"
+#         cls.tokenizer = get_tokenizer(cls.model)
+
+
 if __name__ == "__main__":
     unittest.main()
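Since the GLM-4.5 variant of these tests stays commented out for CI, the tool-choice path can still be exercised by hand against a locally launched server. A rough sketch, assuming a server started with the flags from the commented class above (`--tool-call-parser glm45 --reasoning-parser glm45`) and reachable on a local port; the tool schema, URL, and port are illustrative, not taken from this diff:

```python
# Hedged sketch: manually exercising tool_choice against a GLM-4.5 server.
# ASSUMPTIONS: server address and the get_weather tool schema are invented;
# only the model name, parser flags, and api key come from the diff above.
import openai

client = openai.OpenAI(base_url="http://127.0.0.1:30000/v1", api_key="sk-123456")
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get the current weather for a city",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"],
            },
        },
    }
]
response = client.chat.completions.create(
    model="THUDM/GLM-4.5",
    messages=[{"role": "user", "content": "What is the weather in Beijing?"}],
    max_tokens=2048,  # the diff raises the test limits to 2048 so responses are not truncated
    tools=tools,
    tool_choice="required",  # the constrained-grammar path this follow-up touches
    stream=False,
)
print(response.choices[0].message.tool_calls)
```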
@@ -2068,7 +2068,7 @@ class TestGlm4MoeDetector(unittest.TestCase):
             tool_calls[1]["parameters"], '{"city": "Shanghai", "date": "2024-06-28"}'
         )

-    def test_tool_call_completion(self):
+    def test_tool_call_id(self):
         """Test that the buffer and state are reset after a tool call is completed."""
         chunks = [
             "<tool_call>get_weather\n",
...
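The renamed test feeds a tool call to the detector chunk by chunk and checks that the internal buffer and state reset once the closing tag arrives. A sketch of that streaming pattern; the import path, the `parse_streaming_increment` signature, and the `_buffer` attribute are assumptions inferred from this diff, not verified against the repo:

```python
# Hedged sketch of the streaming-detection pattern the renamed test exercises.
# ASSUMPTIONS: the import path, parse_streaming_increment() signature, and
# _buffer attribute are inferred from the hunks above, not verified sources.
from sglang.srt.function_call.glm4_moe_detector import Glm4MoeDetector

detector = Glm4MoeDetector()
chunks = [
    "<tool_call>get_weather\n",
    "<arg_key>city</arg_key>\n",
    "<arg_value>Beijing</arg_value>\n",
    "</tool_call>",
]
for chunk in chunks:
    # Each increment may yield plain text and/or partial tool-call deltas.
    detector.parse_streaming_increment(chunk, tools=[])

# Once "</tool_call>" has been consumed, the detector should be back in its
# initial state, ready for the next call; that reset is what the test asserts.
assert detector._buffer == ""
```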