Clean up server_args.py to have a dedicated function for model specific adjustments (#8983)

706bd69c · Lianmin Zheng · GitHub · 23f2afb2 · 706bd69c · 706bd69c
Unverified commit 706bd69c, authored Aug 08, 2025 by Lianmin Zheng; committed by GitHub on Aug 08, 2025.
4 changed files
--- a/test/srt/test_bench_serving.py
+++ b/test/srt/test_bench_serving.py
 import asyncio
 import itertools
 import unittest
-from random import random, uniform

 import requests


--- a/test/srt/test_mla_deepseek_v3.py
+++ b/test/srt/test_mla_deepseek_v3.py
@@ -149,66 +149,5 @@ class TestDeepseekV3MTP(CustomTestCase):
        self.assertGreater(avg_spec_accept_length, 2.5)


-# compatible with old APIs
-class TestDeepseekV3MTPWithDraft(CustomTestCase):
-    @classmethod
-    def setUpClass(cls):
-        cls.model = "lmsys/sglang-ci-dsv3-test"
-        cls.base_url = DEFAULT_URL_FOR_TEST
-        other_args = [
-            "--trust-remote-code",
-            "--cuda-graph-max-bs",
-            "2",
-            "--disable-radix",
-            "--enable-torch-compile",
-            "--torch-compile-max-bs",
-            "1",
-            "--speculative-algorithm",
-            "EAGLE",
-            "--speculative-draft",
-            "lmsys/sglang-ci-dsv3-test-NextN",
-            "--speculative-num-steps",
-            "2",
-            "--speculative-eagle-topk",
-            "4",
-            "--speculative-num-draft-tokens",
-            "4",
-        ]
-        cls.process = popen_launch_server(
-            cls.model,
-            cls.base_url,
-            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=other_args,
-        )
-
-    @classmethod
-    def tearDownClass(cls):
-        kill_process_tree(cls.process.pid)
-
-    def test_gsm8k(self):
-        requests.get(self.base_url + "/flush_cache")
-
-        args = SimpleNamespace(
-            num_shots=5,
-            data_path=None,
-            num_questions=200,
-            max_new_tokens=512,
-            parallel=128,
-            host="http://127.0.0.1",
-            port=int(self.base_url.split(":")[-1]),
-        )
-        metrics = run_eval_few_shot_gsm8k(args)
-        print(metrics)
-
-        self.assertGreater(metrics["accuracy"], 0.60)
-
-        server_info = requests.get(self.base_url + "/get_server_info")
-        avg_spec_accept_length = server_info.json()["internal_states"][0][
-            "avg_spec_accept_length"
-        ]
-        print(f"{avg_spec_accept_length=}")
-        self.assertGreater(avg_spec_accept_length, 2.5)
-
-
 if __name__ == "__main__":
    unittest.main()
--- a/test/srt/test_mla_flashinfer.py
+++ b/test/srt/test_mla_flashinfer.py
@@ -25,7 +25,7 @@ class TestFlashinferMLA(CustomTestCase):
                [
                    "--enable-torch-compile",
                    "--cuda-graph-max-bs",
-                    "2",
+                    "4",
                    "--attention-backend",
                    "flashinfer",
                ]
@@ -68,7 +68,6 @@ class TestFlashinferMLAMTP(CustomTestCase):
                [
                    "--cuda-graph-max-bs",
                    "4",
-                    "--disable-radix",
                    "--enable-torch-compile",
                    "--torch-compile-max-bs",
                    "1",

--- a/test/srt/test_mla_int8_deepseek_v3.py
+++ b/test/srt/test_mla_int8_deepseek_v3.py
@@ -10,6 +10,7 @@ from sglang.test.test_utils import (
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    CustomTestCase,
+    is_in_ci,
    popen_launch_server,
 )

@@ -112,6 +113,7 @@ class TestDeepseekV3MTPChannelInt8(CustomTestCase):
        self.assertGreater(avg_spec_accept_length, 2.5)


+@unittest.skipIf(is_in_ci(), "To reduce the CI execution time.")
 class TestMLADeepseekV3BlockInt8(CustomTestCase):
    @classmethod
    def setUpClass(cls):