# Comprehensive test for hybrid parallelism (DP/TP attention, DP/TP Dense FFN, TP/EP Sparse FFN, DP/VP LM head) plus speculative decoding. # These tests are not run by default but can be launched on demand. import unittest from types import SimpleNamespace from sglang.srt.utils import kill_process_tree from sglang.test.run_eval import run_eval from sglang.test.test_utils import ( DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST, DEFAULT_MLA_MODEL_NAME_FOR_TEST, DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, DEFAULT_URL_FOR_TEST, CustomTestCase, popen_launch_server, ) class Test00(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test01(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "4", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test02(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "8", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test03(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--moe-dense-tp-size", "1", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test04(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "4", "--moe-dense-tp-size", "1", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test05(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "8", "--moe-dense-tp-size", "1", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test06(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "4", "--enable-dp-lm-head", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test07(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "8", "--enable-dp-lm-head", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test08(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "4", "--moe-dense-tp-size", "1", "--enable-dp-lm-head", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test09(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "8", "--moe-dense-tp-size", "1", "--enable-dp-lm-head", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test10(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-deepep-moe", "--deepep-mode", "auto", "--cuda-graph-max-bs", "128", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test11(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "4", "--enable-deepep-moe", "--deepep-mode", "auto", "--cuda-graph-max-bs", "128", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test12(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "8", "--enable-deepep-moe", "--deepep-mode", "auto", "--cuda-graph-max-bs", "128", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test13(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--moe-dense-tp-size", "1", "--enable-deepep-moe", "--deepep-mode", "auto", "--cuda-graph-max-bs", "128", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test14(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "4", "--moe-dense-tp-size", "1", "--enable-deepep-moe", "--deepep-mode", "auto", "--cuda-graph-max-bs", "128", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test15(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "8", "--moe-dense-tp-size", "1", "--enable-deepep-moe", "--deepep-mode", "auto", "--cuda-graph-max-bs", "128", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test16(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "4", "--enable-dp-lm-head", "--enable-deepep-moe", "--deepep-mode", "auto", "--cuda-graph-max-bs", "128", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test17(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "8", "--enable-dp-lm-head", "--enable-deepep-moe", "--deepep-mode", "auto", "--cuda-graph-max-bs", "128", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test18(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "4", "--moe-dense-tp-size", "1", "--enable-dp-lm-head", "--enable-deepep-moe", "--deepep-mode", "auto", "--cuda-graph-max-bs", "128", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test19(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "8", "--moe-dense-tp-size", "1", "--enable-dp-lm-head", "--enable-deepep-moe", "--deepep-mode", "auto", "--cuda-graph-max-bs", "128", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test20(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-ep-moe", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test21(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "4", "--enable-ep-moe", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test22(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "8", "--enable-ep-moe", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test23(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--moe-dense-tp-size", "1", "--enable-ep-moe", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test24(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "4", "--moe-dense-tp-size", "1", "--enable-ep-moe", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test25(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "8", "--moe-dense-tp-size", "1", "--enable-ep-moe", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test26(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "4", "--enable-dp-lm-head", "--enable-ep-moe", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test27(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "8", "--enable-dp-lm-head", "--enable-ep-moe", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test28(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "4", "--moe-dense-tp-size", "1", "--enable-dp-lm-head", "--enable-ep-moe", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test29(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_MLA_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "8", "--moe-dense-tp-size", "1", "--enable-dp-lm-head", "--enable-ep-moe", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test30(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--speculative-algo", "NEXTN", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", "--speculative-eagle-topk", "4", "--speculative-num-draft-tokens", "4", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test31(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "4", "--speculative-algo", "NEXTN", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", "--speculative-eagle-topk", "4", "--speculative-num-draft-tokens", "4", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test32(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "8", "--speculative-algo", "NEXTN", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", "--speculative-eagle-topk", "4", "--speculative-num-draft-tokens", "4", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test33(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--moe-dense-tp-size", "1", "--speculative-algo", "NEXTN", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", "--speculative-eagle-topk", "4", "--speculative-num-draft-tokens", "4", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test34(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "4", "--moe-dense-tp-size", "1", "--speculative-algo", "NEXTN", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", "--speculative-eagle-topk", "4", "--speculative-num-draft-tokens", "4", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test35(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "8", "--moe-dense-tp-size", "1", "--speculative-algo", "NEXTN", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", "--speculative-eagle-topk", "4", "--speculative-num-draft-tokens", "4", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test36(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "4", "--enable-dp-lm-head", "--speculative-algo", "NEXTN", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", "--speculative-eagle-topk", "4", "--speculative-num-draft-tokens", "4", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test37(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "8", "--enable-dp-lm-head", "--speculative-algo", "NEXTN", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", "--speculative-eagle-topk", "4", "--speculative-num-draft-tokens", "4", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test38(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "4", "--moe-dense-tp-size", "1", "--enable-dp-lm-head", "--speculative-algo", "NEXTN", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", "--speculative-eagle-topk", "4", "--speculative-num-draft-tokens", "4", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test39(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "8", "--moe-dense-tp-size", "1", "--enable-dp-lm-head", "--speculative-algo", "NEXTN", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", "--speculative-eagle-topk", "4", "--speculative-num-draft-tokens", "4", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test40(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-deepep-moe", "--deepep-mode", "auto", "--cuda-graph-max-bs", "32", "--max-running-requests", "32", "--speculative-algo", "NEXTN", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", "--speculative-eagle-topk", "4", "--speculative-num-draft-tokens", "4", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test41(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "4", "--enable-deepep-moe", "--deepep-mode", "auto", "--cuda-graph-max-bs", "32", "--max-running-requests", "32", "--speculative-algo", "NEXTN", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", "--speculative-eagle-topk", "4", "--speculative-num-draft-tokens", "4", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test42(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "8", "--enable-deepep-moe", "--deepep-mode", "auto", "--cuda-graph-max-bs", "32", "--max-running-requests", "32", "--speculative-algo", "NEXTN", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", "--speculative-eagle-topk", "4", "--speculative-num-draft-tokens", "4", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test43(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--moe-dense-tp-size", "1", "--enable-deepep-moe", "--deepep-mode", "auto", "--cuda-graph-max-bs", "32", "--max-running-requests", "32", "--speculative-algo", "NEXTN", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", "--speculative-eagle-topk", "4", "--speculative-num-draft-tokens", "4", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test44(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "4", "--moe-dense-tp-size", "1", "--enable-deepep-moe", "--deepep-mode", "auto", "--cuda-graph-max-bs", "32", "--max-running-requests", "32", "--speculative-algo", "NEXTN", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", "--speculative-eagle-topk", "4", "--speculative-num-draft-tokens", "4", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test45(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "8", "--moe-dense-tp-size", "1", "--enable-deepep-moe", "--deepep-mode", "auto", "--cuda-graph-max-bs", "32", "--max-running-requests", "32", "--speculative-algo", "NEXTN", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", "--speculative-eagle-topk", "4", "--speculative-num-draft-tokens", "4", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test46(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "4", "--enable-dp-lm-head", "--enable-deepep-moe", "--deepep-mode", "auto", "--cuda-graph-max-bs", "32", "--max-running-requests", "32", "--speculative-algo", "NEXTN", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", "--speculative-eagle-topk", "4", "--speculative-num-draft-tokens", "4", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test47(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "8", "--enable-dp-lm-head", "--enable-deepep-moe", "--deepep-mode", "auto", "--cuda-graph-max-bs", "32", "--max-running-requests", "32", "--speculative-algo", "NEXTN", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", "--speculative-eagle-topk", "4", "--speculative-num-draft-tokens", "4", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test48(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "4", "--moe-dense-tp-size", "1", "--enable-dp-lm-head", "--enable-deepep-moe", "--deepep-mode", "auto", "--cuda-graph-max-bs", "32", "--max-running-requests", "32", "--speculative-algo", "NEXTN", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", "--speculative-eagle-topk", "4", "--speculative-num-draft-tokens", "4", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test49(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "8", "--moe-dense-tp-size", "1", "--enable-dp-lm-head", "--enable-deepep-moe", "--deepep-mode", "auto", "--cuda-graph-max-bs", "32", "--max-running-requests", "32", "--speculative-algo", "NEXTN", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", "--speculative-eagle-topk", "4", "--speculative-num-draft-tokens", "4", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test50(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-ep-moe", "--speculative-algo", "NEXTN", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", "--speculative-eagle-topk", "4", "--speculative-num-draft-tokens", "4", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test51(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "4", "--enable-ep-moe", "--speculative-algo", "NEXTN", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", "--speculative-eagle-topk", "4", "--speculative-num-draft-tokens", "4", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test52(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "8", "--enable-ep-moe", "--speculative-algo", "NEXTN", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", "--speculative-eagle-topk", "4", "--speculative-num-draft-tokens", "4", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test53(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--moe-dense-tp-size", "1", "--enable-ep-moe", "--speculative-algo", "NEXTN", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", "--speculative-eagle-topk", "4", "--speculative-num-draft-tokens", "4", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test54(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "4", "--moe-dense-tp-size", "1", "--enable-ep-moe", "--speculative-algo", "NEXTN", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", "--speculative-eagle-topk", "4", "--speculative-num-draft-tokens", "4", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test55(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "8", "--moe-dense-tp-size", "1", "--enable-ep-moe", "--speculative-algo", "NEXTN", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", "--speculative-eagle-topk", "4", "--speculative-num-draft-tokens", "4", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test56(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "4", "--enable-dp-lm-head", "--enable-ep-moe", "--speculative-algo", "NEXTN", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", "--speculative-eagle-topk", "4", "--speculative-num-draft-tokens", "4", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test57(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "8", "--enable-dp-lm-head", "--enable-ep-moe", "--speculative-algo", "NEXTN", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", "--speculative-eagle-topk", "4", "--speculative-num-draft-tokens", "4", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test58(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "4", "--moe-dense-tp-size", "1", "--enable-dp-lm-head", "--enable-ep-moe", "--speculative-algo", "NEXTN", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", "--speculative-eagle-topk", "4", "--speculative-num-draft-tokens", "4", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) class Test59(CustomTestCase): @classmethod def setUpClass(cls): cls.model = DEFAULT_DEEPPEP_MODEL_NAME_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST cls.process = popen_launch_server( cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, other_args=[ "--trust-remote-code", "--tp", "8", "--enable-dp-attention", "--dp", "8", "--moe-dense-tp-size", "1", "--enable-dp-lm-head", "--enable-ep-moe", "--speculative-algo", "NEXTN", "--speculative-draft", "lmsys/DeepSeek-V3-0324-NextN", "--speculative-num-steps", "2", "--speculative-eagle-topk", "4", "--speculative-num-draft-tokens", "4", ], ) @classmethod def tearDownClass(cls): kill_process_tree(cls.process.pid) def test_mmlu(self): args = SimpleNamespace( base_url=self.base_url, model=self.model, eval_name="mmlu", num_examples=64, num_threads=32, ) metrics = run_eval(args) print(f"{metrics=}") self.assertGreater(metrics["score"], 0.48) if __name__ == "__main__": unittest.main()