sglang · Commits · 9d775b1a

Unverified commit 9d775b1a, authored Sep 12, 2025 by Yineng Zhang, committed by GitHub on Sep 12, 2025.
Parent: 78b7465c

feat: add deepseek v3 fp4 ut (#10391)

Showing 3 changed files with 166 additions and 1 deletion (+166 −1).
.github/workflows/pr-test.yml          +1   −1
test/srt/run_suite.py                  +1   −0
test/srt/test_deepseek_v3_fp4_4gpu.py  +164 −0
.github/workflows/pr-test.yml

@@ -405,7 +405,7 @@ jobs:
           IS_BLACKWELL=1 bash scripts/ci/ci_install_dependency.sh

       - name: Run test
-        timeout-minutes: 20
+        timeout-minutes: 60
         run: |
           cd test/srt
           python3 run_suite.py --suite per-commit-8-gpu-b200 --auto-partition-id 0 --auto-partition-size 1
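The only functional change in this hunk is the job timeout, raised from 20 to 60 minutes, presumably to leave headroom for the DeepSeek-V3 FP4 tests added to the per-commit-8-gpu-b200 suite below, which load a large FP4 checkpoint and run a full 1319-question GSM8K evaluation.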
test/srt/run_suite.py

@@ -148,6 +148,7 @@ suites = {
     "per-commit-8-gpu-b200": [
         # add more here
         TestFile("test_gpt_oss_4gpu.py", 600),
+        TestFile("test_deepseek_v3_fp4_4gpu.py", 600),
     ],
     "per-commit-4-gpu-deepep": [
         TestFile("ep/test_deepep_small.py", 531),
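For context, CI exercises this new suite entry through the workflow step shown above (python3 run_suite.py --suite per-commit-8-gpu-b200 ...). The new test file (shown below) can also be run directly, since it ends with unittest.main(). A minimal sketch, assuming a host with at least 4 suitable GPUs and access to the nvidia/DeepSeek-V3-0324-FP4 checkpoint:

    cd test/srt
    # run only the new tests (both classes) via unittest.main()
    python3 test_deepseek_v3_fp4_4gpu.py
    # or run the whole suite the same way the workflow does
    python3 run_suite.py --suite per-commit-8-gpu-b200 --auto-partition-id 0 --auto-partition-size 1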
test/srt/test_deepseek_v3_fp4_4gpu.py  (new file, mode 100644, +164)

import unittest
from types import SimpleNamespace

import requests

from sglang.srt.utils import kill_process_tree
from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
from sglang.test.send_one import BenchArgs, send_one_prompt
from sglang.test.test_utils import (
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    CustomTestCase,
    is_in_ci,
    popen_launch_server,
    write_github_step_summary,
)

FULL_DEEPSEEK_V3_FP4_MODEL_PATH = "nvidia/DeepSeek-V3-0324-FP4"


class TestDeepseekV3FP4(CustomTestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = FULL_DEEPSEEK_V3_FP4_MODEL_PATH
        cls.base_url = DEFAULT_URL_FOR_TEST
        other_args = [
            "--tp",
            "4",
            "--attention-backend",
            "trtllm_mla",
            "--moe-runner-backend",
            "flashinfer_trtllm",
            "--quantization",
            "modelopt_fp4",
        ]
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=other_args,
        )

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def test_a_gsm8k(self):
        # Append an "a" to make this test run first (alphabetically) to warm up the server
        args = SimpleNamespace(
            num_shots=8,
            data_path=None,
            num_questions=1319,
            parallel=1319,
            max_new_tokens=512,
            host="http://127.0.0.1",
            port=int(self.base_url.split(":")[-1]),
        )
        metrics = run_eval_few_shot_gsm8k(args)
        print(f"{metrics=}")

        if is_in_ci():
            write_github_step_summary(
                f"### test_gsm8k (deepseek-v3-fp4)\n"
                f'{metrics["accuracy"]=:.3f}\n'
            )
        self.assertGreater(metrics["accuracy"], 0.935)

    def test_bs_1_speed(self):
        args = BenchArgs(port=int(self.base_url.split(":")[-1]), max_new_tokens=2048)
        acc_length, speed = send_one_prompt(args)
        print(f"{speed=:.2f}")

        if is_in_ci():
            write_github_step_summary(
                f"### test_bs_1_speed (deepseek-v3-fp4)\n"
                f"{speed=:.2f} token/s\n"
            )
        self.assertGreater(speed, 75)


class TestDeepseekV3FP4MTP(CustomTestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = FULL_DEEPSEEK_V3_FP4_MODEL_PATH
        cls.base_url = DEFAULT_URL_FOR_TEST
        other_args = [
            "--tp",
            "4",
            "--attention-backend",
            "trtllm_mla",
            "--moe-runner-backend",
            "flashinfer_trtllm",
            "--quantization",
            "modelopt_fp4",
            "--speculative-algorithm",
            "EAGLE",
            "--speculative-num-steps",
            "3",
            "--speculative-eagle-topk",
            "1",
            "--speculative-num-draft-tokens",
            "4",
        ]
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=other_args,
        )

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def test_a_gsm8k(self):
        # Append an "a" to make this test run first (alphabetically) to warm up the server
        requests.get(self.base_url + "/flush_cache")

        args = SimpleNamespace(
            num_shots=5,
            data_path=None,
            num_questions=200,
            max_new_tokens=512,
            parallel=128,
            host="http://127.0.0.1",
            port=int(self.base_url.split(":")[-1]),
        )
        metrics = run_eval_few_shot_gsm8k(args)
        print(f"{metrics=}")

        server_info = requests.get(self.base_url + "/get_server_info")
        avg_spec_accept_length = server_info.json()["internal_states"][0][
            "avg_spec_accept_length"
        ]
        print(f"{avg_spec_accept_length=}")

        if is_in_ci():
            write_github_step_summary(
                f"### test_gsm8k (deepseek-v3-fp4 mtp)\n"
                f'{metrics["accuracy"]=:.3f}\n'
                f"{avg_spec_accept_length=:.2f}\n"
            )

        self.assertGreater(metrics["accuracy"], 0.935)
        self.assertGreater(avg_spec_accept_length, 2.9)

    def test_bs_1_speed(self):
        args = BenchArgs(port=int(self.base_url.split(":")[-1]), max_new_tokens=2048)
        acc_length, speed = send_one_prompt(args)
        print(f"{acc_length=:.2f} {speed=:.2f}")

        if is_in_ci():
            write_github_step_summary(
                f"### test_bs_1_speed (deepseek-v3-fp4 mtp)\n"
                f"{acc_length=:.2f}\n"
                f"{speed=:.2f} token/s\n"
            )
        self.assertGreater(acc_length, 2.9)
        self.assertGreater(speed, 130)


if __name__ == "__main__":
    unittest.main()
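Both test classes start the server through popen_launch_server with the flag lists in their setUpClass methods. As a rough manual equivalent, here is a sketch, assuming popen_launch_server forwards other_args to the standard python3 -m sglang.launch_server entry point (host and port are left to their defaults here); this mirrors the MTP configuration:

    # hypothetical manual launch mirroring TestDeepseekV3FP4MTP.setUpClass
    python3 -m sglang.launch_server \
      --model-path nvidia/DeepSeek-V3-0324-FP4 \
      --tp 4 \
      --attention-backend trtllm_mla \
      --moe-runner-backend flashinfer_trtllm \
      --quantization modelopt_fp4 \
      --speculative-algorithm EAGLE \
      --speculative-num-steps 3 \
      --speculative-eagle-topk 1 \
      --speculative-num-draft-tokens 4

Drop the four --speculative-* flags to reproduce the plain TestDeepseekV3FP4 configuration.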