[CI] Add Llama 3.1 8B FP4 to B200 CI (#12182)

7ed8ba05 · b8zhong · GitHub · df08f346 · 7ed8ba05 · 7ed8ba05
Unverified Commit 7ed8ba05 authored Oct 29, 2025 by b8zhong Committed by GitHub Oct 29, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 60 additions and 1 deletion

test/srt/run_suite.py test/srt/run_suite.py +2 -1

test/srt/test_llama31_fp4.py test/srt/test_llama31_fp4.py +58 -0

No files found.
--- a/test/srt/run_suite.py
+++ b/test/srt/run_suite.py
@@ -180,9 +180,10 @@ suites = {
        TestFile("test_disaggregation_pp.py", 140),
    ],
    "per-commit-4-gpu-b200": [
+        TestFile("test_deepseek_v3_fp4_4gpu.py", 3600),
        TestFile("test_flash_attention_4.py", 300),
        TestFile("test_gpt_oss_4gpu.py", 600),
-        TestFile("test_deepseek_v3_fp4_4gpu.py", 3600),
+        TestFile("test_llama31_fp4.py", 300),
    ],
    "per-commit-4-gpu-deepep": [
        TestFile("ep/test_deepep_small.py", 531),

--- a/test/srt/test_llama31_fp4.py
+++ b/test/srt/test_llama31_fp4.py
+import unittest
+from types import SimpleNamespace
+from urllib.parse import urlparse
+from sglang.srt.utils import get_device_sm, kill_process_tree
+from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
+from sglang.test.test_utils import (
+    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+    DEFAULT_URL_FOR_TEST,
+    popen_launch_server,
+)
+# Model identifier that setUpClass hands to popen_launch_server.
+MODEL_PATH = "nvidia/Llama-3.1-8B-Instruct-FP4"
+# Few-shot GSM8K accuracy check against a server launched with MODEL_PATH
+# and modelopt_fp4 quantization. The whole class is skipped when
+# get_device_sm() reports a value below 100.
+@unittest.skipIf(get_device_sm() < 100, "Test requires CUDA SM 100 or higher")
+class TestLlama31FP4B200(unittest.TestCase):
+    # Launch one server process for the class and keep its handle on
+    # cls.process so tearDownClass can stop it.
+    @classmethod
+    def setUpClass(cls):
+        cls.model = MODEL_PATH
+        cls.base_url = DEFAULT_URL_FOR_TEST
+        # Extra command-line flags forwarded to the server launcher; each
+        # value-taking flag is followed by its value as a separate list item.
+        other_args = [
+            "--trust-remote-code",
+            "--mem-fraction-static",
+            "0.8",
+            "--quantization",
+            "modelopt_fp4",
+        ]
+        cls.process = popen_launch_server(
+            cls.model,
+            cls.base_url,
+            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+            other_args=other_args,
+        )
+    # Stop the server started in setUpClass by killing its process tree.
+    @classmethod
+    def tearDownClass(cls):
+        kill_process_tree(cls.process.pid)
+    # Run the few-shot GSM8K evaluation against the launched server and
+    # require the reported accuracy to be strictly greater than 0.61.
+    def test_gsm8k(self):
+        # Split the base URL so host (scheme + hostname) and port can be
+        # passed to the evaluator as separate fields.
+        parsed_url = urlparse(self.base_url)
+        # Evaluation settings: 4 shots, 100 questions, up to 512 new tokens,
+        # parallel=128; data_path is left as None.
+        args = SimpleNamespace(
+            num_shots=4,
+            data_path=None,
+            num_questions=100,
+            max_new_tokens=512,
+            parallel=128,
+            host=f"{parsed_url.scheme}://{parsed_url.hostname}",
+            port=parsed_url.port,
+        )
+        metrics = run_eval_few_shot_gsm8k(args)
+        # Print the full metrics mapping so it shows up in the test output.
+        print(metrics)
+        self.assertGreater(metrics["accuracy"], 0.61)
+# Run the unittest runner when this file is executed directly as a script.
+if __name__ == "__main__":
+    unittest.main()