Unverified commit 2561ed01, authored by Yineng Zhang, committed by GitHub

feat: update nightly gsm8k eval (#1304)

parent 99994427
@@ -15,9 +15,9 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  meta-llama-31-8b-instruct:
+  nightly-eval-2-gpu:
     if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
-    runs-on: 1-gpu-runner
+    runs-on: 2-gpu-runner
     steps:
       - name: Checkout code
         uses: actions/checkout@v3
@@ -25,42 +25,11 @@ jobs:
       - name: Install dependencies
         run: |
           pip install --upgrade pip
-          pip install -e "python[dev]"
+          pip install -e "python[all]"
           pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
-          git clone https://github.com/EleutherAI/lm-evaluation-harness
-          pushd lm-evaluation-harness
-          pip install -e .
-          pip install lm_eval[api]
-          popd
 
-      - name: Run eval
-        timeout-minutes: 20
+      - name: Nightly gsm8k Accuracy
+        timeout-minutes: 60
         run: |
-          python3 -m sglang.launch_server --model meta-llama/Meta-Llama-3.1-8B-Instruct --disable-radix-cache &
-
-          echo "Waiting for server to start..."
-          for i in {1..120}; do
-            if curl -s http://127.0.0.1:30000/health; then
-              echo "Server is up!"
-              break
-            fi
-            if [ $i -eq 120 ]; then
-              echo "Server failed to start within 120 seconds"
-              exit 1
-            fi
-            sleep 1
-          done
-
-          lm_eval --model local-completions --tasks gsm8k --model_args model=meta-llama/Meta-Llama-3.1-8B-Instruct,base_url=http://127.0.0.1:30000/v1/completions,num_concurrent=128,max_retries=3,tokenized_requests=False
-
-          echo "Stopping server..."
-          kill -9 $(ps aux | grep sglang | grep Meta-Llama-3.1-8B-Instruct | grep -v grep | awk '{print $2}')
-
-  finish:
-    needs: [
-      meta-llama-31-8b-instruct
-    ]
-    runs-on: ubuntu-latest
-    steps:
-      - name: Finish
-        run: echo "This is an empty step to ensure that all jobs are completed."
+          cd test/srt
+          python3 test_nightly_gsm8k_eval.py
@@ -23,6 +23,10 @@ from sglang.utils import get_exception_traceback
 DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"
 DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
 DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600
+DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Meta-Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
+DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Meta-Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct"
+DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8,neuralmagic/Mistral-7B-Instruct-v0.3-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8,neuralmagic/gemma-2-2b-it-FP8"
+DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8"
 
 if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
     DEFAULT_PORT_FOR_SRT_TEST_RUNNER = 5157
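(Aside: these four constants are plain comma-separated model lists; the new test file below splits them with its small parse_models helper. A minimal sketch of that split, showing the expected result for the TP1 list:)

from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1

def parse_models(model_string):
    return [m.strip() for m in model_string.split(",") if m.strip()]

print(parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1))
# ['meta-llama/Meta-Llama-3.1-8B-Instruct', 'mistralai/Mistral-7B-Instruct-v0.3',
#  'deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct', 'google/gemma-2-27b-it']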
test/srt/test_nightly_gsm8k_eval.py (new file):

import unittest
from types import SimpleNamespace

from sglang.srt.utils import kill_child_process
from sglang.test.run_eval import run_eval
from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1,
    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2,
    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1,
    DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    popen_launch_server,
)


def parse_models(model_string):
    return [model.strip() for model in model_string.split(",") if model.strip()]


class TestEvalAccuracyLarge(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        # Each group is (models, is_fp8, is_tp2).
        cls.model_groups = [
            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1), False, False),
            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2), False, True),
            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1), True, False),
            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2), True, True),
        ]
        cls.base_url = DEFAULT_URL_FOR_TEST

    def setUp(self):
        self.process = None

    def tearDown(self):
        if self.process:
            kill_child_process(self.process.pid)

    def launch_server(self, model, is_fp8, is_tp2):
        other_args = ["--log-level-http", "warning", "--trust-remote-code"]
        if is_fp8:
            if "Llama-3" in model or "gemma-2" in model:
                # compressed-tensors
                other_args.extend(["--kv-cache-dtype", "fp8_e5m2"])
            elif "Qwen2-72B-Instruct-FP8" in model:
                # bug
                other_args.extend(["--quantization", "fp8"])
            else:
                other_args.extend(
                    ["--quantization", "fp8", "--kv-cache-dtype", "fp8_e5m2"]
                )
        if is_tp2:
            other_args.extend(["--tp", "2"])
        if "DeepSeek" in model:
            other_args.append("--enable-mla")

        self.process = popen_launch_server(
            model,
            self.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=other_args,
        )

    def test_mgsm_en_all_models(self):
        for model_group, is_fp8, is_tp2 in self.model_groups:
            for model in model_group:
                with self.subTest(model=model):
                    self.launch_server(model, is_fp8, is_tp2)

                    args = SimpleNamespace(
                        base_url=self.base_url,
                        model=model,
                        eval_name="mgsm_en",
                        num_examples=None,
                        num_threads=1024,
                    )
                    metrics = run_eval(args)
                    print(
                        f"{'=' * 42}\n{model} - metrics={metrics} score={metrics['score']}\n{'=' * 42}\n"
                    )
                    # loose accuracy threshold shared by all models
                    assert metrics["score"] > 0.5, f"score={metrics['score']} <= 0.5"
                    # stop the server for this model before launching the next one
                    self.tearDown()


if __name__ == "__main__":
    unittest.main()
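(Aside: for a quick local check of a single model, the same eval can be driven outside of unittest. A minimal sketch, assuming a server for the chosen model is already listening at DEFAULT_URL_FOR_TEST; the arguments mirror the SimpleNamespace used in the test above:)

from types import SimpleNamespace

from sglang.test.run_eval import run_eval
from sglang.test.test_utils import DEFAULT_URL_FOR_TEST

# Same argument set as test_mgsm_en_all_models above.
args = SimpleNamespace(
    base_url=DEFAULT_URL_FOR_TEST,
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",
    eval_name="mgsm_en",
    num_examples=None,
    num_threads=1024,
)
metrics = run_eval(args)
print(metrics["score"])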