Unverified commit 1b5d56f7, authored by Lianmin Zheng and committed by GitHub

[CI] Add more multi-gpu tests (#1280)

parent d134c139
@@ -18,7 +18,7 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  accuracy-test:
+  one-gpu:
     if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
     runs-on: 1-gpu-runner
 
@@ -41,3 +41,34 @@ jobs:
       run: |
         cd test/srt
         python3 test_eval_accuracy_large.py
+
+  two-gpu:
+    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
+    runs-on: 2-gpu-runner
+
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v3
+
+    - name: Install dependencies
+      run: |
+        pip install --upgrade pip
+        pip install -e "python[all]"
+        pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
+
+        git clone https://github.com/merrymercy/human-eval.git
+        cd human-eval
+        pip install -e .
+
+    - name: Evaluate Accuracy
+      timeout-minutes: 20
+      run: |
+        cd test/srt
+        python3 test_moe_eval_accuracy_large.py
+
+  finish:
+    needs: [one-gpu, two-gpu]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Finish
+        run: echo "This is an empty step to ensure that all jobs are completed."
name: Weekly Cache Purge

on:
  schedule:
    - cron: '0 0 * * 0'  # Every Sunday at 00:00
  workflow_dispatch:

jobs:
  purge-cache:
    if: github.repository == 'sgl-project/sglang'
    runs-on: self-hosted
    steps:
      - name: Checkout code
        uses: actions/checkout@v3

      - name: Purge pip cache
        run: |
          source $HOME/venv/bin/activate
          echo "$HOME/venv/bin" >> $GITHUB_PATH
          pip cache purge

      - name: Update dependencies
        run: |
          pip install --upgrade pip
          pip install -e "python[all]"
          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
@@ -18,7 +18,7 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  e2e-test:
+  one-gpu:
     if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
     runs-on: 1-gpu-runner
 
@@ -41,7 +41,8 @@ jobs:
     - name: Benchmark Serving Latency
       timeout-minutes: 10
       run: |
-        python3 -m sglang.bench_latency --model meta-llama/Meta-Llama-3.1-8B-Instruct --batch-size 1 --input 128 --output 8
+        cd test/srt
+        python3 -m unittest test_serving_latency.TestServingLatency.test_default
 
     - name: Benchmark Serving Throughput (w/o RadixAttention)
       timeout-minutes: 10
@@ -54,3 +55,42 @@ jobs:
       run: |
         cd test/srt
        python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_chunked_prefill
+
+  two-gpu:
+    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
+    runs-on: 2-gpu-runner
+
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v3
+
+    - name: Install dependencies
+      run: |
+        pip install --upgrade pip
+        pip install -e "python[all]"
+        pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
+
+    - name: Benchmark Serving Throughput (TP=2)
+      timeout-minutes: 10
+      run: |
+        cd test/srt
+        python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default
+
+    - name: Benchmark Serving Latency (TP=2)
+      timeout-minutes: 10
+      run: |
+        cd test/srt
+        python3 -m unittest test_moe_serving_latency.TestServingLatency.test_default
+
+    - name: Benchmark Serving Throughput (w/o RadixAttention) (TP=2)
+      timeout-minutes: 10
+      run: |
+        cd test/srt
+        python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default_without_radix_cache
+
+  finish:
+    needs: [one-gpu, two-gpu]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Finish
+        run: echo "This is an empty step to ensure that all jobs are completed."
name: MoE Test

on:
  push:
    branches: [ main ]
    paths:
      - "python/sglang/**"
      - "test/**"
  pull_request:
    branches: [ main ]
    paths:
      - "python/sglang/**"
      - "test/**"
  workflow_dispatch:

concurrency:
  group: moe-test-${{ github.ref }}
  cancel-in-progress: true

jobs:
  moe-test:
    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
    runs-on: 2-gpu-runner
    steps:
    - name: Checkout code
      uses: actions/checkout@v3

    - name: Install dependencies
      run: |
        pip install --upgrade pip
        pip install -e "python[all]"
        pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall

    - name: Benchmark MoE Serving Throughput
      timeout-minutes: 10
      run: |
        cd test/srt
        python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default

    - name: Benchmark MoE Serving Throughput (w/o RadixAttention)
      timeout-minutes: 10
      run: |
        cd test/srt
        python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default_without_radix_cache
@@ -18,7 +18,7 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  unit-test-jobs:
+  run-test:
     if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
     runs-on: 1-gpu-runner
     strategy:
@@ -48,9 +48,9 @@ jobs:
           python3 run_suite.py --suite minimal --range-begin 8
         fi
 
-  unit-test:
-    needs: unit-test-jobs
+  finish:
+    needs: [run-test]
     runs-on: ubuntu-latest
     steps:
-      - name: Merge step
-        run: echo "This is an empty merge step"
+      - name: Finish
+        run: echo "This is an empty step to ensure that all jobs are completed."
\ No newline at end of file
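
The run-test job above shards the minimal suite across runners by index range (the visible context passes --range-begin 8; the companion shard presumably passes a matching --range-end). A minimal sketch of that kind of index-range sharding, purely illustrative and not the actual test/srt/run_suite.py:

import argparse
import subprocess
import sys

# Placeholder file list; the real suite is defined inside run_suite.py.
SUITE_MINIMAL = ["test_a.py", "test_b.py", "test_c.py", "test_d.py"]

parser = argparse.ArgumentParser()
parser.add_argument("--suite", default="minimal")
parser.add_argument("--range-begin", type=int, default=0)
parser.add_argument("--range-end", type=int, default=None)
args = parser.parse_args()

# Each CI shard runs only its slice of the suite, so two runners split the work.
exit_code = 0
for test_file in SUITE_MINIMAL[args.range_begin : args.range_end]:
    ret = subprocess.run([sys.executable, test_file]).returncode
    exit_code = exit_code or ret
sys.exit(exit_code)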
@@ -11,26 +11,34 @@ python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct
 ## plot the results in series of lines:
 python -m sglang.bench_latency --result-filename out.jsonl --graph-sql="select run_name, batch_size, prefill_throughput from results"
 
 # Usage (correctness test):
 python -m sglang.bench_latency --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 --correct
 
 ## Reference output (of the correctness test above, can be gpu dependent):
-prefill logits (first half) tensor([[-10.0312, -9.5000, 0.8936, ..., -4.9414, -3.2402, -3.3633],
-        [-10.0312, -9.5000, 0.8936, ..., -4.9414, -3.2402, -3.3633],
-        [ -9.1875, -10.2500, 2.7109, ..., -4.3359, -4.0664, -4.1328]],
-       device='cuda:0', dtype=torch.float16)
-prefill logits (final) tensor([[-8.3203, -7.1211, 3.3379, ..., -4.9570, -4.1328, -3.4141],
-        [-8.9062, -9.0156, 4.1445, ..., -4.9922, -4.4961, -4.0742],
-        [-9.6328, -9.0547, 4.0117, ..., -5.3047, -4.7148, -4.4609]],
-       device='cuda:0', dtype=torch.float16)
-<s> The capital of France is.
+input_ids=[[1, 450, 7483, 310, 3444, 338], [1, 450, 7483, 310, 278, 3303, 13187, 290, 338], [1, 20628, 338, 263, 6575, 1460, 2462, 322, 306, 763]]
+
+prefill logits (first half): tensor([[-10.0312, -9.5000, 0.8931, ..., -4.9414, -3.2422, -3.3633],
+        [-10.0312, -9.5000, 0.8931, ..., -4.9414, -3.2422, -3.3633],
+        [ -9.1875, -10.2500, 2.7129, ..., -4.3359, -4.0664, -4.1328]],
+       device='cuda:0')
+
+prefill logits (final): tensor([[-8.3125, -7.1172, 3.3457, ..., -4.9570, -4.1328, -3.4141],
+        [-8.9141, -9.0156, 4.1445, ..., -4.9922, -4.4961, -4.0781],
+        [-9.6328, -9.0547, 4.0195, ..., -5.3047, -4.7148, -4.4570]],
+       device='cuda:0')
+
+========== Prompt 0 ==========
+<s> The capital of France is Paris.
 The capital of the United States is Washington, D.C.
 
-<s> The capital of the United Kindom is.
+========== Prompt 1 ==========
+<s> The capital of the United Kindom is London.
 The capital of the United Kingdom is London.
 The capital of the
 
-<s> Today is a sunny day and I like go for a walk in the park.
+========== Prompt 2 ==========
+<s> Today is a sunny day and I like to go for a walk in the park.
 I'm going to the park
 """
@@ -225,12 +233,12 @@ def correctness_test(
 
     # Prepare inputs
     input_ids, reqs = prepare_inputs_for_correctness_test(bench_args, tokenizer)
-    rank_print(f"{input_ids=}")
+    rank_print(f"\n{input_ids=}\n")
 
     if bench_args.cut_len > 0:
         # Prefill
         next_token_ids, next_token_logits, batch = extend(reqs, model_runner)
-        rank_print("prefill logits (first half)", next_token_logits)
+        rank_print(f"prefill logits (first half): {next_token_logits} \n")
 
         # Prepare extend inputs
         reqs = prepare_extend_inputs_for_correctness_test(
@@ -239,7 +247,7 @@ def correctness_test(
 
     # Extend
     next_token_ids, next_token_logits, batch = extend(reqs, model_runner)
-    rank_print("prefill logits (final)", next_token_logits)
+    rank_print(f"prefill logits (final): {next_token_logits} \n")
 
     # Decode
     output_ids = [input_ids[i] + [next_token_ids[i]] for i in range(len(input_ids))]
@@ -250,7 +258,8 @@ def correctness_test(
 
     # Print
     for i in range(len(reqs)):
-        rank_print(tokenizer.decode(output_ids[i]))
+        rank_print(f"========== Prompt {i} ==========")
+        rank_print(tokenizer.decode(output_ids[i]), "\n")
 
 
 @torch.inference_mode()
import unittest
from types import SimpleNamespace

from sglang.srt.utils import kill_child_process
from sglang.test.run_eval import run_eval
from sglang.test.test_utils import (
    DEFAULT_MOE_MODEL_NAME_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    popen_launch_server,
)


class TestEvalAccuracyLarge(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MOE_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=[
                "--log-level-http",
                "warning",
                "--tp",
                "2",
            ],
        )

    @classmethod
    def tearDownClass(cls):
        kill_child_process(cls.process.pid)

    def test_mmlu(self):
        args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name="mmlu",
            num_examples=3000,
            num_threads=1024,
        )

        metrics = run_eval(args)
        assert metrics["score"] >= 0.63, f"{metrics}"

    def test_human_eval(self):
        args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name="humaneval",
            num_examples=None,
            num_threads=1024,
        )

        metrics = run_eval(args)
        assert metrics["score"] >= 0.43, f"{metrics}"

    def test_mgsm_en(self):
        args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name="mgsm_en",
            num_examples=None,
            num_threads=1024,
        )

        metrics = run_eval(args)
        assert metrics["score"] >= 0.64, f"{metrics}"


if __name__ == "__main__":
    unittest.main()
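
The new test file launches one TP=2 server per class and scores it with run_eval. A minimal sketch (not part of the commit) of calling the same eval harness by hand against a server that is already running, reusing only names that appear above:

from types import SimpleNamespace

from sglang.test.run_eval import run_eval
from sglang.test.test_utils import (
    DEFAULT_MOE_MODEL_NAME_FOR_TEST,
    DEFAULT_URL_FOR_TEST,
)

# Assumes a server was already started elsewhere, e.g. via popen_launch_server
# with other_args=["--tp", "2"] as in TestEvalAccuracyLarge.setUpClass above.
args = SimpleNamespace(
    base_url=DEFAULT_URL_FOR_TEST,
    model=DEFAULT_MOE_MODEL_NAME_FOR_TEST,
    eval_name="mmlu",
    num_examples=64,   # smaller sample than the CI run, for a quick local check
    num_threads=32,
)
metrics = run_eval(args)
print(metrics["score"])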
import os
import subprocess
import unittest

from sglang.srt.utils import kill_child_process
from sglang.test.test_utils import DEFAULT_MOE_MODEL_NAME_FOR_TEST


class TestServingLatency(unittest.TestCase):
    def test_default(self):
        command = [
            "python3",
            "-m",
            "sglang.bench_latency",
            "--model",
            DEFAULT_MOE_MODEL_NAME_FOR_TEST,
            "--batch-size",
            "1",
            "--input",
            "128",
            "--output",
            "8",
            "--tp",
            "2",
        ]
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        stdout, stderr = process.communicate()

        output = stdout.decode()
        error = stderr.decode()
        print(f"Output: {output}")
        print(f"Error: {error}")

        lastline = output.split("\n")[-3]
        value = float(lastline.split(" ")[-2])

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            assert value > 125

        kill_child_process(process.pid)


if __name__ == "__main__":
    unittest.main()
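
test_default above treats bench_latency's stdout as line-oriented text: it takes the third line from the end and reads the second-to-last whitespace-separated token as the throughput in token/s. A small self-contained sketch of that parsing on a synthetic output string (the wording of the summary line is an assumption here; the test only relies on its position and token layout):

# Synthetic stdout; real content comes from `python3 -m sglang.bench_latency ...`.
sample_stdout = (
    "Benchmark ...\n"
    "Decode.  median latency: 0.0062 s, median throughput:    130.00 token/s\n"
    "Total. latency: 1.234 s\n"
)

# Same indexing as in TestServingLatency.test_default above.
lastline = sample_stdout.split("\n")[-3]
value = float(lastline.split(" ")[-2])
assert value == 130.00, value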
@@ -23,7 +23,6 @@ class TestServingThroughput(unittest.TestCase):
             other_args.append("--disable-flashinfer")
         other_args.extend(["--chunked-prefill-size", str(chunked_prefill_size)])
         other_args.extend(["--tensor-parallel-size", "2"])
-        other_args.append("--enable-p2p-check")
 
         model = DEFAULT_MOE_MODEL_NAME_FOR_TEST
         base_url = DEFAULT_URL_FOR_TEST
@@ -35,7 +34,7 @@ class TestServingThroughput(unittest.TestCase):
         )
 
         # Run benchmark
-        num_prompts = 200
+        num_prompts = 300
         args = SimpleNamespace(
             backend="sglang",
             base_url=base_url,
@@ -76,8 +75,7 @@ class TestServingThroughput(unittest.TestCase):
         )
 
         if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            # A100 (PCIE): 950, H100 (SMX): 1800
-            assert res["output_throughput"] > 1750
+            assert res["output_throughput"] > 1850
 
     def test_default_without_radix_cache(self):
         res = self.run_test(
@@ -87,18 +85,7 @@ class TestServingThroughput(unittest.TestCase):
         )
 
         if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            # A100 (PCIE): 950, H100 (SMX): 1900
-            assert res["output_throughput"] > 1850
-
-    def test_all_cases(self):
-        for disable_radix_cache in [False, True]:
-            for disable_flashinfer in [False, True]:
-                for chunked_prefill_size in [-1, 2048]:
-                    self.run_test(
-                        disable_radix_cache=False,
-                        disable_flashinfer=False,
-                        chunked_prefill_size=-1,
-                    )
+            assert res["output_throughput"] > 1950
 
 if __name__ == "__main__":
import os
import subprocess
import unittest

from sglang.srt.utils import kill_child_process
from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST


class TestServingLatency(unittest.TestCase):
    def test_default(self):
        command = [
            "python3",
            "-m",
            "sglang.bench_latency",
            "--model",
            DEFAULT_MODEL_NAME_FOR_TEST,
            "--batch-size",
            "1",
            "--input",
            "128",
            "--output",
            "8",
        ]
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        stdout, stderr = process.communicate()

        output = stdout.decode()
        error = stderr.decode()
        print(f"Output: {output}")
        print(f"Error: {error}")

        lastline = output.split("\n")[-3]
        value = float(lastline.split(" ")[-2])

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            assert value > 130

        kill_child_process(process.pid)


if __name__ == "__main__":
    unittest.main()
@@ -33,7 +33,7 @@ class TestServingThroughput(unittest.TestCase):
         )
 
         # Run benchmark
-        num_prompts = 400
+        num_prompts = 500
         args = SimpleNamespace(
             backend="sglang",
             base_url=base_url,
@@ -74,8 +74,7 @@ class TestServingThroughput(unittest.TestCase):
         )
 
         if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            # A100 (PCIE): 1450, H100 (SMX): 2550
-            assert res["output_throughput"] > 2500
+            assert res["output_throughput"] > 2400
 
     def test_default_without_radix_cache(self):
         res = self.run_test(
@@ -85,7 +84,6 @@ class TestServingThroughput(unittest.TestCase):
         )
 
         if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            # A100 (PCIE): 1500, H100 (SMX): 2850
             assert res["output_throughput"] > 2800
 
     def test_default_without_chunked_prefill(self):
@@ -96,18 +94,7 @@ class TestServingThroughput(unittest.TestCase):
         )
 
         if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            # A100 (PCIE): 1450, H100 (SMX): 2550
-            assert res["output_throughput"] > 2500
-
-    def test_all_cases(self):
-        for disable_radix_cache in [False, True]:
-            for disable_flashinfer in [False, True]:
-                for chunked_prefill_size in [-1, 2048]:
-                    self.run_test(
-                        disable_radix_cache=False,
-                        disable_flashinfer=False,
-                        chunked_prefill_size=-1,
-                    )
+            assert res["output_throughput"] > 2400
 
 if __name__ == "__main__":
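
Both throughput test files drop a test_all_cases helper whose nested loops never used their loop variables; every iteration called run_test with the same hard-coded arguments. For reference, a corrected sketch (an illustration only, not part of the commit) would forward them:

    def test_all_cases(self):
        # Sweep every combination and actually forward the loop variables.
        for disable_radix_cache in [False, True]:
            for disable_flashinfer in [False, True]:
                for chunked_prefill_size in [-1, 2048]:
                    self.run_test(
                        disable_radix_cache=disable_radix_cache,
                        disable_flashinfer=disable_flashinfer,
                        chunked_prefill_size=chunked_prefill_size,
                    )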