Unverified commit 1b5d56f7, authored by Lianmin Zheng and committed by GitHub

[CI] Add more multi-gpu tests (#1280)

parent d134c139
@@ -18,7 +18,7 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  accuracy-test:
+  one-gpu:
     if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
     runs-on: 1-gpu-runner
 
@@ -41,3 +41,34 @@ jobs:
       run: |
         cd test/srt
         python3 test_eval_accuracy_large.py
+
+  two-gpu:
+    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
+    runs-on: 2-gpu-runner
+
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v3
+
+    - name: Install dependencies
+      run: |
+        pip install --upgrade pip
+        pip install -e "python[all]"
+        pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
+
+        git clone https://github.com/merrymercy/human-eval.git
+        cd human-eval
+        pip install -e .
+
+    - name: Evaluate Accuracy
+      timeout-minutes: 20
+      run: |
+        cd test/srt
+        python3 test_moe_eval_accuracy_large.py
+
+  finish:
+    needs: [one-gpu, two-gpu]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Finish
+        run: echo "This is an empty step to ensure that all jobs are completed."
name: Weekly Cache Purge

on:
  schedule:
    - cron: '0 0 * * 0'  # Every Sunday at 00:00
  workflow_dispatch:

jobs:
  purge-cache:
    if: github.repository == 'sgl-project/sglang'
    runs-on: self-hosted
    steps:
      - name: Checkout code
        uses: actions/checkout@v3

      - name: Purge pip cache
        run: |
          source $HOME/venv/bin/activate
          echo "$HOME/venv/bin" >> $GITHUB_PATH
          pip cache purge

      - name: Update dependencies
        run: |
          pip install --upgrade pip
          pip install -e "python[all]"
          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
@@ -18,7 +18,7 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  e2e-test:
+  one-gpu:
     if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
     runs-on: 1-gpu-runner
 
@@ -41,7 +41,8 @@ jobs:
     - name: Benchmark Serving Latency
       timeout-minutes: 10
       run: |
-        python3 -m sglang.bench_latency --model meta-llama/Meta-Llama-3.1-8B-Instruct --batch-size 1 --input 128 --output 8
+        cd test/srt
+        python3 -m unittest test_serving_latency.TestServingLatency.test_default
 
     - name: Benchmark Serving Throughput (w/o RadixAttention)
       timeout-minutes: 10
@@ -54,3 +55,42 @@ jobs:
       run: |
         cd test/srt
        python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_chunked_prefill
+
+  two-gpu:
+    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
+    runs-on: 2-gpu-runner
+
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v3
+
+    - name: Install dependencies
+      run: |
+        pip install --upgrade pip
+        pip install -e "python[all]"
+        pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
+
+    - name: Benchmark Serving Throughput (TP=2)
+      timeout-minutes: 10
+      run: |
+        cd test/srt
+        python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default
+
+    - name: Benchmark Serving Latency (TP=2)
+      timeout-minutes: 10
+      run: |
+        cd test/srt
+        python3 -m unittest test_moe_serving_latency.TestServingLatency.test_default
+
+    - name: Benchmark Serving Throughput (w/o RadixAttention) (TP=2)
+      timeout-minutes: 10
+      run: |
+        cd test/srt
+        python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default_without_radix_cache
+
+  finish:
+    needs: [one-gpu, two-gpu]
+    runs-on: ubuntu-latest
+    steps:
+      - name: Finish
+        run: echo "This is an empty step to ensure that all jobs are completed."
name: MoE Test

on:
  push:
    branches: [ main ]
    paths:
      - "python/sglang/**"
      - "test/**"
  pull_request:
    branches: [ main ]
    paths:
      - "python/sglang/**"
      - "test/**"
  workflow_dispatch:

concurrency:
  group: moe-test-${{ github.ref }}
  cancel-in-progress: true

jobs:
  moe-test:
    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
    runs-on: 2-gpu-runner
    steps:
    - name: Checkout code
      uses: actions/checkout@v3

    - name: Install dependencies
      run: |
        pip install --upgrade pip
        pip install -e "python[all]"
        pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall

    - name: Benchmark MoE Serving Throughput
      timeout-minutes: 10
      run: |
        cd test/srt
        python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default

    - name: Benchmark MoE Serving Throughput (w/o RadixAttention)
      timeout-minutes: 10
      run: |
        cd test/srt
        python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default_without_radix_cache
@@ -18,7 +18,7 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  unit-test-jobs:
+  run-test:
     if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
     runs-on: 1-gpu-runner
     strategy:
@@ -48,9 +48,9 @@ jobs:
           python3 run_suite.py --suite minimal --range-begin 8
         fi
 
-  unit-test:
-    needs: unit-test-jobs
+  finish:
+    needs: [run-test]
     runs-on: ubuntu-latest
     steps:
-      - name: Merge step
-        run: echo "This is an empty merge step"
+      - name: Finish
+        run: echo "This is an empty step to ensure that all jobs are completed."
\ No newline at end of file
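
The run-test job above shards the minimal suite across runners by index range (the visible context passes --range-begin 8; the companion shard presumably passes a matching --range-end). A minimal sketch of that kind of index-range sharding, purely illustrative and not the actual test/srt/run_suite.py:

import argparse
import subprocess
import sys

# Placeholder file list; the real suite is defined inside run_suite.py.
SUITE_MINIMAL = ["test_a.py", "test_b.py", "test_c.py", "test_d.py"]

parser = argparse.ArgumentParser()
parser.add_argument("--suite", default="minimal")
parser.add_argument("--range-begin", type=int, default=0)
parser.add_argument("--range-end", type=int, default=None)
args = parser.parse_args()

# Each CI shard runs only its slice of the suite, so two runners split the work.
exit_code = 0
for test_file in SUITE_MINIMAL[args.range_begin : args.range_end]:
    ret = subprocess.run([sys.executable, test_file]).returncode
    exit_code = exit_code or ret
sys.exit(exit_code)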
@@ -11,26 +11,34 @@ python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct
 ## plot the results in series of lines:
 python -m sglang.bench_latency --result-filename out.jsonl --graph-sql="select run_name, batch_size, prefill_throughput from results"
 
 # Usage (correctness test):
 python -m sglang.bench_latency --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 --correct
 
 ## Reference output (of the correctness test above, can be gpu dependent):
-prefill logits (first half) tensor([[-10.0312, -9.5000, 0.8936, ..., -4.9414, -3.2402, -3.3633],
-        [-10.0312, -9.5000, 0.8936, ..., -4.9414, -3.2402, -3.3633],
-        [ -9.1875, -10.2500, 2.7109, ..., -4.3359, -4.0664, -4.1328]],
-       device='cuda:0', dtype=torch.float16)
-prefill logits (final) tensor([[-8.3203, -7.1211, 3.3379, ..., -4.9570, -4.1328, -3.4141],
-        [-8.9062, -9.0156, 4.1445, ..., -4.9922, -4.4961, -4.0742],
-        [-9.6328, -9.0547, 4.0117, ..., -5.3047, -4.7148, -4.4609]],
-       device='cuda:0', dtype=torch.float16)
-<s> The capital of France is.
+input_ids=[[1, 450, 7483, 310, 3444, 338], [1, 450, 7483, 310, 278, 3303, 13187, 290, 338], [1, 20628, 338, 263, 6575, 1460, 2462, 322, 306, 763]]
+
+prefill logits (first half): tensor([[-10.0312, -9.5000, 0.8931, ..., -4.9414, -3.2422, -3.3633],
+        [-10.0312, -9.5000, 0.8931, ..., -4.9414, -3.2422, -3.3633],
+        [ -9.1875, -10.2500, 2.7129, ..., -4.3359, -4.0664, -4.1328]],
+       device='cuda:0')
+
+prefill logits (final): tensor([[-8.3125, -7.1172, 3.3457, ..., -4.9570, -4.1328, -3.4141],
+        [-8.9141, -9.0156, 4.1445, ..., -4.9922, -4.4961, -4.0781],
+        [-9.6328, -9.0547, 4.0195, ..., -5.3047, -4.7148, -4.4570]],
+       device='cuda:0')
+
+========== Prompt 0 ==========
+<s> The capital of France is Paris.
 The capital of the United States is Washington, D.C.
 
-<s> The capital of the United Kindom is.
+========== Prompt 1 ==========
+<s> The capital of the United Kindom is London.
 The capital of the United Kingdom is London.
 The capital of the
 
-<s> Today is a sunny day and I like go for a walk in the park.
+========== Prompt 2 ==========
+<s> Today is a sunny day and I like to go for a walk in the park.
 I'm going to the park
 """
@@ -225,12 +233,12 @@ def correctness_test(
 
     # Prepare inputs
     input_ids, reqs = prepare_inputs_for_correctness_test(bench_args, tokenizer)
-    rank_print(f"{input_ids=}")
+    rank_print(f"\n{input_ids=}\n")
 
     if bench_args.cut_len > 0:
         # Prefill
         next_token_ids, next_token_logits, batch = extend(reqs, model_runner)
-        rank_print("prefill logits (first half)", next_token_logits)
+        rank_print(f"prefill logits (first half): {next_token_logits} \n")
 
         # Prepare extend inputs
         reqs = prepare_extend_inputs_for_correctness_test(
@@ -239,7 +247,7 @@ def correctness_test(
 
     # Extend
     next_token_ids, next_token_logits, batch = extend(reqs, model_runner)
-    rank_print("prefill logits (final)", next_token_logits)
+    rank_print(f"prefill logits (final): {next_token_logits} \n")
 
     # Decode
     output_ids = [input_ids[i] + [next_token_ids[i]] for i in range(len(input_ids))]
@@ -250,7 +258,8 @@ def correctness_test(
 
     # Print
     for i in range(len(reqs)):
-        rank_print(tokenizer.decode(output_ids[i]))
+        rank_print(f"========== Prompt {i} ==========")
+        rank_print(tokenizer.decode(output_ids[i]), "\n")
 
 
 @torch.inference_mode()
import unittest
from types import SimpleNamespace

from sglang.srt.utils import kill_child_process
from sglang.test.run_eval import run_eval
from sglang.test.test_utils import (
    DEFAULT_MOE_MODEL_NAME_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    popen_launch_server,
)


class TestEvalAccuracyLarge(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MOE_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=[
                "--log-level-http",
                "warning",
                "--tp",
                "2",
            ],
        )

    @classmethod
    def tearDownClass(cls):
        kill_child_process(cls.process.pid)

    def test_mmlu(self):
        args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name="mmlu",
            num_examples=3000,
            num_threads=1024,
        )

        metrics = run_eval(args)
        assert metrics["score"] >= 0.63, f"{metrics}"

    def test_human_eval(self):
        args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name="humaneval",
            num_examples=None,
            num_threads=1024,
        )

        metrics = run_eval(args)
        assert metrics["score"] >= 0.43, f"{metrics}"

    def test_mgsm_en(self):
        args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name="mgsm_en",
            num_examples=None,
            num_threads=1024,
        )

        metrics = run_eval(args)
        assert metrics["score"] >= 0.64, f"{metrics}"


if __name__ == "__main__":
    unittest.main()
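
The new test file launches one TP=2 server per class and scores it with run_eval. A minimal sketch (not part of the commit) of calling the same eval harness by hand against a server that is already running, reusing only names that appear above:

from types import SimpleNamespace

from sglang.test.run_eval import run_eval
from sglang.test.test_utils import (
    DEFAULT_MOE_MODEL_NAME_FOR_TEST,
    DEFAULT_URL_FOR_TEST,
)

# Assumes a server was already started elsewhere, e.g. via popen_launch_server
# with other_args=["--tp", "2"] as in TestEvalAccuracyLarge.setUpClass above.
args = SimpleNamespace(
    base_url=DEFAULT_URL_FOR_TEST,
    model=DEFAULT_MOE_MODEL_NAME_FOR_TEST,
    eval_name="mmlu",
    num_examples=64,   # smaller sample than the CI run, for a quick local check
    num_threads=32,
)
metrics = run_eval(args)
print(metrics["score"])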
import os
import subprocess
import unittest

from sglang.srt.utils import kill_child_process
from sglang.test.test_utils import DEFAULT_MOE_MODEL_NAME_FOR_TEST


class TestServingLatency(unittest.TestCase):
    def test_default(self):
        command = [
            "python3",
            "-m",
            "sglang.bench_latency",
            "--model",
            DEFAULT_MOE_MODEL_NAME_FOR_TEST,
            "--batch-size",
            "1",
            "--input",
            "128",
            "--output",
            "8",
            "--tp",
            "2",
        ]
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        stdout, stderr = process.communicate()

        output = stdout.decode()
        error = stderr.decode()
        print(f"Output: {output}")
        print(f"Error: {error}")

        lastline = output.split("\n")[-3]
        value = float(lastline.split(" ")[-2])

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            assert value > 125

        kill_child_process(process.pid)


if __name__ == "__main__":
    unittest.main()
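
test_default above treats bench_latency's stdout as line-oriented text: it takes the third line from the end and reads the second-to-last whitespace-separated token as the throughput in token/s. A small self-contained sketch of that parsing on a synthetic output string (the wording of the summary line is an assumption here; the test only relies on its position and token layout):

# Synthetic stdout; real content comes from `python3 -m sglang.bench_latency ...`.
sample_stdout = (
    "Benchmark ...\n"
    "Decode.  median latency: 0.0062 s, median throughput:    130.00 token/s\n"
    "Total. latency: 1.234 s\n"
)

# Same indexing as in TestServingLatency.test_default above.
lastline = sample_stdout.split("\n")[-3]
value = float(lastline.split(" ")[-2])
assert value == 130.00, value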
@@ -23,7 +23,6 @@ class TestServingThroughput(unittest.TestCase):
             other_args.append("--disable-flashinfer")
         other_args.extend(["--chunked-prefill-size", str(chunked_prefill_size)])
         other_args.extend(["--tensor-parallel-size", "2"])
-        other_args.append("--enable-p2p-check")
 
         model = DEFAULT_MOE_MODEL_NAME_FOR_TEST
         base_url = DEFAULT_URL_FOR_TEST
@@ -35,7 +34,7 @@ class TestServingThroughput(unittest.TestCase):
         )
 
         # Run benchmark
-        num_prompts = 200
+        num_prompts = 300
         args = SimpleNamespace(
             backend="sglang",
             base_url=base_url,
@@ -76,8 +75,7 @@ class TestServingThroughput(unittest.TestCase):
         )
 
         if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            # A100 (PCIE): 950, H100 (SMX): 1800
-            assert res["output_throughput"] > 1750
+            assert res["output_throughput"] > 1850
 
     def test_default_without_radix_cache(self):
         res = self.run_test(
@@ -87,18 +85,7 @@ class TestServingThroughput(unittest.TestCase):
         )
 
         if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            # A100 (PCIE): 950, H100 (SMX): 1900
-            assert res["output_throughput"] > 1850
-
-    def test_all_cases(self):
-        for disable_radix_cache in [False, True]:
-            for disable_flashinfer in [False, True]:
-                for chunked_prefill_size in [-1, 2048]:
-                    self.run_test(
-                        disable_radix_cache=False,
-                        disable_flashinfer=False,
-                        chunked_prefill_size=-1,
-                    )
+            assert res["output_throughput"] > 1950
 
 if __name__ == "__main__":
import os
import subprocess
import unittest

from sglang.srt.utils import kill_child_process
from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST


class TestServingLatency(unittest.TestCase):
    def test_default(self):
        command = [
            "python3",
            "-m",
            "sglang.bench_latency",
            "--model",
            DEFAULT_MODEL_NAME_FOR_TEST,
            "--batch-size",
            "1",
            "--input",
            "128",
            "--output",
            "8",
        ]
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        stdout, stderr = process.communicate()

        output = stdout.decode()
        error = stderr.decode()
        print(f"Output: {output}")
        print(f"Error: {error}")

        lastline = output.split("\n")[-3]
        value = float(lastline.split(" ")[-2])

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            assert value > 130

        kill_child_process(process.pid)


if __name__ == "__main__":
    unittest.main()
@@ -33,7 +33,7 @@ class TestServingThroughput(unittest.TestCase):
         )
 
         # Run benchmark
-        num_prompts = 400
+        num_prompts = 500
         args = SimpleNamespace(
             backend="sglang",
             base_url=base_url,
@@ -74,8 +74,7 @@ class TestServingThroughput(unittest.TestCase):
         )
 
         if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            # A100 (PCIE): 1450, H100 (SMX): 2550
-            assert res["output_throughput"] > 2500
+            assert res["output_throughput"] > 2400
 
     def test_default_without_radix_cache(self):
         res = self.run_test(
@@ -85,7 +84,6 @@ class TestServingThroughput(unittest.TestCase):
         )
 
         if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            # A100 (PCIE): 1500, H100 (SMX): 2850
             assert res["output_throughput"] > 2800
 
     def test_default_without_chunked_prefill(self):
@@ -96,18 +94,7 @@ class TestServingThroughput(unittest.TestCase):
         )
 
         if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
-            # A100 (PCIE): 1450, H100 (SMX): 2550
-            assert res["output_throughput"] > 2500
-
-    def test_all_cases(self):
-        for disable_radix_cache in [False, True]:
-            for disable_flashinfer in [False, True]:
-                for chunked_prefill_size in [-1, 2048]:
-                    self.run_test(
-                        disable_radix_cache=False,
-                        disable_flashinfer=False,
-                        chunked_prefill_size=-1,
-                    )
+            assert res["output_throughput"] > 2400
 
 if __name__ == "__main__":
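
Both throughput test files drop a test_all_cases helper whose nested loops never used their loop variables; every iteration called run_test with the same hard-coded arguments. For reference, a corrected sketch (an illustration only, not part of the commit) would forward them:

    def test_all_cases(self):
        # Sweep every combination and actually forward the loop variables.
        for disable_radix_cache in [False, True]:
            for disable_flashinfer in [False, True]:
                for chunked_prefill_size in [-1, 2048]:
                    self.run_test(
                        disable_radix_cache=disable_radix_cache,
                        disable_flashinfer=disable_flashinfer,
                        chunked_prefill_size=chunked_prefill_size,
                    )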