Unverified commit 1b5d56f7, authored by Lianmin Zheng and committed by GitHub

[CI] Add more multi-gpu tests (#1280)

parent d134c139
@@ -18,7 +18,7 @@ concurrency:
  cancel-in-progress: true
jobs:
  accuracy-test:
  one-gpu:
    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
    runs-on: 1-gpu-runner
@@ -41,3 +41,34 @@ jobs:
        run: |
          cd test/srt
          python3 test_eval_accuracy_large.py

  two-gpu:
    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
    runs-on: 2-gpu-runner
    steps:
      - name: Checkout code
        uses: actions/checkout@v3
      - name: Install dependencies
        run: |
          pip install --upgrade pip
          pip install -e "python[all]"
          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
          git clone https://github.com/merrymercy/human-eval.git
          cd human-eval
          pip install -e .
      - name: Evaluate Accuracy
        timeout-minutes: 20
        run: |
          cd test/srt
          python3 test_moe_eval_accuracy_large.py

  finish:
    needs: [one-gpu, two-gpu]
    runs-on: ubuntu-latest
    steps:
      - name: Finish
        run: echo "This is an empty step to ensure that all jobs are completed."
name: Weekly Cache Purge

on:
  schedule:
    - cron: '0 0 * * 0' # Every Sunday at 00:00
  workflow_dispatch:

jobs:
  purge-cache:
    if: github.repository == 'sgl-project/sglang'
    runs-on: self-hosted
    steps:
      - name: Checkout code
        uses: actions/checkout@v3
      - name: Purge pip cache
        run: |
          source $HOME/venv/bin/activate
          echo "$HOME/venv/bin" >> $GITHUB_PATH
          pip cache purge
      - name: Update dependencies
        run: |
          pip install --upgrade pip
          pip install -e "python[all]"
          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
@@ -18,7 +18,7 @@ concurrency:
  cancel-in-progress: true
jobs:
  e2e-test:
  one-gpu:
    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
    runs-on: 1-gpu-runner
@@ -41,7 +41,8 @@ jobs:
      - name: Benchmark Serving Latency
        timeout-minutes: 10
        run: |
          python3 -m sglang.bench_latency --model meta-llama/Meta-Llama-3.1-8B-Instruct --batch-size 1 --input 128 --output 8
          cd test/srt
          python3 -m unittest test_serving_latency.TestServingLatency.test_default
      - name: Benchmark Serving Throughput (w/o RadixAttention)
        timeout-minutes: 10
@@ -54,3 +55,42 @@ jobs:
        run: |
          cd test/srt
          python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_chunked_prefill

  two-gpu:
    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
    runs-on: 2-gpu-runner
    steps:
      - name: Checkout code
        uses: actions/checkout@v3
      - name: Install dependencies
        run: |
          pip install --upgrade pip
          pip install -e "python[all]"
          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
      - name: Benchmark Serving Throughput (TP=2)
        timeout-minutes: 10
        run: |
          cd test/srt
          python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default
      - name: Benchmark Serving Latency (TP=2)
        timeout-minutes: 10
        run: |
          cd test/srt
          python3 -m unittest test_moe_serving_latency.TestServingLatency.test_default
      - name: Benchmark Serving Throughput (w/o RadixAttention) (TP=2)
        timeout-minutes: 10
        run: |
          cd test/srt
          python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default_without_radix_cache

  finish:
    needs: [one-gpu, two-gpu]
    runs-on: ubuntu-latest
    steps:
      - name: Finish
        run: echo "This is an empty step to ensure that all jobs are completed."
name: MoE Test

on:
  push:
    branches: [ main ]
    paths:
      - "python/sglang/**"
      - "test/**"
  pull_request:
    branches: [ main ]
    paths:
      - "python/sglang/**"
      - "test/**"
  workflow_dispatch:

concurrency:
  group: moe-test-${{ github.ref }}
  cancel-in-progress: true

jobs:
  moe-test:
    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
    runs-on: 2-gpu-runner
    steps:
      - name: Checkout code
        uses: actions/checkout@v3
      - name: Install dependencies
        run: |
          pip install --upgrade pip
          pip install -e "python[all]"
          pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
      - name: Benchmark MoE Serving Throughput
        timeout-minutes: 10
        run: |
          cd test/srt
          python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default
      - name: Benchmark MoE Serving Throughput (w/o RadixAttention)
        timeout-minutes: 10
        run: |
          cd test/srt
          python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default_without_radix_cache
@@ -18,7 +18,7 @@ concurrency:
  cancel-in-progress: true
jobs:
  unit-test-jobs:
  run-test:
    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
    runs-on: 1-gpu-runner
    strategy:
@@ -48,9 +48,9 @@ jobs:
            python3 run_suite.py --suite minimal --range-begin 8
          fi

  unit-test:
    needs: unit-test-jobs
  finish:
    needs: [run-test]
    runs-on: ubuntu-latest
    steps:
      - name: Merge step
        run: echo "This is an empty merge step"
\ No newline at end of file
      - name: Finish
        run: echo "This is an empty step to ensure that all jobs are completed."
@@ -11,26 +11,34 @@ python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct
## plot the results in series of lines:
python -m sglang.bench_latency --result-filename out.jsonl --graph-sql="select run_name, batch_size, prefill_throughput from results"
# Usage (correctness test):
python -m sglang.bench_latency --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 --correct
## Reference output (of the correctness test above, can be gpu dependent):
prefill logits (first half) tensor([[-10.0312, -9.5000, 0.8936, ..., -4.9414, -3.2402, -3.3633],
[-10.0312, -9.5000, 0.8936, ..., -4.9414, -3.2402, -3.3633],
[ -9.1875, -10.2500, 2.7109, ..., -4.3359, -4.0664, -4.1328]],
device='cuda:0', dtype=torch.float16)
prefill logits (final) tensor([[-8.3203, -7.1211, 3.3379, ..., -4.9570, -4.1328, -3.4141],
[-8.9062, -9.0156, 4.1445, ..., -4.9922, -4.4961, -4.0742],
[-9.6328, -9.0547, 4.0117, ..., -5.3047, -4.7148, -4.4609]],
device='cuda:0', dtype=torch.float16)
<s> The capital of France is.
input_ids=[[1, 450, 7483, 310, 3444, 338], [1, 450, 7483, 310, 278, 3303, 13187, 290, 338], [1, 20628, 338, 263, 6575, 1460, 2462, 322, 306, 763]]
prefill logits (first half): tensor([[-10.0312, -9.5000, 0.8931, ..., -4.9414, -3.2422, -3.3633],
[-10.0312, -9.5000, 0.8931, ..., -4.9414, -3.2422, -3.3633],
[ -9.1875, -10.2500, 2.7129, ..., -4.3359, -4.0664, -4.1328]],
device='cuda:0')
prefill logits (final): tensor([[-8.3125, -7.1172, 3.3457, ..., -4.9570, -4.1328, -3.4141],
[-8.9141, -9.0156, 4.1445, ..., -4.9922, -4.4961, -4.0781],
[-9.6328, -9.0547, 4.0195, ..., -5.3047, -4.7148, -4.4570]],
device='cuda:0')
========== Prompt 0 ==========
<s> The capital of France is Paris.
The capital of the United States is Washington, D.C.
<s> The capital of the United Kindom is.
========== Prompt 1 ==========
<s> The capital of the United Kindom is London.
The capital of the United Kingdom is London.
The capital of the
<s> Today is a sunny day and I like go for a walk in the park.
========== Prompt 2 ==========
<s> Today is a sunny day and I like to go for a walk in the park.
I'm going to the park
"""
@@ -225,12 +233,12 @@ def correctness_test(
    # Prepare inputs
    input_ids, reqs = prepare_inputs_for_correctness_test(bench_args, tokenizer)
    rank_print(f"{input_ids=}")
    rank_print(f"\n{input_ids=}\n")

    if bench_args.cut_len > 0:
        # Prefill
        next_token_ids, next_token_logits, batch = extend(reqs, model_runner)
        rank_print("prefill logits (first half)", next_token_logits)
        rank_print(f"prefill logits (first half): {next_token_logits} \n")

        # Prepare extend inputs
        reqs = prepare_extend_inputs_for_correctness_test(
@@ -239,7 +247,7 @@ def correctness_test(

    # Extend
    next_token_ids, next_token_logits, batch = extend(reqs, model_runner)
    rank_print("prefill logits (final)", next_token_logits)
    rank_print(f"prefill logits (final): {next_token_logits} \n")

    # Decode
    output_ids = [input_ids[i] + [next_token_ids[i]] for i in range(len(input_ids))]
@@ -250,7 +258,8 @@ def correctness_test(
    # Print
    for i in range(len(reqs)):
        rank_print(tokenizer.decode(output_ids[i]))
        rank_print(f"========== Prompt {i} ==========")
        rank_print(tokenizer.decode(output_ids[i]), "\n")
@torch.inference_mode()
......
import unittest
from types import SimpleNamespace

from sglang.srt.utils import kill_child_process
from sglang.test.run_eval import run_eval
from sglang.test.test_utils import (
    DEFAULT_MOE_MODEL_NAME_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    popen_launch_server,
)


class TestEvalAccuracyLarge(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MOE_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=[
                "--log-level-http",
                "warning",
                "--tp",
                "2",
            ],
        )

    @classmethod
    def tearDownClass(cls):
        kill_child_process(cls.process.pid)

    def test_mmlu(self):
        args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name="mmlu",
            num_examples=3000,
            num_threads=1024,
        )

        metrics = run_eval(args)
        assert metrics["score"] >= 0.63, f"{metrics}"

    def test_human_eval(self):
        args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name="humaneval",
            num_examples=None,
            num_threads=1024,
        )

        metrics = run_eval(args)
        assert metrics["score"] >= 0.43, f"{metrics}"

    def test_mgsm_en(self):
        args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name="mgsm_en",
            num_examples=None,
            num_threads=1024,
        )

        metrics = run_eval(args)
        assert metrics["score"] >= 0.64, f"{metrics}"


if __name__ == "__main__":
    unittest.main()
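The new test_moe_eval_accuracy_large.py above launches one TP=2 server per test class and runs the mmlu, humaneval, and mgsm_en evals against it. To rerun a single eval by hand against a server that is already up, a minimal sketch along these lines should work; it reuses the helpers imported above, and the smaller sample size is illustrative rather than part of the commit.

from types import SimpleNamespace

from sglang.test.run_eval import run_eval
from sglang.test.test_utils import DEFAULT_MOE_MODEL_NAME_FOR_TEST, DEFAULT_URL_FOR_TEST

# Assumes a TP=2 server for DEFAULT_MOE_MODEL_NAME_FOR_TEST is already
# listening at DEFAULT_URL_FOR_TEST (for example one started manually
# with `python3 -m sglang.launch_server`).
args = SimpleNamespace(
    base_url=DEFAULT_URL_FOR_TEST,
    model=DEFAULT_MOE_MODEL_NAME_FOR_TEST,
    eval_name="mmlu",
    num_examples=64,  # far smaller sample than the 3000 examples used in CI
    num_threads=32,
)
metrics = run_eval(args)
print(f"mmlu score: {metrics['score']:.3f}")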
import os
import subprocess
import unittest

from sglang.srt.utils import kill_child_process
from sglang.test.test_utils import DEFAULT_MOE_MODEL_NAME_FOR_TEST


class TestServingLatency(unittest.TestCase):
    def test_default(self):
        command = [
            "python3",
            "-m",
            "sglang.bench_latency",
            "--model",
            DEFAULT_MOE_MODEL_NAME_FOR_TEST,
            "--batch-size",
            "1",
            "--input",
            "128",
            "--output",
            "8",
            "--tp",
            "2",
        ]
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        stdout, stderr = process.communicate()

        output = stdout.decode()
        error = stderr.decode()
        print(f"Output: {output}")
        print(f"Error: {error}")

        lastline = output.split("\n")[-3]
        value = float(lastline.split(" ")[-2])

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            assert value > 125

        kill_child_process(process.pid)


if __name__ == "__main__":
    unittest.main()
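Both new latency tests shell out to sglang.bench_latency, capture stdout, and read a token/s figure off the end of the summary: output.split("\n")[-3] picks the third line from the end, and split(" ")[-2] picks the numeric field just before the trailing unit. The hypothetical helper below isolates that parsing; the sample summary tail is an assumed format used only for illustration, not captured benchmark output.

def parse_trailing_throughput(output: str, line_from_end: int = 3) -> float:
    """Return the numeric field just before the last token (assumed to be a
    unit such as 'token/s') on the N-th line from the end of the output.
    Mirrors the indexing used in the tests above; adjust line_from_end if
    bench_latency's summary layout changes."""
    line = output.split("\n")[-line_from_end]
    return float(line.split(" ")[-2])

# Example with a made-up summary tail (format assumed for illustration only):
sample = "header\nDecode. median throughput: 131.2 token/s\nfooter\n"
assert parse_trailing_throughput(sample) == 131.2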
@@ -23,7 +23,6 @@ class TestServingThroughput(unittest.TestCase):
            other_args.append("--disable-flashinfer")
        other_args.extend(["--chunked-prefill-size", str(chunked_prefill_size)])
        other_args.extend(["--tensor-parallel-size", "2"])
        other_args.append("--enable-p2p-check")

        model = DEFAULT_MOE_MODEL_NAME_FOR_TEST
        base_url = DEFAULT_URL_FOR_TEST
@@ -35,7 +34,7 @@ class TestServingThroughput(unittest.TestCase):
        )

        # Run benchmark
        num_prompts = 200
        num_prompts = 300
        args = SimpleNamespace(
            backend="sglang",
            base_url=base_url,
@@ -76,8 +75,7 @@ class TestServingThroughput(unittest.TestCase):
        )

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            # A100 (PCIE): 950, H100 (SMX): 1800
            assert res["output_throughput"] > 1750
            assert res["output_throughput"] > 1850

    def test_default_without_radix_cache(self):
        res = self.run_test(
@@ -87,18 +85,7 @@ class TestServingThroughput(unittest.TestCase):
        )

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            # A100 (PCIE): 950, H100 (SMX): 1900
            assert res["output_throughput"] > 1850

    def test_all_cases(self):
        for disable_radix_cache in [False, True]:
            for disable_flashinfer in [False, True]:
                for chunked_prefill_size in [-1, 2048]:
                    self.run_test(
                        disable_radix_cache=False,
                        disable_flashinfer=False,
                        chunked_prefill_size=-1,
                    )
            assert res["output_throughput"] > 1950


if __name__ == "__main__":
......
import os
import subprocess
import unittest

from sglang.srt.utils import kill_child_process
from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST


class TestServingLatency(unittest.TestCase):
    def test_default(self):
        command = [
            "python3",
            "-m",
            "sglang.bench_latency",
            "--model",
            DEFAULT_MODEL_NAME_FOR_TEST,
            "--batch-size",
            "1",
            "--input",
            "128",
            "--output",
            "8",
        ]
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        stdout, stderr = process.communicate()

        output = stdout.decode()
        error = stderr.decode()
        print(f"Output: {output}")
        print(f"Error: {error}")

        lastline = output.split("\n")[-3]
        value = float(lastline.split(" ")[-2])

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            assert value > 130

        kill_child_process(process.pid)


if __name__ == "__main__":
    unittest.main()
@@ -33,7 +33,7 @@ class TestServingThroughput(unittest.TestCase):
        )

        # Run benchmark
        num_prompts = 400
        num_prompts = 500
        args = SimpleNamespace(
            backend="sglang",
            base_url=base_url,
@@ -74,8 +74,7 @@ class TestServingThroughput(unittest.TestCase):
        )

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            # A100 (PCIE): 1450, H100 (SMX): 2550
            assert res["output_throughput"] > 2500
            assert res["output_throughput"] > 2400

    def test_default_without_radix_cache(self):
        res = self.run_test(
@@ -85,7 +84,6 @@ class TestServingThroughput(unittest.TestCase):
        )

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            # A100 (PCIE): 1500, H100 (SMX): 2850
            assert res["output_throughput"] > 2800

    def test_default_without_chunked_prefill(self):
@@ -96,18 +94,7 @@ class TestServingThroughput(unittest.TestCase):
        )

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            # A100 (PCIE): 1450, H100 (SMX): 2550
            assert res["output_throughput"] > 2500

    def test_all_cases(self):
        for disable_radix_cache in [False, True]:
            for disable_flashinfer in [False, True]:
                for chunked_prefill_size in [-1, 2048]:
                    self.run_test(
                        disable_radix_cache=False,
                        disable_flashinfer=False,
                        chunked_prefill_size=-1,
                    )
            assert res["output_throughput"] > 2400


if __name__ == "__main__":
......
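Both throughput files also drop a test_all_cases sweep whose inner call hard-coded disable_radix_cache=False, disable_flashinfer=False, and chunked_prefill_size=-1, so the loops never exercised any other combination. If a full sweep is ever wanted back, a sketch that actually forwards the loop variables could look like the following; it is illustrative only and assumes run_test keeps the signature used above.

import itertools

def sweep_all_cases(run_test):
    # Illustrative only, not part of this commit: call run_test once per
    # combination, forwarding the loop variables instead of fixed defaults.
    for disable_radix_cache, disable_flashinfer, chunked_prefill_size in itertools.product(
        (False, True), (False, True), (-1, 2048)
    ):
        run_test(
            disable_radix_cache=disable_radix_cache,
            disable_flashinfer=disable_flashinfer,
            chunked_prefill_size=chunked_prefill_size,
        )

Inside the test class this would be invoked as sweep_all_cases(self.run_test).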