Unverified Commit 68be2f6d authored by Lianmin Zheng, committed by GitHub

[CI] Include triton backend and online serving benchmark into CI (#1408)

parent b912de11
@@ -75,7 +75,7 @@ jobs:
cd test/srt
python3 run_suite.py --suite minimal --range-begin 8
performance-test-1-gpu:
performance-test-1-gpu-part-1:
if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
runs-on: 1-gpu-runner
steps:
@@ -88,29 +88,54 @@ jobs:
pip install -e "python[all]"
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
- name: Benchmark Serving Throughput
- name: Benchmark Offline Throughput
timeout-minutes: 10
run: |
cd test/srt
python3 -m unittest test_serving_throughput.TestServingThroughput.test_default
python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default
- name: Benchmark Serving Latency
- name: Benchmark Offline Throughput (w/o RadixAttention)
timeout-minutes: 10
run: |
cd test/srt
python3 -m unittest test_serving_latency.TestServingLatency.test_default
python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_without_radix_cache
- name: Benchmark Serving Throughput (w/o RadixAttention)
- name: Benchmark Offline Throughput (w/o ChunkedPrefill)
timeout-minutes: 10
run: |
cd test/srt
python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_radix_cache
python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_without_chunked_prefill
- name: Benchmark Serving Throughput (w/o ChunkedPrefill)
- name: Benchmark Offline Throughput (w/ Triton)
timeout-minutes: 10
run: |
cd test/srt
python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_chunked_prefill
python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_with_triton_attention_backend
performance-test-1-gpu-part-2:
if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
runs-on: 1-gpu-runner
steps:
- name: Checkout code
uses: actions/checkout@v3
- name: Install dependencies
run: |
pip install --upgrade pip
pip install -e "python[all]"
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
- name: Benchmark Single Latency
timeout-minutes: 10
run: |
cd test/srt
python3 -m unittest test_bench_latency.TestBenchLatency.test_default
- name: Benchmark Online Latency
timeout-minutes: 10
run: |
cd test/srt
python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_default
performance-test-2-gpu:
if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
@@ -125,23 +150,24 @@ jobs:
pip install -e "python[all]"
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
- name: Benchmark Serving Throughput (TP=2)
- name: Benchmark Offline Throughput (TP=2)
timeout-minutes: 10
run: |
cd test/srt
python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default
python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_default
- name: Benchmark Serving Latency (TP=2)
- name: Benchmark Offline Throughput (w/o RadixAttention) (TP=2)
timeout-minutes: 10
run: |
cd test/srt
python3 -m unittest test_moe_serving_latency.TestServingLatency.test_default
python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_without_radix_cache
- name: Benchmark Serving Throughput (w/o RadixAttention) (TP=2)
- name: Benchmark Single Latency (TP=2)
timeout-minutes: 10
run: |
cd test/srt
python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default_without_radix_cache
python3 -m unittest test_bench_latency.TestBenchLatency.test_moe_default
accuracy-test-1-gpu:
if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
@@ -192,7 +218,7 @@ jobs:
finish:
needs: [
unit-test-frontend, unit-test-backend-part-0, unit-test-backend-part-1,
performance-test-1-gpu, performance-test-2-gpu,
performance-test-1-gpu-part-1, performance-test-1-gpu-part-2, performance-test-2-gpu,
accuracy-test-1-gpu, accuracy-test-2-gpu
]
runs-on: ubuntu-latest
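
sglang/test/test_utils.py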
@@ -7,6 +7,7 @@ import subprocess
import threading
import time
from functools import partial
from types import SimpleNamespace
from typing import Callable, List, Optional
import numpy as np
@@ -14,6 +15,7 @@ import requests
import torch
import torch.nn.functional as F
from sglang.bench_serving import run_benchmark
from sglang.global_config import global_config
from sglang.lang.backend.openai import OpenAI
from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
@@ -501,3 +503,47 @@ def run_unittest_files(files: List[str], timeout_per_file: float):
def get_similarities(vec1, vec2):
return F.cosine_similarity(torch.tensor(vec1), torch.tensor(vec2), dim=0)
def run_bench_serving(model, num_prompts, request_rate, other_server_args):
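    # Helper for the new benchmark tests: launch a server with the given flags,
    # run sglang.bench_serving against it, and return the benchmark's result dict.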
# Launch the server
base_url = DEFAULT_URL_FOR_TEST
process = popen_launch_server(
model,
base_url,
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
other_args=other_server_args,
)
# Run benchmark
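    # The namespace mirrors the CLI arguments of sglang.bench_serving: a synthetic
    # "random" dataset sized by random_input_len/random_output_len, sent at the
    # given request rate (float("inf") submits all prompts at once).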
args = SimpleNamespace(
backend="sglang",
base_url=base_url,
host=None,
port=None,
dataset_name="random",
dataset_path="",
model=None,
tokenizer=None,
num_prompts=num_prompts,
sharegpt_output_len=None,
random_input_len=4096,
random_output_len=2048,
random_range_ratio=0.0,
request_rate=request_rate,
multi=None,
seed=0,
output_file=None,
disable_tqdm=False,
disable_stream=False,
disable_ignore_eos=False,
extra_request_body=None,
)
try:
res = run_benchmark(args)
finally:
kill_child_process(process.pid)
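    # The reported metrics are only meaningful if every prompt completed.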
assert res["completed"] == num_prompts
return res
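
test/srt/test_bench_latency.py (new file; path per the CI commands above)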
import os
import subprocess
import unittest
from sglang.srt.utils import kill_child_process
from sglang.test.test_utils import (
DEFAULT_MODEL_NAME_FOR_TEST,
DEFAULT_MOE_MODEL_NAME_FOR_TEST,
)
class TestBenchLatency(unittest.TestCase):
def test_default(self):
command = [
"python3",
"-m",
"sglang.bench_latency",
"--model-path",
DEFAULT_MODEL_NAME_FOR_TEST,
"--batch-size",
"1",
"--input",
"128",
"--output",
"8",
]
process = subprocess.Popen(
command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
try:
stdout, stderr = process.communicate()
output = stdout.decode()
error = stderr.decode()
print(f"Output: {output}")
print(f"Error: {error}")
lastline = output.split("\n")[-3]
value = float(lastline.split(" ")[-2])
if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
assert value > 130
finally:
kill_child_process(process.pid)
def test_moe_default(self):
command = [
"python3",
"-m",
"sglang.bench_latency",
"--model",
DEFAULT_MOE_MODEL_NAME_FOR_TEST,
"--batch-size",
"1",
"--input",
"128",
"--output",
"8",
"--tp",
"2",
]
process = subprocess.Popen(
command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
try:
stdout, stderr = process.communicate()
output = stdout.decode()
error = stderr.decode()
print(f"Output: {output}")
print(f"Error: {error}")
lastline = output.split("\n")[-3]
value = float(lastline.split(" ")[-2])
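            # Same output-format assumption as test_default; the TP=2 MoE run uses a
            # slightly lower throughput threshold.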
if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
assert value > 125
finally:
kill_child_process(process.pid)
if __name__ == "__main__":
unittest.main()
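
test/srt/test_bench_serving.py (new file)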
import os
import unittest
from sglang.test.test_utils import (
DEFAULT_MODEL_NAME_FOR_TEST,
DEFAULT_MOE_MODEL_NAME_FOR_TEST,
run_bench_serving,
)
class TestBenchServing(unittest.TestCase):
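    # Thresholds are asserted only when SGLANG_IS_IN_CI=true, so local runs simply
    # execute the benchmarks and print their metrics.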
def test_offline_throughput_default(self):
res = run_bench_serving(
model=DEFAULT_MODEL_NAME_FOR_TEST,
num_prompts=500,
request_rate=float("inf"),
other_server_args=[],
)
if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
assert res["output_throughput"] > 2600
def test_offline_throughput_without_radix_cache(self):
res = run_bench_serving(
model=DEFAULT_MODEL_NAME_FOR_TEST,
num_prompts=500,
request_rate=float("inf"),
other_server_args=["--disable-radix-cache"],
)
if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
assert res["output_throughput"] > 2800
def test_offline_throughput_without_chunked_prefill(self):
res = run_bench_serving(
model=DEFAULT_MODEL_NAME_FOR_TEST,
num_prompts=500,
request_rate=float("inf"),
other_server_args=["--chunked-prefill-size", "-1"],
)
if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
assert res["output_throughput"] > 2600
def test_offline_throughput_with_triton_attention_backend(self):
res = run_bench_serving(
model=DEFAULT_MODEL_NAME_FOR_TEST,
num_prompts=500,
request_rate=float("inf"),
other_server_args=[
"--attention-backend",
"triton",
"--context-length",
"8192",
],
)
if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
assert res["output_throughput"] > 2600
def test_online_latency_default(self):
res = run_bench_serving(
model=DEFAULT_MODEL_NAME_FOR_TEST,
num_prompts=100,
request_rate=1,
other_server_args=[],
)
if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
assert res["median_e2e_latency_ms"] < 12000
assert res["median_ttft_ms"] < 78
assert res["median_itl_ms"] < 12
def test_moe_offline_throughput_default(self):
res = run_bench_serving(
model=DEFAULT_MOE_MODEL_NAME_FOR_TEST,
num_prompts=300,
request_rate=float("inf"),
other_server_args=["--tp", "2"],
)
if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
assert res["output_throughput"] > 1850
def test_moe_offline_throughput_without_radix_cache(self):
res = run_bench_serving(
model=DEFAULT_MOE_MODEL_NAME_FOR_TEST,
num_prompts=300,
request_rate=float("inf"),
other_server_args=["--tp", "2", "--disable-radix-cache"],
)
if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
assert res["output_throughput"] > 1950
if __name__ == "__main__":
unittest.main()
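
test/srt/test_moe_serving_latency.py (removed; superseded by TestBenchLatency.test_moe_default)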
import os
import subprocess
import unittest
from sglang.srt.utils import kill_child_process
from sglang.test.test_utils import DEFAULT_MOE_MODEL_NAME_FOR_TEST
class TestServingLatency(unittest.TestCase):
def test_default(self):
command = [
"python3",
"-m",
"sglang.bench_latency",
"--model",
DEFAULT_MOE_MODEL_NAME_FOR_TEST,
"--batch-size",
"1",
"--input",
"128",
"--output",
"8",
"--tp",
"2",
]
process = subprocess.Popen(
command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
stdout, stderr = process.communicate()
output = stdout.decode()
error = stderr.decode()
print(f"Output: {output}")
print(f"Error: {error}")
lastline = output.split("\n")[-3]
value = float(lastline.split(" ")[-2])
if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
assert value > 125
kill_child_process(process.pid)
if __name__ == "__main__":
unittest.main()
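
test/srt/test_moe_serving_throughput.py (removed; superseded by the TestBenchServing MoE tests)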
import os
import unittest
from types import SimpleNamespace
from sglang.bench_serving import run_benchmark
from sglang.srt.server_args import ServerArgs
from sglang.srt.utils import kill_child_process
from sglang.test.test_utils import (
DEFAULT_MOE_MODEL_NAME_FOR_TEST,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
popen_launch_server,
)
class TestServingThroughput(unittest.TestCase):
def run_test(self, disable_radix_cache, attention_backend, chunked_prefill_size):
# Launch the server
other_args = []
if disable_radix_cache:
other_args.append("--disable-radix-cache")
if attention_backend:
other_args.extend(["--attention-backend", attention_backend])
other_args.extend(["--chunked-prefill-size", str(chunked_prefill_size)])
other_args.extend(["--tensor-parallel-size", "2"])
model = DEFAULT_MOE_MODEL_NAME_FOR_TEST
base_url = DEFAULT_URL_FOR_TEST
process = popen_launch_server(
model,
base_url,
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
other_args=other_args,
)
# Run benchmark
num_prompts = 300
args = SimpleNamespace(
backend="sglang",
base_url=base_url,
host=None,
port=None,
dataset_name="random",
dataset_path="",
model=None,
tokenizer=None,
num_prompts=num_prompts,
sharegpt_output_len=None,
random_input_len=4096,
random_output_len=2048,
random_range_ratio=0.0,
request_rate=float("inf"),
multi=None,
seed=0,
output_file=None,
disable_tqdm=False,
disable_stream=False,
disable_ignore_eos=False,
extra_request_body=None,
)
try:
res = run_benchmark(args)
finally:
kill_child_process(process.pid)
assert res["completed"] == num_prompts
return res
def test_default(self):
res = self.run_test(
disable_radix_cache=ServerArgs.disable_radix_cache,
attention_backend=ServerArgs.attention_backend,
chunked_prefill_size=ServerArgs.chunked_prefill_size,
)
if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
assert res["output_throughput"] > 1800
def test_default_without_radix_cache(self):
res = self.run_test(
disable_radix_cache=True,
attention_backend=ServerArgs.attention_backend,
chunked_prefill_size=ServerArgs.chunked_prefill_size,
)
if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
assert res["output_throughput"] > 1950
if __name__ == "__main__":
unittest.main()
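
test/srt/test_serving_latency.py (removed; superseded by TestBenchLatency.test_default)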
import os
import subprocess
import unittest
from sglang.srt.utils import kill_child_process
from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST
class TestServingLatency(unittest.TestCase):
def test_default(self):
command = [
"python3",
"-m",
"sglang.bench_latency",
"--model-path",
DEFAULT_MODEL_NAME_FOR_TEST,
"--batch-size",
"1",
"--input",
"128",
"--output",
"8",
]
process = subprocess.Popen(
command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
stdout, stderr = process.communicate()
output = stdout.decode()
error = stderr.decode()
print(f"Output: {output}")
print(f"Error: {error}")
lastline = output.split("\n")[-3]
value = float(lastline.split(" ")[-2])
if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
assert value > 130
kill_child_process(process.pid)
if __name__ == "__main__":
unittest.main()
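
test/srt/test_serving_throughput.py (removed; superseded by the TestBenchServing offline throughput tests)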
import os
import unittest
from types import SimpleNamespace
from sglang.bench_serving import run_benchmark
from sglang.srt.server_args import ServerArgs
from sglang.srt.utils import kill_child_process
from sglang.test.test_utils import (
DEFAULT_MODEL_NAME_FOR_TEST,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
popen_launch_server,
)
class TestServingThroughput(unittest.TestCase):
def run_test(self, disable_radix_cache, attention_backend, chunked_prefill_size):
# Launch the server
other_args = []
if disable_radix_cache:
other_args.append("--disable-radix-cache")
if attention_backend:
other_args.extend(["--attention-backend", attention_backend])
other_args.extend(["--chunked-prefill-size", str(chunked_prefill_size)])
model = DEFAULT_MODEL_NAME_FOR_TEST
base_url = DEFAULT_URL_FOR_TEST
process = popen_launch_server(
model,
base_url,
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
other_args=other_args,
)
# Run benchmark
num_prompts = 500
args = SimpleNamespace(
backend="sglang",
base_url=base_url,
host=None,
port=None,
dataset_name="random",
dataset_path="",
model=None,
tokenizer=None,
num_prompts=num_prompts,
sharegpt_output_len=None,
random_input_len=4096,
random_output_len=2048,
random_range_ratio=0.0,
request_rate=float("inf"),
multi=None,
seed=0,
output_file=None,
disable_tqdm=False,
disable_stream=False,
disable_ignore_eos=False,
extra_request_body=None,
)
try:
res = run_benchmark(args)
finally:
kill_child_process(process.pid)
assert res["completed"] == num_prompts
return res
def test_default(self):
res = self.run_test(
disable_radix_cache=ServerArgs.disable_radix_cache,
attention_backend=ServerArgs.attention_backend,
chunked_prefill_size=ServerArgs.chunked_prefill_size,
)
if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
assert res["output_throughput"] > 2400
def test_default_without_radix_cache(self):
res = self.run_test(
disable_radix_cache=True,
attention_backend=ServerArgs.attention_backend,
chunked_prefill_size=ServerArgs.chunked_prefill_size,
)
if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
assert res["output_throughput"] > 2800
def test_default_without_chunked_prefill(self):
res = self.run_test(
disable_radix_cache=ServerArgs.disable_radix_cache,
attention_backend=ServerArgs.attention_backend,
chunked_prefill_size=-1,
)
if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
assert res["output_throughput"] > 2400
def test_default_with_triton_attention_backend(self):
res = self.run_test(
disable_radix_cache=ServerArgs.disable_radix_cache,
attention_backend="triton",
chunked_prefill_size=-1,
)
if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
assert res["output_throughput"] > 2400
if __name__ == "__main__":
unittest.main()