"tests/python/vscode:/vscode.git/clone" did not exist on "1d86b796a1575117b1c3f0c69569cf154120c437"
Unverified commit 68be2f6d authored by Lianmin Zheng, committed by GitHub

[CI] Include triton backend and online serving benchmark into CI (#1408)

parent b912de11
CI workflow (GitHub Actions):

@@ -75,7 +75,7 @@ jobs:
         cd test/srt
         python3 run_suite.py --suite minimal --range-begin 8
 
-  performance-test-1-gpu:
+  performance-test-1-gpu-part-1:
     if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
     runs-on: 1-gpu-runner
     steps:
@@ -88,29 +88,54 @@ jobs:
         pip install -e "python[all]"
         pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
 
-    - name: Benchmark Serving Throughput
+    - name: Benchmark Offline Throughput
       timeout-minutes: 10
       run: |
         cd test/srt
-        python3 -m unittest test_serving_throughput.TestServingThroughput.test_default
+        python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default
 
-    - name: Benchmark Serving Latency
+    - name: Benchmark Offline Throughput (w/o RadixAttention)
       timeout-minutes: 10
       run: |
         cd test/srt
-        python3 -m unittest test_serving_latency.TestServingLatency.test_default
+        python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_without_radix_cache
 
-    - name: Benchmark Serving Throughput (w/o RadixAttention)
+    - name: Benchmark Offline Throughput (w/o ChunkedPrefill)
       timeout-minutes: 10
       run: |
         cd test/srt
-        python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_radix_cache
+        python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_without_chunked_prefill
 
-    - name: Benchmark Serving Throughput (w/o ChunkedPrefill)
+    - name: Benchmark Offline Throughput (w/ Triton)
       timeout-minutes: 10
       run: |
         cd test/srt
-        python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_chunked_prefill
+        python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_with_triton_attention_backend
+
+  performance-test-1-gpu-part-2:
+    if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
+    runs-on: 1-gpu-runner
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v3
+
+    - name: Install dependencies
+      run: |
+        pip install --upgrade pip
+        pip install -e "python[all]"
+        pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
+
+    - name: Benchmark Single Latency
+      timeout-minutes: 10
+      run: |
+        cd test/srt
+        python3 -m unittest test_bench_latency.TestBenchLatency.test_default
+
+    - name: Benchmark Online Latency
+      timeout-minutes: 10
+      run: |
+        cd test/srt
+        python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_default
 
   performance-test-2-gpu:
     if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
@@ -125,23 +150,24 @@ jobs:
         pip install -e "python[all]"
         pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/ --force-reinstall
 
-    - name: Benchmark Serving Throughput (TP=2)
+    - name: Benchmark Offline Throughput (TP=2)
       timeout-minutes: 10
       run: |
        cd test/srt
-        python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default
+        python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_default
 
-    - name: Benchmark Serving Latency (TP=2)
+    - name: Benchmark Offline Throughput (w/o RadixAttention) (TP=2)
       timeout-minutes: 10
       run: |
         cd test/srt
-        python3 -m unittest test_moe_serving_latency.TestServingLatency.test_default
+        python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_without_radix_cache
 
-    - name: Benchmark Serving Throughput (w/o RadixAttention) (TP=2)
+    - name: Benchmark Single Latency (TP=2)
       timeout-minutes: 10
       run: |
         cd test/srt
-        python3 -m unittest test_moe_serving_throughput.TestServingThroughput.test_default_without_radix_cache
+        python3 -m unittest test_bench_latency.TestBenchLatency.test_moe_default
 
   accuracy-test-1-gpu:
     if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
@@ -192,7 +218,7 @@ jobs:
   finish:
     needs: [
       unit-test-frontend, unit-test-backend-part-0, unit-test-backend-part-1,
-      performance-test-1-gpu, performance-test-2-gpu,
+      performance-test-1-gpu-part-1, performance-test-1-gpu-part-2, performance-test-2-gpu,
      accuracy-test-1-gpu, accuracy-test-2-gpu
     ]
     runs-on: ubuntu-latest
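For reference, any of the new benchmark steps can be reproduced outside of GitHub Actions by running the same unittest target from the test/srt directory. A minimal sketch, assuming an SGLang development install and a local GPU (the wrapper script below is illustrative, not part of the commit):

# Hypothetical local equivalent of the "Benchmark Offline Throughput" CI step.
# It shells out to the same unittest target that the workflow invokes.
import subprocess

subprocess.run(
    [
        "python3",
        "-m",
        "unittest",
        "test_bench_serving.TestBenchServing.test_offline_throughput_default",
    ],
    cwd="test/srt",  # the test modules live under test/srt, as in the workflow
    check=True,
)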
sglang/test/test_utils.py:

@@ -7,6 +7,7 @@ import subprocess
 import threading
 import time
 from functools import partial
+from types import SimpleNamespace
 from typing import Callable, List, Optional
 
 import numpy as np
@@ -14,6 +15,7 @@ import requests
 import torch
 import torch.nn.functional as F
 
+from sglang.bench_serving import run_benchmark
 from sglang.global_config import global_config
 from sglang.lang.backend.openai import OpenAI
 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
@@ -501,3 +503,47 @@ def run_unittest_files(files: List[str], timeout_per_file: float):
 
 def get_similarities(vec1, vec2):
     return F.cosine_similarity(torch.tensor(vec1), torch.tensor(vec2), dim=0)
+
+
+def run_bench_serving(model, num_prompts, request_rate, other_server_args):
+    # Launch the server
+    base_url = DEFAULT_URL_FOR_TEST
+    process = popen_launch_server(
+        model,
+        base_url,
+        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+        other_args=other_server_args,
+    )
+
+    # Run benchmark
+    args = SimpleNamespace(
+        backend="sglang",
+        base_url=base_url,
+        host=None,
+        port=None,
+        dataset_name="random",
+        dataset_path="",
+        model=None,
+        tokenizer=None,
+        num_prompts=num_prompts,
+        sharegpt_output_len=None,
+        random_input_len=4096,
+        random_output_len=2048,
+        random_range_ratio=0.0,
+        request_rate=request_rate,
+        multi=None,
+        seed=0,
+        output_file=None,
+        disable_tqdm=False,
+        disable_stream=False,
+        disable_ignore_eos=False,
+        extra_request_body=None,
+    )
+
+    try:
+        res = run_benchmark(args)
+    finally:
+        kill_child_process(process.pid)
+
+    assert res["completed"] == num_prompts
+    return res
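The new helper launches a fresh server, drives it with run_benchmark from sglang.bench_serving, and always tears the server down in the finally block before checking that every prompt completed. A minimal usage sketch, mirroring the new test_bench_serving.py tests shown further down (the threshold is illustrative and only enforced in CI there):

# Minimal sketch of calling the new helper from a test; the model constant and
# the 2600 tokens/s floor mirror test_bench_serving.py and are illustrative.
from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, run_bench_serving

res = run_bench_serving(
    model=DEFAULT_MODEL_NAME_FOR_TEST,
    num_prompts=500,
    request_rate=float("inf"),  # offline mode: all requests issued at once
    other_server_args=[],
)
assert res["output_throughput"] > 2600  # output tokens per second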
test/srt/test_bench_latency.py (new file):

import os
import subprocess
import unittest

from sglang.srt.utils import kill_child_process
from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_TEST,
    DEFAULT_MOE_MODEL_NAME_FOR_TEST,
)


class TestBenchLatency(unittest.TestCase):
    def test_default(self):
        command = [
            "python3",
            "-m",
            "sglang.bench_latency",
            "--model-path",
            DEFAULT_MODEL_NAME_FOR_TEST,
            "--batch-size",
            "1",
            "--input",
            "128",
            "--output",
            "8",
        ]
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )

        try:
            stdout, stderr = process.communicate()
            output = stdout.decode()
            error = stderr.decode()
            print(f"Output: {output}")
            print(f"Error: {error}")

            lastline = output.split("\n")[-3]
            value = float(lastline.split(" ")[-2])

            if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
                assert value > 130
        finally:
            kill_child_process(process.pid)

    def test_moe_default(self):
        command = [
            "python3",
            "-m",
            "sglang.bench_latency",
            "--model",
            DEFAULT_MOE_MODEL_NAME_FOR_TEST,
            "--batch-size",
            "1",
            "--input",
            "128",
            "--output",
            "8",
            "--tp",
            "2",
        ]
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )

        try:
            stdout, stderr = process.communicate()
            output = stdout.decode()
            error = stderr.decode()
            print(f"Output: {output}")
            print(f"Error: {error}")

            lastline = output.split("\n")[-3]
            value = float(lastline.split(" ")[-2])

            if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
                assert value > 125
        finally:
            kill_child_process(process.pid)


if __name__ == "__main__":
    unittest.main()
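Both tests scrape the decode throughput from the benchmark's stdout rather than reading it programmatically: the third line from the end of the output is expected to carry the number as its second-to-last whitespace-separated token. A small illustration of that parsing with a made-up output string (the real text comes from sglang.bench_latency, so only the shape of the trailing lines is assumed here):

# Made-up stdout to illustrate the "[-3]" / "[-2]" indexing used above.
output = "...\nDecode. median throughput: 154.2 token/s\n\n"

lastline = output.split("\n")[-3]       # skips the two trailing empty strings
value = float(lastline.split(" ")[-2])  # "154.2"
print(value)                            # 154.2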
test/srt/test_bench_serving.py (new file):

import os
import unittest

from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_TEST,
    DEFAULT_MOE_MODEL_NAME_FOR_TEST,
    run_bench_serving,
)


class TestBenchServing(unittest.TestCase):
    def test_offline_throughput_default(self):
        res = run_bench_serving(
            model=DEFAULT_MODEL_NAME_FOR_TEST,
            num_prompts=500,
            request_rate=float("inf"),
            other_server_args=[],
        )

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            assert res["output_throughput"] > 2600

    def test_offline_throughput_without_radix_cache(self):
        res = run_bench_serving(
            model=DEFAULT_MODEL_NAME_FOR_TEST,
            num_prompts=500,
            request_rate=float("inf"),
            other_server_args=["--disable-radix-cache"],
        )

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            assert res["output_throughput"] > 2800

    def test_offline_throughput_without_chunked_prefill(self):
        res = run_bench_serving(
            model=DEFAULT_MODEL_NAME_FOR_TEST,
            num_prompts=500,
            request_rate=float("inf"),
            other_server_args=["--chunked-prefill-size", "-1"],
        )

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            assert res["output_throughput"] > 2600

    def test_offline_throughput_with_triton_attention_backend(self):
        res = run_bench_serving(
            model=DEFAULT_MODEL_NAME_FOR_TEST,
            num_prompts=500,
            request_rate=float("inf"),
            other_server_args=[
                "--attention-backend",
                "triton",
                "--context-length",
                "8192",
            ],
        )

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            assert res["output_throughput"] > 2600

    def test_online_latency_default(self):
        res = run_bench_serving(
            model=DEFAULT_MODEL_NAME_FOR_TEST,
            num_prompts=100,
            request_rate=1,
            other_server_args=[],
        )

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            assert res["median_e2e_latency_ms"] < 12000
            assert res["median_ttft_ms"] < 78
            assert res["median_itl_ms"] < 12

    def test_moe_offline_throughput_default(self):
        res = run_bench_serving(
            model=DEFAULT_MOE_MODEL_NAME_FOR_TEST,
            num_prompts=300,
            request_rate=float("inf"),
            other_server_args=["--tp", "2"],
        )

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            assert res["output_throughput"] > 1850

    def test_moe_offline_throughput_without_radix_cache(self):
        res = run_bench_serving(
            model=DEFAULT_MOE_MODEL_NAME_FOR_TEST,
            num_prompts=300,
            request_rate=float("inf"),
            other_server_args=["--tp", "2", "--disable-radix-cache"],
        )

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            assert res["output_throughput"] > 1950


if __name__ == "__main__":
    unittest.main()
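Every scenario follows the same three-step pattern: pick server flags, call run_bench_serving, and gate the threshold on SGLANG_IS_IN_CI so local runs only report numbers. A hypothetical extra case written against that pattern (the flag combination and the 2500 tokens/s floor are examples for illustration, not part of this commit):

# Hypothetical additional benchmark case; flags and threshold are illustrative.
import os
import unittest

from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, run_bench_serving


class TestBenchServingExtra(unittest.TestCase):
    def test_offline_throughput_without_radix_cache_or_chunked_prefill(self):
        res = run_bench_serving(
            model=DEFAULT_MODEL_NAME_FOR_TEST,
            num_prompts=500,
            request_rate=float("inf"),
            other_server_args=[
                "--disable-radix-cache",
                "--chunked-prefill-size",
                "-1",
            ],
        )

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            assert res["output_throughput"] > 2500


if __name__ == "__main__":
    unittest.main()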
test/srt/test_moe_serving_latency.py (deleted in this commit):

import os
import subprocess
import unittest

from sglang.srt.utils import kill_child_process
from sglang.test.test_utils import DEFAULT_MOE_MODEL_NAME_FOR_TEST


class TestServingLatency(unittest.TestCase):
    def test_default(self):
        command = [
            "python3",
            "-m",
            "sglang.bench_latency",
            "--model",
            DEFAULT_MOE_MODEL_NAME_FOR_TEST,
            "--batch-size",
            "1",
            "--input",
            "128",
            "--output",
            "8",
            "--tp",
            "2",
        ]
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        stdout, stderr = process.communicate()
        output = stdout.decode()
        error = stderr.decode()
        print(f"Output: {output}")
        print(f"Error: {error}")

        lastline = output.split("\n")[-3]
        value = float(lastline.split(" ")[-2])

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            assert value > 125

        kill_child_process(process.pid)


if __name__ == "__main__":
    unittest.main()
test/srt/test_moe_serving_throughput.py (deleted in this commit):

import os
import unittest
from types import SimpleNamespace

from sglang.bench_serving import run_benchmark
from sglang.srt.server_args import ServerArgs
from sglang.srt.utils import kill_child_process
from sglang.test.test_utils import (
    DEFAULT_MOE_MODEL_NAME_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    popen_launch_server,
)


class TestServingThroughput(unittest.TestCase):
    def run_test(self, disable_radix_cache, attention_backend, chunked_prefill_size):
        # Launch the server
        other_args = []
        if disable_radix_cache:
            other_args.append("--disable-radix-cache")
        if attention_backend:
            other_args.extend(["--attention-backend", attention_backend])
        other_args.extend(["--chunked-prefill-size", str(chunked_prefill_size)])
        other_args.extend(["--tensor-parallel-size", "2"])

        model = DEFAULT_MOE_MODEL_NAME_FOR_TEST
        base_url = DEFAULT_URL_FOR_TEST
        process = popen_launch_server(
            model,
            base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=other_args,
        )

        # Run benchmark
        num_prompts = 300
        args = SimpleNamespace(
            backend="sglang",
            base_url=base_url,
            host=None,
            port=None,
            dataset_name="random",
            dataset_path="",
            model=None,
            tokenizer=None,
            num_prompts=num_prompts,
            sharegpt_output_len=None,
            random_input_len=4096,
            random_output_len=2048,
            random_range_ratio=0.0,
            request_rate=float("inf"),
            multi=None,
            seed=0,
            output_file=None,
            disable_tqdm=False,
            disable_stream=False,
            disable_ignore_eos=False,
            extra_request_body=None,
        )

        try:
            res = run_benchmark(args)
        finally:
            kill_child_process(process.pid)

        assert res["completed"] == num_prompts
        return res

    def test_default(self):
        res = self.run_test(
            disable_radix_cache=ServerArgs.disable_radix_cache,
            attention_backend=ServerArgs.attention_backend,
            chunked_prefill_size=ServerArgs.chunked_prefill_size,
        )

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            assert res["output_throughput"] > 1800

    def test_default_without_radix_cache(self):
        res = self.run_test(
            disable_radix_cache=True,
            attention_backend=ServerArgs.attention_backend,
            chunked_prefill_size=ServerArgs.chunked_prefill_size,
        )

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            assert res["output_throughput"] > 1950


if __name__ == "__main__":
    unittest.main()
test/srt/test_serving_latency.py (deleted in this commit):

import os
import subprocess
import unittest

from sglang.srt.utils import kill_child_process
from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST


class TestServingLatency(unittest.TestCase):
    def test_default(self):
        command = [
            "python3",
            "-m",
            "sglang.bench_latency",
            "--model-path",
            DEFAULT_MODEL_NAME_FOR_TEST,
            "--batch-size",
            "1",
            "--input",
            "128",
            "--output",
            "8",
        ]
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
        stdout, stderr = process.communicate()
        output = stdout.decode()
        error = stderr.decode()
        print(f"Output: {output}")
        print(f"Error: {error}")

        lastline = output.split("\n")[-3]
        value = float(lastline.split(" ")[-2])

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            assert value > 130

        kill_child_process(process.pid)


if __name__ == "__main__":
    unittest.main()
test/srt/test_serving_throughput.py (deleted in this commit):

import os
import unittest
from types import SimpleNamespace

from sglang.bench_serving import run_benchmark
from sglang.srt.server_args import ServerArgs
from sglang.srt.utils import kill_child_process
from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    popen_launch_server,
)


class TestServingThroughput(unittest.TestCase):
    def run_test(self, disable_radix_cache, attention_backend, chunked_prefill_size):
        # Launch the server
        other_args = []
        if disable_radix_cache:
            other_args.append("--disable-radix-cache")
        if attention_backend:
            other_args.extend(["--attention-backend", attention_backend])
        other_args.extend(["--chunked-prefill-size", str(chunked_prefill_size)])

        model = DEFAULT_MODEL_NAME_FOR_TEST
        base_url = DEFAULT_URL_FOR_TEST
        process = popen_launch_server(
            model,
            base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=other_args,
        )

        # Run benchmark
        num_prompts = 500
        args = SimpleNamespace(
            backend="sglang",
            base_url=base_url,
            host=None,
            port=None,
            dataset_name="random",
            dataset_path="",
            model=None,
            tokenizer=None,
            num_prompts=num_prompts,
            sharegpt_output_len=None,
            random_input_len=4096,
            random_output_len=2048,
            random_range_ratio=0.0,
            request_rate=float("inf"),
            multi=None,
            seed=0,
            output_file=None,
            disable_tqdm=False,
            disable_stream=False,
            disable_ignore_eos=False,
            extra_request_body=None,
        )

        try:
            res = run_benchmark(args)
        finally:
            kill_child_process(process.pid)

        assert res["completed"] == num_prompts
        return res

    def test_default(self):
        res = self.run_test(
            disable_radix_cache=ServerArgs.disable_radix_cache,
            attention_backend=ServerArgs.attention_backend,
            chunked_prefill_size=ServerArgs.chunked_prefill_size,
        )

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            assert res["output_throughput"] > 2400

    def test_default_without_radix_cache(self):
        res = self.run_test(
            disable_radix_cache=True,
            attention_backend=ServerArgs.attention_backend,
            chunked_prefill_size=ServerArgs.chunked_prefill_size,
        )

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            assert res["output_throughput"] > 2800

    def test_default_without_chunked_prefill(self):
        res = self.run_test(
            disable_radix_cache=ServerArgs.disable_radix_cache,
            attention_backend=ServerArgs.attention_backend,
            chunked_prefill_size=-1,
        )

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            assert res["output_throughput"] > 2400

    def test_default_with_triton_attention_backend(self):
        res = self.run_test(
            disable_radix_cache=ServerArgs.disable_radix_cache,
            attention_backend="triton",
            chunked_prefill_size=-1,
        )

        if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
            assert res["output_throughput"] > 2400


if __name__ == "__main__":
    unittest.main()