Unverified Commit 82076370 authored by Lianmin Zheng, committed by GitHub

Improve end-to-end throughput test and its coverage (#1039)

parent 7de60345
@@ -37,23 +37,16 @@ jobs:
       - name: Benchmark Serving Throughput
         run: |
-          python3 -m sglang.launch_server --model meta-llama/Meta-Llama-3.1-8B-Instruct --port 8413 --disable-radix-cache &
-          SERVER_PID=$!
-          echo "Waiting for server to start..."
-          for i in {1..120}; do
-            if curl -s http://127.0.0.1:8413/health; then
-              echo "Server is up!"
-              break
-            fi
-            if [ $i -eq 120 ]; then
-              echo "Server failed to start within 120 seconds"
-              exit 1
-            fi
-            sleep 1
-          done
-          cd $HOME && python3 -m sglang.bench_serving --backend sglang --port 8413 --dataset-name random --num-prompts 500 --random-input 4096 --random-output 2048
-          echo "Stopping server..."
-          kill -9 $SERVER_PID
+          cd test/srt
+          python3 -m unittest test_serving_throughput.TestServingThroughput.test_default
+
+      - name: Benchmark Serving Throughput (w/o RadixAttention)
+        run: |
+          cd test/srt
+          python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_radix_cache
+
+      - name: Benchmark Serving Throughput (w/o FlashInfer)
+        run: |
+          cd test/srt
+          python3 -m unittest test_serving_throughput.TestServingThroughput.test_default_without_flashinfer
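To reproduce these CI steps locally without the shell wrapper, the same three test methods can be driven through unittest's standard loader. The snippet below is a hedged sketch, not part of the commit, and assumes it is run from the test/srt directory so that test_serving_throughput is importable.

# Hedged sketch: run the same three throughput configurations as the CI steps
# above via unittest's Python API. Assumes the current working directory is
# test/srt so that test_serving_throughput can be imported.
import unittest

suite = unittest.TestSuite()
for name in (
    "test_default",
    "test_default_without_radix_cache",
    "test_default_without_flashinfer",
):
    suite.addTest(
        unittest.defaultTestLoader.loadTestsFromName(
            f"test_serving_throughput.TestServingThroughput.{name}"
        )
    )
unittest.TextTestRunner(verbosity=2).run(suite)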
@@ -39,6 +39,8 @@ from transformers import (
 AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
 
+global args
+
 
 @dataclass
 class RequestFuncInput:
@@ -749,7 +751,11 @@ def check_chat_template(model_path):
         return False
 
 
-def fire(args: argparse.Namespace):
+def run_benchmark(args_: argparse.Namespace):
+    global args
+    args = args_
+
+    set_ulimit()
     random.seed(args.seed)
     np.random.seed(args.seed)
@@ -853,7 +859,7 @@ def fire(args: argparse.Namespace):
                 )
             )
     else:
-        asyncio.run(
+        return asyncio.run(
             benchmark(
                 backend=backend,
                 api_url=api_url,
@@ -962,11 +968,6 @@ if __name__ == "__main__":
         "Otherwise, we use Poisson process to synthesize the request arrival times. Default is 128.0.",
     )
     parser.add_argument("--seed", type=int, default=0, help="Default is 0.")
-    parser.add_argument(
-        "--disable-tqdm",
-        action="store_true",
-        help="Specify to disable tqdm progress bar.",
-    )
     parser.add_argument(
         "--multi",
         action="store_true",
@@ -979,6 +980,11 @@ if __name__ == "__main__":
         help="Range of request rates in the format start,stop,step. Default is 2,34,2. It also supports a list of request rates, requiring the parameters to not equal three.",
     )
     parser.add_argument("--output-file", type=str, help="Output JSONL file name.")
+    parser.add_argument(
+        "--disable-tqdm",
+        action="store_true",
+        help="Specify to disable tqdm progress bar.",
+    )
     parser.add_argument(
         "--disable-stream",
         action="store_true",
@@ -996,8 +1002,5 @@ if __name__ == "__main__":
         help="Append given JSON object to the request payload. You can use this to specify"
         "additional generate params like sampling params.",
     )
-    set_ulimit()
     args = parser.parse_args()
-    fire(args)
+    run_benchmark(args)
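Renaming fire to run_benchmark, moving set_ulimit() inside it, and returning asyncio.run(benchmark(...)) turn the script into a callable API: a fully populated namespace goes in and the metrics dict comes back. A minimal sketch of such a call, assuming a server is already listening on the given base URL; the exact field list mirrors the SimpleNamespace built in the new test file below.

# Hedged sketch of the programmatic entry point introduced here; the field
# list mirrors the args constructed in test_serving_throughput.py below.
from types import SimpleNamespace

from sglang.bench_serving import run_benchmark

args = SimpleNamespace(
    backend="sglang",
    base_url="http://127.0.0.1:30000",  # assumption: an already-running server
    host=None,
    port=None,
    dataset_name="random",
    dataset_path="",
    model=None,
    tokenizer=None,
    num_prompts=10,  # small values for a quick smoke run
    sharegpt_output_len=None,
    random_input_len=128,
    random_output_len=128,
    random_range_ratio=0.0,
    request_rate=float("inf"),
    multi=None,
    seed=0,
    output_file=None,
    disable_tqdm=False,
    disable_stream=False,
    disable_ignore_eos=False,
    extra_request_body=None,
)

res = run_benchmark(args)  # the dict returned by benchmark(...)
print(res["completed"])    # number of successfully completed requests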
@@ -21,6 +21,7 @@ from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.utils import get_exception_traceback
 
 DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"
+DEFAULT_URL_FOR_TEST = "http://127.0.0.1:8157"
 
 
 def call_generate_lightllm(prompt, temperature, max_tokens, stop=None, url=None):
...
@@ -3,7 +3,11 @@ from types import SimpleNamespace
 
 from sglang.srt.utils import kill_child_process
 from sglang.test.run_eval import run_eval
-from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_URL_FOR_TEST,
+    popen_launch_server,
+)
 
 
 class TestAccuracy(unittest.TestCase):
@@ -11,7 +15,7 @@ class TestAccuracy(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = "http://127.0.0.1:8157"
+        cls.base_url = DEFAULT_URL_FOR_TEST
         cls.process = popen_launch_server(
             cls.model,
             cls.base_url,
...
@@ -4,7 +4,7 @@ import openai
 
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.utils import kill_child_process
-from sglang.test.test_utils import popen_launch_server
+from sglang.test.test_utils import DEFAULT_URL_FOR_TEST, popen_launch_server
 
 
 class TestOpenAIServer(unittest.TestCase):
@@ -12,7 +12,7 @@ class TestOpenAIServer(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         cls.model = "intfloat/e5-mistral-7b-instruct"
-        cls.base_url = "http://127.0.0.1:8157"
+        cls.base_url = DEFAULT_URL_FOR_TEST
         cls.api_key = "sk-123456"
         cls.process = popen_launch_server(
             cls.model, cls.base_url, timeout=300, api_key=cls.api_key
...
@@ -3,7 +3,11 @@ from types import SimpleNamespace
 
 from sglang.srt.utils import kill_child_process
 from sglang.test.run_eval import run_eval
-from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_URL_FOR_TEST,
+    popen_launch_server,
+)
 
 
 class TestAccuracy(unittest.TestCase):
@@ -11,7 +15,7 @@ class TestAccuracy(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = "http://127.0.0.1:8157"
+        cls.base_url = DEFAULT_URL_FOR_TEST
         cls.process = popen_launch_server(cls.model, cls.base_url, timeout=300)
 
     @classmethod
...
@@ -8,7 +8,11 @@ import openai
 
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.utils import kill_child_process
-from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_URL_FOR_TEST,
+    popen_launch_server,
+)
 
 
 class TestOpenAIServer(unittest.TestCase):
@@ -16,7 +20,7 @@ class TestOpenAIServer(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = "http://127.0.0.1:8157"
+        cls.base_url = DEFAULT_URL_FOR_TEST
         cls.api_key = "sk-123456"
         cls.process = popen_launch_server(
             cls.model,
...
@@ -6,7 +6,11 @@ import openai
 
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.utils import kill_child_process
-from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_URL_FOR_TEST,
+    popen_launch_server,
+)
 
 
 class TestOpenAIServer(unittest.TestCase):
@@ -14,7 +18,7 @@ class TestOpenAIServer(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = "http://127.0.0.1:8157"
+        cls.base_url = DEFAULT_URL_FOR_TEST
         cls.api_key = "sk-123456"
         cls.process = popen_launch_server(
             cls.model, cls.base_url, timeout=300, api_key=cls.api_key
...
import unittest
from types import SimpleNamespace

from sglang.bench_serving import run_benchmark
from sglang.srt.utils import kill_child_process
from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server


class TestServingThroughput(unittest.TestCase):
    def run_test(self, disable_radix_cache, disable_flashinfer, chunked_prefill_size):
        # Launch the server
        other_args = []
        if disable_radix_cache:
            other_args.append("--disable-radix-cache")
        if disable_flashinfer:
            other_args.append("--disable-flashinfer")
        other_args.extend(["--chunked-prefill-size", str(chunked_prefill_size)])

        model = DEFAULT_MODEL_NAME_FOR_TEST
        base_url = "http://127.0.0.1:9157"
        process = popen_launch_server(
            model, base_url, timeout=300, other_args=other_args
        )

        # Run benchmark
        num_prompts = 400
        args = SimpleNamespace(
            backend="sglang",
            base_url=base_url,
            host=None,
            port=None,
            dataset_name="random",
            dataset_path="",
            model=None,
            tokenizer=None,
            num_prompts=num_prompts,
            sharegpt_output_len=None,
            random_input_len=4096,
            random_output_len=2048,
            random_range_ratio=0.0,
            request_rate=float("inf"),
            multi=None,
            seed=0,
            output_file=None,
            disable_tqdm=False,
            disable_stream=False,
            disable_ignore_eos=False,
            extra_request_body=None,
        )

        try:
            res = run_benchmark(args)
        finally:
            kill_child_process(process.pid)

        assert res["completed"] == num_prompts

    def test_default(self):
        self.run_test(
            disable_radix_cache=False,
            disable_flashinfer=False,
            chunked_prefill_size=-1,
        )

    def test_default_without_radix_cache(self):
        self.run_test(
            disable_radix_cache=True,
            disable_flashinfer=False,
            chunked_prefill_size=-1,
        )

    def test_default_without_flashinfer(self):
        self.run_test(
            disable_radix_cache=False,
            disable_flashinfer=True,
            chunked_prefill_size=-1,
        )

    def test_all_cases(self):
        for disable_radix_cache in [False, True]:
            for disable_flashinfer in [False, True]:
                for chunked_prefill_size in [-1, 2048]:
                    self.run_test(
                        disable_radix_cache=disable_radix_cache,
                        disable_flashinfer=disable_flashinfer,
                        chunked_prefill_size=chunked_prefill_size,
                    )

if __name__ == "__main__":
unittest.main()
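If the exhaustive sweep in test_all_cases should report each configuration separately instead of stopping at the first failure, unittest's subTest is one option. A hedged drop-in variant of that method, not part of the commit:

    # Hedged variant of TestServingThroughput.test_all_cases: same sweep, but
    # each configuration is reported as its own subtest.
    def test_all_cases(self):
        for disable_radix_cache in [False, True]:
            for disable_flashinfer in [False, True]:
                for chunked_prefill_size in [-1, 2048]:
                    with self.subTest(
                        disable_radix_cache=disable_radix_cache,
                        disable_flashinfer=disable_flashinfer,
                        chunked_prefill_size=chunked_prefill_size,
                    ):
                        self.run_test(
                            disable_radix_cache=disable_radix_cache,
                            disable_flashinfer=disable_flashinfer,
                            chunked_prefill_size=chunked_prefill_size,
                        )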
@@ -4,7 +4,11 @@ import unittest
 
 import requests
 
 from sglang.srt.utils import kill_child_process
-from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_URL_FOR_TEST,
+    popen_launch_server,
+)
 
 
 class TestSkipTokenizerInit(unittest.TestCase):
@@ -12,7 +16,7 @@ class TestSkipTokenizerInit(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = "http://127.0.0.1:8157"
+        cls.base_url = DEFAULT_URL_FOR_TEST
         cls.process = popen_launch_server(
             cls.model, cls.base_url, timeout=300, other_args=["--skip-tokenizer-init"]
         )
...
@@ -4,7 +4,11 @@ import unittest
 
 import requests
 
 from sglang.srt.utils import kill_child_process
-from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_URL_FOR_TEST,
+    popen_launch_server,
+)
 
 
 class TestSRTEndpoint(unittest.TestCase):
@@ -12,7 +16,7 @@ class TestSRTEndpoint(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = "http://127.0.0.1:8157"
+        cls.base_url = DEFAULT_URL_FOR_TEST
         cls.process = popen_launch_server(cls.model, cls.base_url, timeout=300)
 
     @classmethod
...
import json
import unittest

import requests

from sglang.srt.utils import kill_child_process
from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server


class TestSRTEndpoint(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
        cls.base_url = "http://127.0.0.1:8157"
        cls.process = popen_launch_server(cls.model, cls.base_url, timeout=300)

    @classmethod
    def tearDownClass(cls):
        kill_child_process(cls.process.pid)

    def run_decode(
        self, return_logprob=False, top_logprobs_num=0, return_text=False, n=1
    ):
        response = requests.post(
            self.base_url + "/generate",
            json={
                "text": "The capital of France is",
                "sampling_params": {
                    "temperature": 0 if n == 1 else 0.5,
                    "max_new_tokens": 32,
                    "n": n,
                },
                "stream": False,
                "return_logprob": return_logprob,
                "top_logprobs_num": top_logprobs_num,
                "return_text_in_logprobs": return_text,
                "logprob_start_len": 0,
            },
        )
        print(json.dumps(response.json()))
        print("=" * 100)

    def test_simple_decode(self):
        self.run_decode()

    def test_parallel_sample(self):
        self.run_decode(n=3)

    def test_logprob(self):
        for top_logprobs_num in [0, 3]:
            for return_text in [True, False]:
                self.run_decode(
                    return_logprob=True,
                    top_logprobs_num=top_logprobs_num,
                    return_text=return_text,
                )


if __name__ == "__main__":
    unittest.main()
@@ -3,7 +3,11 @@ from types import SimpleNamespace
 
 from sglang.srt.utils import kill_child_process
 from sglang.test.run_eval import run_eval
-from sglang.test.test_utils import DEFAULT_MODEL_NAME_FOR_TEST, popen_launch_server
+from sglang.test.test_utils import (
+    DEFAULT_MODEL_NAME_FOR_TEST,
+    DEFAULT_URL_FOR_TEST,
+    popen_launch_server,
+)
 
 
 class TestAccuracy(unittest.TestCase):
@@ -11,7 +15,7 @@ class TestAccuracy(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         cls.model = DEFAULT_MODEL_NAME_FOR_TEST
-        cls.base_url = "http://127.0.0.1:8157"
+        cls.base_url = DEFAULT_URL_FOR_TEST
         cls.process = popen_launch_server(
             cls.model, cls.base_url, timeout=300, other_args=["--enable-torch-compile"]
         )
...
@@ -5,7 +5,7 @@ import openai
 
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.utils import kill_child_process
-from sglang.test.test_utils import popen_launch_server
+from sglang.test.test_utils import DEFAULT_URL_FOR_TEST, popen_launch_server
 
 
 class TestOpenAIVisionServer(unittest.TestCase):
@@ -13,7 +13,7 @@ class TestOpenAIVisionServer(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
         cls.model = "liuhaotian/llava-v1.6-vicuna-7b"
-        cls.base_url = "http://127.0.0.1:8157"
+        cls.base_url = DEFAULT_URL_FOR_TEST
         cls.api_key = "sk-123456"
         cls.process = popen_launch_server(
             cls.model,
...