Unverified Commit a4331cd2 authored by Lianmin Zheng, committed by GitHub

Add accuracy and latency tests of eagle into CI (#3027)

parent ec1c21cd
@@ -128,7 +128,7 @@ jobs:
       timeout-minutes: 10
       run: |
         cd test/srt
-        python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_default
+        python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_bs1

     - name: Benchmark online latency
       timeout-minutes: 10
@@ -148,6 +148,13 @@ jobs:
         cd test/srt
         python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_non_stream_small_batch_size

+    - name: Benchmark online latency (EAGLE)
+      timeout-minutes: 10
+      run: |
+        cd test/srt
+        python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_eagle
+
   performance-test-1-gpu-part-2:
     if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
     runs-on: 1-gpu-runner
@@ -196,7 +203,13 @@ jobs:
       timeout-minutes: 10
       run: |
         cd test/srt
-        python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_default
+        python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
+
+    - name: Benchmark single latency + torch.compile (TP=2)
+      timeout-minutes: 10
+      run: |
+        cd test/srt
+        python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_torch_compile_tp2_bs1

     - name: Benchmark offline throughput (TP=2)
       timeout-minutes: 10
@@ -210,6 +223,7 @@ jobs:
         cd test/srt
         python3 -m unittest test_bench_serving.TestBenchServing.test_moe_offline_throughput_without_radix_cache

   accuracy-test-1-gpu:
     if: github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request'
     runs-on: 1-gpu-runner
......
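The new workflow steps invoke plain unittest targets, so they can be reproduced outside CI. A minimal sketch, assuming a checkout of the repository with the required GPUs available (two GPUs for the TP=2 step):

    cd test/srt
    # new EAGLE online-latency benchmark
    python3 -m unittest test_bench_serving.TestBenchServing.test_online_latency_eagle
    # new torch.compile single-batch latency benchmark in the TP=2 job
    python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_torch_compile_tp2_bs1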
@@ -42,6 +42,9 @@ DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-In
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4"
 DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN = "Qwen/Qwen2.5-1.5B-Instruct"
+DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST = "meta-llama/Llama-2-7b-chat-hf"
+DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST = "lmzheng/sglang-EAGLE-llama2-chat-7B"

 def is_in_ci():
     """Return whether it is in CI runner."""
@@ -538,6 +541,7 @@ def run_bench_serving(
     random_input_len=4096,
     random_output_len=2048,
     disable_stream=False,
+    disable_ignore_eos=False,
     need_warmup=False,
 ):
     # Launch the server
@@ -572,7 +576,7 @@ def run_bench_serving(
         disable_stream=disable_stream,
         return_logprob=False,
         seed=0,
-        disable_ignore_eos=False,
+        disable_ignore_eos=disable_ignore_eos,
         extra_request_body=None,
         apply_chat_template=False,
         profile=None,
......
@@ -37,8 +37,7 @@ class TestQwen2(unittest.TestCase):
             port=int(self.base_url.split(":")[-1]),
         )
         metrics = run_eval(args)
-        print(metrics)
+        print(f"{metrics=}")
         self.assertGreater(metrics["accuracy"], 0.81)
@@ -69,8 +68,7 @@ class TestQwen2FP8(unittest.TestCase):
             port=int(self.base_url.split(":")[-1]),
         )
         metrics = run_eval(args)
-        print(metrics)
+        print(f"{metrics=}")
         self.assertGreater(metrics["accuracy"], 0.79)
......
@@ -5,24 +5,46 @@ from sglang.test.test_utils import (
     DEFAULT_MOE_MODEL_NAME_FOR_TEST,
     is_in_ci,
     run_bench_one_batch,
+    write_github_step_summary,
 )


 class TestBenchOneBatch(unittest.TestCase):
-    def test_default(self):
+    def test_bs1(self):
         output_throughput = run_bench_one_batch(DEFAULT_MODEL_NAME_FOR_TEST, [])

         if is_in_ci():
+            write_github_step_summary(
+                f"### test_bs1\n"
+                f"output_throughput : {output_throughput:.2f} token/s\n"
+            )
             self.assertGreater(output_throughput, 135)

-    def test_moe_default(self):
+    def test_moe_tp2_bs1(self):
         output_throughput = run_bench_one_batch(
             DEFAULT_MOE_MODEL_NAME_FOR_TEST, ["--tp", "2"]
         )

         if is_in_ci():
+            write_github_step_summary(
+                f"### test_moe_tp2_bs1\n"
+                f"output_throughput : {output_throughput:.2f} token/s\n"
+            )
             self.assertGreater(output_throughput, 125)

+    def test_torch_compile_tp2_bs1(self):
+        output_throughput = run_bench_one_batch(
+            DEFAULT_MODEL_NAME_FOR_TEST,
+            ["--tp", "2", "--enable-torch-compile", "--cuda-graph-max-bs", "2"],
+        )
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_torch_compile_tp2_bs1\n"
+                f"output_throughput : {output_throughput:.2f} token/s\n"
+            )
+            self.assertGreater(output_throughput, 240)
+

 if __name__ == "__main__":
     unittest.main()
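The benchmark tests now report their numbers through write_github_step_summary so they appear on the workflow run page. The helper itself is not part of this diff; the underlying GitHub Actions mechanism is appending Markdown to the file named by the GITHUB_STEP_SUMMARY environment variable, roughly:

    # Hypothetical sketch of the mechanism the helper relies on (values are examples).
    # GitHub Actions renders Markdown appended to this file in the run summary.
    echo "### test_bs1" >> "$GITHUB_STEP_SUMMARY"
    echo "output_throughput : 140.00 token/s" >> "$GITHUB_STEP_SUMMARY"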
 import unittest

 from sglang.test.test_utils import (
+    DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
+    DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
     DEFAULT_FP8_MODEL_NAME_FOR_TEST,
     DEFAULT_MODEL_NAME_FOR_TEST,
     DEFAULT_MOE_MODEL_NAME_FOR_TEST,
@@ -47,7 +49,7 @@ class TestBenchServing(unittest.TestCase):
         )
         # There is a regression with torch 2.5
         # This number was 950 for torch 2.4
-        self.assertGreater(res["output_throughput"], 800)
+        self.assertGreater(res["output_throughput"], 850)

     def test_offline_throughput_without_radix_cache(self):
         res = run_bench_serving(
@@ -131,6 +133,36 @@ class TestBenchServing(unittest.TestCase):
         self.assertLess(res["median_ttft_ms"], 86)
         self.assertLess(res["median_itl_ms"], 10)

+    def test_online_latency_eagle(self):
+        res = run_bench_serving(
+            model=DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
+            num_prompts=50,
+            request_rate=1,
+            disable_ignore_eos=True,
+            dataset_name="sharegpt",
+            other_server_args=[
+                "--speculative-algorithm",
+                "EAGLE",
+                "--speculative-draft-model-path",
+                DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
+                "--speculative-num-steps",
+                "5",
+                "--speculative-eagle-topk",
+                "8",
+                "--speculative-num-draft-tokens",
+                "64",
+                "--mem-fraction-static",
+                "0.7",
+            ],
+        )
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_online_latency_eagle\n"
+                f'median_e2e_latency_ms : {res["median_e2e_latency_ms"]:.2f} ms\n'
+            )
+
+        self.assertLess(res["median_e2e_latency_ms"], 10000)
+
     def test_moe_offline_throughput_default(self):
         res = run_bench_serving(
             model=DEFAULT_MOE_MODEL_NAME_FOR_TEST,
......
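The server arguments used by test_online_latency_eagle can also be exercised by hand. A sketch, assuming the standard sglang.launch_server entry point; the speculative flags are taken verbatim from the test above:

    python3 -m sglang.launch_server \
      --model-path meta-llama/Llama-2-7b-chat-hf \
      --speculative-algorithm EAGLE \
      --speculative-draft-model-path lmzheng/sglang-EAGLE-llama2-chat-7B \
      --speculative-num-steps 5 \
      --speculative-eagle-topk 8 \
      --speculative-num-draft-tokens 64 \
      --mem-fraction-static 0.7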
-import multiprocessing
 import random
+import threading
 import time
 import unittest
+from types import SimpleNamespace

 import requests
-from transformers import AutoConfig, AutoTokenizer

 import sglang as sgl
+from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.utils import kill_process_tree
+from sglang.test.few_shot_gsm8k import run_eval
 from sglang.test.test_utils import (
+    DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
+    DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
     DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
     DEFAULT_URL_FOR_TEST,
     popen_launch_server,
@@ -19,60 +23,59 @@ class TestEAGLEEngine(unittest.TestCase):

     def test_eagle_accuracy(self):
         prompt = "Today is a sunny day and I like"
-        target_model_path = "meta-llama/Llama-2-7b-chat-hf"
-        speculative_draft_model_path = "lmzheng/sglang-EAGLE-llama2-chat-7B"
         sampling_params = {"temperature": 0, "max_new_tokens": 8}

+        # Get the reference output
+        ref_engine = sgl.Engine(model_path=DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST)
+        ref_output = ref_engine.generate(prompt, sampling_params)["text"]
+        ref_engine.shutdown()
+
+        # Launch EAGLE engine
         engine = sgl.Engine(
-            model_path=target_model_path,
-            speculative_draft_model_path=speculative_draft_model_path,
+            model_path=DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
+            speculative_draft_model_path=DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
             speculative_algorithm="EAGLE",
-            speculative_num_steps=3,
-            speculative_eagle_topk=4,
-            speculative_num_draft_tokens=16,
+            speculative_num_steps=5,
+            speculative_eagle_topk=8,
+            speculative_num_draft_tokens=64,
+            mem_fraction_static=0.7,
         )
-        out1 = engine.generate(prompt, sampling_params)["text"]
-        engine.shutdown()
-
-        engine = sgl.Engine(model_path=target_model_path)
-        out2 = engine.generate(prompt, sampling_params)["text"]
-        engine.shutdown()
-
-        print("==== Answer 1 ====")
-        print(out1)
-
-        print("==== Answer 2 ====")
-        print(out2)
-        self.assertEqual(out1, out2)
+        # Case 1: Test the output of EAGLE engine is the same as normal engine
+        out1 = engine.generate(prompt, sampling_params)["text"]
+        print(f"{out1=}, {ref_output=}")
+        self.assertEqual(out1, ref_output)

-    def test_eagle_end_check(self):
-        prompt = "[INST] <<SYS>>\\nYou are a helpful assistant.\\n<</SYS>>\\nToday is a sunny day and I like [/INST]"
-        target_model_path = "meta-llama/Llama-2-7b-chat-hf"
-        tokenizer = AutoTokenizer.from_pretrained(target_model_path)
-        speculative_draft_model_path = "lmzheng/sglang-EAGLE-llama2-chat-7B"
+        # Case 2: Test the output of EAGLE engine does not contain unexpected EOS
+        prompt = "[INST] <<SYS>>\\nYou are a helpful assistant.\\n<</SYS>>\\nToday is a sunny day and I like [/INST]"
         sampling_params = {
             "temperature": 0,
             "max_new_tokens": 1024,
             "skip_special_tokens": False,
         }

-        engine = sgl.Engine(
-            model_path=target_model_path,
-            speculative_draft_model_path=speculative_draft_model_path,
-            speculative_algorithm="EAGLE",
-            speculative_num_steps=3,
-            speculative_eagle_topk=4,
-            speculative_num_draft_tokens=16,
-        )
-        out1 = engine.generate(prompt, sampling_params)["text"]
-        engine.shutdown()
-        print("==== Answer 1 ====")
-        print(repr(out1))
-        tokens = tokenizer.encode(out1, truncation=False)
+        tokenizer = get_tokenizer(DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST)
+        out2 = engine.generate(prompt, sampling_params)["text"]
+        print(f"{out2=}")
+        tokens = tokenizer.encode(out2, truncation=False)
         assert tokenizer.eos_token_id not in tokens

+        # Case 3: Batched prompts
+        prompts = [
+            "Hello, my name is",
+            "The president of the United States is",
+            "The capital of France is",
+            "The future of AI is",
+        ]
+        sampling_params = {"temperature": 0, "max_new_tokens": 30}
+        outputs = engine.generate(prompts, sampling_params)
+        for prompt, output in zip(prompts, outputs):
+            print("===============================")
+            print(f"Prompt: {prompt}\nGenerated text: {output['text']}")
+
+        # Shutdown the engine
+        engine.shutdown()
+

 prompts = [
     "[INST] <<SYS>>\\nYou are a helpful assistant.\\n<</SYS>>\\nToday is a sunny day and I like[/INST]"
@@ -83,64 +86,27 @@ prompts = [
 ]


-def process(server_url: str):
-    time.sleep(random.uniform(0, 2))
-    for prompt in prompts:
-        url = server_url
-        data = {
-            "model": "base",
-            "text": prompt,
-            "sampling_params": {
-                "temperature": 0,
-                "max_new_tokens": 1024,
-            },
-        }
-        response = requests.post(url, json=data)
-        assert response.status_code == 200
-
-
-def abort_process(server_url: str):
-    for prompt in prompts:
-        try:
-            time.sleep(1)
-            url = server_url
-            data = {
-                "model": "base",
-                "text": prompt,
-                "sampling_params": {
-                    "temperature": 0,
-                    "max_new_tokens": 1024,
-                },
-            }
-            # set timeout = 1s, mock disconnected
-            requests.post(url, json=data, timeout=1)
-        except:
-            pass
-
-
-class TestEAGLELaunchServer(unittest.TestCase):
+class TestEAGLEServer(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
-        speculative_draft_model_path = "lmzheng/sglang-EAGLE-llama2-chat-7B"
-        cls.model = "meta-llama/Llama-2-7b-chat-hf"
         cls.base_url = DEFAULT_URL_FOR_TEST
         cls.process = popen_launch_server(
-            cls.model,
+            DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
             cls.base_url,
             timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
             other_args=[
                 "--speculative-algorithm",
                 "EAGLE",
                 "--speculative-draft-model-path",
-                speculative_draft_model_path,
+                DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
                 "--speculative-num-steps",
-                "3",
+                "5",
                 "--speculative-eagle-topk",
-                "4",
+                "8",
                 "--speculative-num-draft-tokens",
-                "16",
-                "--served-model-name",
-                "base",
+                "64",
+                "--mem-fraction-static",
+                "0.7",
             ],
         )
@@ -148,40 +114,67 @@ class TestEAGLELaunchServer(unittest.TestCase):
     def tearDownClass(cls):
         kill_process_tree(cls.process.pid)

-    def test_eagle_server_concurrency(self):
-        concurrency = 4
-        processes = [
-            multiprocessing.Process(
-                target=process,
-                kwargs={"server_url": self.base_url + "/generate"},
-            )
-            for _ in range(concurrency)
-        ]
-        for worker in processes:
-            worker.start()
-        for p in processes:
-            p.join()
+    def send_request(self):
+        time.sleep(random.uniform(0, 2))
+        for prompt in prompts:
+            url = self.base_url + "/generate"
+            data = {
+                "text": prompt,
+                "sampling_params": {
+                    "temperature": 0,
+                    "max_new_tokens": 1024,
+                },
+            }
+            response = requests.post(url, json=data)
+            assert response.status_code == 200

-    def test_eagle_server_request_abort(self):
+    def send_requests_abort(self):
+        for prompt in prompts:
+            try:
+                time.sleep(random.uniform(0, 2))
+                url = self.base_url + "/generate"
+                data = {
+                    "model": "base",
+                    "text": prompt,
+                    "sampling_params": {
+                        "temperature": 0,
+                        "max_new_tokens": 1024,
+                    },
+                }
+                # set timeout = 1s, mock disconnected
+                requests.post(url, json=data, timeout=1)
+            except Exception as e:
+                print(e)
+                pass
+
+    def test_request_abort(self):
         concurrency = 4
-        processes = [
-            multiprocessing.Process(
-                target=process,
-                kwargs={"server_url": self.base_url + "/generate"},
-            )
-            for _ in range(concurrency)
+        threads = [
+            threading.Thread(target=self.send_request) for _ in range(concurrency)
         ] + [
-            multiprocessing.Process(
-                target=abort_process,
-                kwargs={"server_url": self.base_url + "/generate"},
-            )
+            threading.Thread(target=self.send_requests_abort)
             for _ in range(concurrency)
         ]
-        for worker in processes:
+        for worker in threads:
             worker.start()
-        for p in processes:
+        for p in threads:
             p.join()

+    def test_gsm8k(self):
+        args = SimpleNamespace(
+            num_shots=5,
+            data_path=None,
+            num_questions=200,
+            max_new_tokens=512,
+            parallel=128,
+            host="http://127.0.0.1",
+            port=int(self.base_url.split(":")[-1]),
+        )
+
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        self.assertGreater(metrics["accuracy"], 0.20)
+

 if __name__ == "__main__":
     unittest.main()
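Once the EAGLE server from setUpClass is running, the requests issued by send_request can be reproduced with a plain HTTP call. A sketch, assuming the server listens on the default local port 30000 (DEFAULT_URL_FOR_TEST may point elsewhere):

    # POST the same payload shape the test uses to the /generate endpoint
    curl -s http://127.0.0.1:30000/generate \
      -H "Content-Type: application/json" \
      -d '{"text": "Today is a sunny day and I like", "sampling_params": {"temperature": 0, "max_new_tokens": 32}}'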
@@ -23,7 +23,7 @@ class TestTorchCompile(unittest.TestCase):
             cls.model,
             cls.base_url,
             timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-            other_args=["--enable-torch-compile"],
+            other_args=["--enable-torch-compile", "--cuda-graph-max-bs", "4"],
         )

     @classmethod
......
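Capping CUDA graph capture with --cuda-graph-max-bs presumably keeps torch.compile warm-up time and memory bounded on the CI runner; the diff does not state the motivation, so treat that as an assumption. The equivalent standalone launch would look roughly like this (the model path is a placeholder, since cls.model is outside this hunk):

    # sketch only; substitute the model actually used by TestTorchCompile
    python3 -m sglang.launch_server \
      --model-path <model> \
      --enable-torch-compile \
      --cuda-graph-max-bs 4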