"vscode:/vscode.git/clone" did not exist on "b975bceff3558b7d93566e18f47f20862cb6b977"
Unverified commit 7e61737d, authored by Vedant V Jhaveri and committed by GitHub

[Generative Scores API] add performance tests to CICD (#10830)

parent 3c699772
@@ -460,6 +460,39 @@ jobs:
          cd test/srt
          python3 -m unittest test_bench_serving.TestBenchServing.test_vlm_online_latency
  performance-test-1-gpu-part-3:
    needs: [check-changes, sgl-kernel-build-wheels]
    if: always() && !failure() && !cancelled() &&
      ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
    runs-on: 1-gpu-runner
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Download artifacts
        if: needs.check-changes.outputs.sgl_kernel == 'true'
        uses: actions/download-artifact@v4
        with:
          path: sgl-kernel/dist/
          merge-multiple: true
          pattern: wheel-python3.10-cuda12.9

      - name: Install dependencies
        run: |
          CUSTOM_BUILD_SGL_KERNEL=${{needs.check-changes.outputs.sgl_kernel}} bash scripts/ci/ci_install_dependency.sh

      - name: Benchmark Scores online latency and throughput
        timeout-minutes: 10
        run: |
          cd test/srt
          python3 -m unittest test_bench_serving.TestBenchServing.test_score_api_latency_throughput

      - name: Benchmark Scores online latency and throughput (batch size scaling)
        timeout-minutes: 10
        run: |
          cd test/srt
          python3 -m unittest test_bench_serving.TestBenchServing.test_score_api_batch_scaling

  performance-test-2-gpu:
    needs: [check-changes, unit-test-backend-2-gpu, sgl-kernel-build-wheels]
    if: always() && !failure() && !cancelled() &&
...
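The two new benchmark steps can also be reproduced outside CI by invoking the same unittest targets from test/srt, assuming a machine with a GPU and the dependencies installed by scripts/ci/ci_install_dependency.sh. A minimal sketch using the standard unittest loader:

import unittest

# Load and run the same test targets that the new CI job invokes.
# Assumes the current working directory is test/srt and a GPU is available
# for the server that the benchmark launches.
suite = unittest.defaultTestLoader.loadTestsFromNames(
    [
        "test_bench_serving.TestBenchServing.test_score_api_latency_throughput",
        "test_bench_serving.TestBenchServing.test_score_api_batch_scaling",
    ]
)
unittest.TextTestRunner(verbosity=2).run(suite)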
@@ -43,6 +43,7 @@ from sglang.utils import get_exception_traceback

DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct"
DEFAULT_SMALL_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"
DEFAULT_SMALL_MODEL_NAME_FOR_TEST_BASE = "meta-llama/Llama-3.2-1B"
DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE = "Qwen/Qwen3-Reranker-0.6B"
DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_BASE = "Qwen/Qwen1.5-MoE-A2.7B"
DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST_CHAT = "Qwen/Qwen1.5-MoE-A2.7B-Chat"

@@ -873,6 +874,154 @@ def run_bench_serving(
    return res
def run_score_benchmark(
    model,
    num_requests=100,
    batch_size=5,
    other_server_args=None,
    need_warmup=False,
    device="auto",
):
    """Score API benchmark function compatible with the run_bench_serving pattern."""
    if other_server_args is None:
        other_server_args = []

    if device == "auto":
        device = auto_config_device()

    # Launch the server (consistent with run_bench_serving)
    base_url = DEFAULT_URL_FOR_TEST
    process = popen_launch_server(
        model,
        base_url,
        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
        other_args=other_server_args,
    )

    async def _run_benchmark():
        # Load tokenizer for generating test data
        from sglang.srt.hf_transformers_utils import get_tokenizer

        tokenizer = get_tokenizer(model)

        # Score API configuration
        score_query_tokens = 120
        score_item_tokens = 180
        score_label_token_ids = [9454, 2753]  # Yes/No token IDs
        special_token = "<|im_start|>"

        def generate_text_with_token_count(num_tokens):
            """Generate text with a precise token count by repeating a single token."""
            text = special_token * num_tokens
            actual_tokens = len(tokenizer.encode(text, add_special_tokens=False))
            if actual_tokens != num_tokens:
                # Fall back to scaling the repetition count by the tokens per special token
                text = special_token * (
                    num_tokens
                    // len(tokenizer.encode(special_token, add_special_tokens=False))
                )
            return text

        if need_warmup:
            warmup_data = {
                "query": generate_text_with_token_count(score_query_tokens),
                "items": [
                    generate_text_with_token_count(score_item_tokens) for _ in range(3)
                ],
                "label_token_ids": score_label_token_ids,
                "model": model,
                "apply_softmax": True,
            }

            async with aiohttp.ClientSession() as session:
                try:
                    await session.post(
                        f"{base_url}/v1/score",
                        json=warmup_data,
                        timeout=aiohttp.ClientTimeout(total=30),
                    )
                except Exception:
                    pass  # Ignore warmup errors

        # Pre-build all request payloads so request construction is excluded from timing
        test_requests = []
        for _ in range(num_requests):
            query = generate_text_with_token_count(score_query_tokens)
            items = [
                generate_text_with_token_count(score_item_tokens)
                for _ in range(batch_size)
            ]
            score_data = {
                "query": query,
                "items": items,
                "label_token_ids": score_label_token_ids,
                "model": model,
                "apply_softmax": True,
            }
            test_requests.append(score_data)

        start_time = time.monotonic()
        successful_requests = 0
        total_latency = 0
        latencies = []

        async with aiohttp.ClientSession() as session:
            for request_data in test_requests:
                try:
                    request_start = time.monotonic()
                    async with session.post(
                        f"{base_url}/v1/score",
                        json=request_data,
                        timeout=aiohttp.ClientTimeout(total=30),
                    ) as response:
                        if response.status == 200:
                            response_data = await response.json()
                            request_end = time.monotonic()

                            if "scores" in response_data or "logprobs" in response_data:
                                latency_ms = (request_end - request_start) * 1000
                                latencies.append(latency_ms)
                                total_latency += latency_ms
                                successful_requests += 1
                except Exception:
                    continue

        end_time = time.monotonic()
        total_time = end_time - start_time

        if successful_requests > 0:
            throughput = successful_requests / total_time
            avg_latency = total_latency / successful_requests
            latencies.sort()
            p95_latency = latencies[int(len(latencies) * 0.95)] if latencies else 0

            return {
                "completed": successful_requests,
                "total_requests": num_requests,
                "throughput": throughput,
                "avg_latency_ms": avg_latency,
                "p95_latency_ms": p95_latency,
                "successful_requests": successful_requests,
            }
        else:
            return {
                "completed": 0,
                "total_requests": num_requests,
                "throughput": 0,
                "avg_latency_ms": 0,
                "p95_latency_ms": 0,
                "successful_requests": 0,
            }

    try:
        res = asyncio.run(_run_benchmark())
    finally:
        kill_process_tree(process.pid)

    assert res["completed"] == res["successful_requests"]
    return res

def run_bench_serving_multi(
    model,
    base_url,
...
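For reference, this is the request shape that run_score_benchmark sends to the server. Below is a minimal standalone sketch of one /v1/score call; the endpoint, payload keys, and label token IDs mirror the benchmark above, while the base URL, the example strings, and the use of requests instead of aiohttp are illustrative assumptions.

import requests

# One /v1/score request mirroring the payload built in run_score_benchmark above.
# base_url is an assumption for illustration; the tests use DEFAULT_URL_FOR_TEST.
base_url = "http://127.0.0.1:30000"
payload = {
    "query": "example query text",
    "items": ["candidate item one", "candidate item two"],
    "label_token_ids": [9454, 2753],  # Yes/No token IDs, as in the benchmark
    "model": "Qwen/Qwen3-Reranker-0.6B",
    "apply_softmax": True,
}
response = requests.post(f"{base_url}/v1/score", json=payload, timeout=30)
print(response.json())  # expected to contain "scores" or "logprobs"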
@@ -4,17 +4,20 @@ import unittest

import requests

from sglang.srt.hf_transformers_utils import get_tokenizer
from sglang.test.test_utils import (
    DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST,
    DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
    DEFAULT_MODEL_NAME_FOR_TEST,
    DEFAULT_MODEL_NAME_FOR_TEST_FP8,
    DEFAULT_MOE_MODEL_NAME_FOR_TEST,
    DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE,
    DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST,
    CustomTestCase,
    is_in_amd_ci,
    is_in_ci,
    run_bench_serving,
    run_score_benchmark,
    write_github_step_summary,
)

@@ -440,6 +443,71 @@ class TestBenchServing(CustomTestCase):
        )
        self.assertGreater(res["input_throughput"], 4000)

    def test_score_api_latency_throughput(self):
        """Test score API latency and throughput performance"""
        res = run_score_benchmark(
            model=DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE,
            num_requests=1000,
            batch_size=10,
            other_server_args=[],
            need_warmup=True,
        )

        if is_in_ci():
            write_github_step_summary(
                f"### test_score_api_throughput\n"
                f"Average latency: {res['avg_latency_ms']:.2f} ms\n"
                f"P95 latency: {res['p95_latency_ms']:.2f} ms\n"
                f"Score API throughput: {res['throughput']:.2f} req/s\n"
                f"Successful requests: {res['successful_requests']}/{res['total_requests']}\n"
            )

        self.assertEqual(res["successful_requests"], res["total_requests"])
        self.assertLess(res["avg_latency_ms"], 48)
        self.assertLess(res["p95_latency_ms"], 50)
        self.assertGreater(res["throughput"], 20)

    def test_score_api_batch_scaling(self):
        """Test score API performance with different batch sizes"""
        batch_sizes = [10, 25, 50]

        for batch_size in batch_sizes:
            res = run_score_benchmark(
                model=DEFAULT_SMALL_MODEL_NAME_FOR_TEST_SCORE,
                num_requests=500,
                batch_size=batch_size,
            )

            if is_in_ci():
                write_github_step_summary(
                    f"### test_score_api_batch_scaling_size_{batch_size}\n"
                    f"Batch size: {batch_size}\n"
                    f"Average latency: {res['avg_latency_ms']:.2f} ms\n"
                    f"P95 latency: {res['p95_latency_ms']:.2f} ms\n"
                    f"Throughput: {res['throughput']:.2f} req/s\n"
                    f"Successful requests: {res['successful_requests']}/{res['total_requests']}\n"
                )

            self.assertEqual(res["successful_requests"], res["total_requests"])

            if batch_size == 10:
                avg_latency_bound = 45
            elif batch_size == 25:
                avg_latency_bound = 50
            elif batch_size == 50:
                avg_latency_bound = 60
            else:
                avg_latency_bound = 60
            self.assertLess(res["avg_latency_ms"], avg_latency_bound)

            if batch_size == 10:
                p95_latency_bound = 50
            elif batch_size == 25:
                p95_latency_bound = 60
            elif batch_size == 50:
                p95_latency_bound = 65
            else:
                p95_latency_bound = 65
            self.assertLess(res["p95_latency_ms"], p95_latency_bound)

if __name__ == "__main__":
    unittest.main()