sglang / Commits / e88dd482

Commit e88dd482 (unverified), authored May 07, 2025 by XinyuanTong, committed via GitHub on May 07, 2025

[CI]Add performance CI for VLM (#6038)

Signed-off-by: Xinyuan Tong <justinning0323@outlook.com>

parent 73600673
Changes: 5 changed files with 219 additions and 4 deletions (+219, −4)
.github/workflows/pr-test.yml  (+12, −0)
python/sglang/bench_serving.py  (+149, −1)
python/sglang/test/test_utils.py  (+2, −1)
test/srt/test_bench_serving.py  (+54, −0)
test/srt/test_skip_tokenizer_init.py  (+2, −2)
.github/workflows/pr-test.yml

@@ -162,6 +162,18 @@ jobs:
           cd test/srt
           python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_default_fp8
+
+      - name: Benchmark VLM offline throughput
+        timeout-minutes: 10
+        run: |
+          cd test/srt
+          python3 -m unittest test_bench_serving.TestBenchServing.test_vlm_offline_throughput
+
+      - name: Benchmark VLM online latency
+        timeout-minutes: 10
+        run: |
+          cd test/srt
+          python3 -m unittest test_bench_serving.TestBenchServing.test_vlm_online_latency
   performance-test-2-gpu:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
       github.event.pull_request.draft == false
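For reference, the two new workflow steps simply run unittest targets from the test/srt directory. A minimal local sketch (not part of the commit) that mirrors what these CI steps do, assuming an sglang development checkout with a GPU available:

```python
# Run the two new VLM benchmark targets the same way the CI steps do.
import subprocess

targets = [
    "test_bench_serving.TestBenchServing.test_vlm_offline_throughput",
    "test_bench_serving.TestBenchServing.test_vlm_online_latency",
]
for target in targets:
    # Equivalent to: cd test/srt && python3 -m unittest <target>
    subprocess.run(["python3", "-m", "unittest", target], cwd="test/srt", check=True)
```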
python/sglang/bench_serving.py

@@ -58,6 +58,7 @@ class RequestFuncInput:
     output_len: int
     model: str
     lora_name: str
+    image_data: str
     extra_request_body: Dict[str, Any]
@@ -347,6 +348,11 @@ async def async_request_sglang_generate(
             "logprob_start_len": -1,
             **request_func_input.extra_request_body,
         }
+
+        # Add image data if available
+        if request_func_input.image_data:
+            payload["image_data"] = request_func_input.image_data
+
         headers = get_auth_headers()
         output = RequestFuncOutput()
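The hunk above only attaches image data to the generate request when the input actually carries it. A standalone, illustrative sketch of that pattern (FakeInput below is a hypothetical stand-in for RequestFuncInput, not sglang code):

```python
from dataclasses import dataclass
from typing import Any, Dict, Optional


@dataclass
class FakeInput:
    # Hypothetical stand-in for RequestFuncInput, for illustration only.
    prompt: str
    image_data: Optional[str] = None


def build_payload(inp: FakeInput) -> Dict[str, Any]:
    payload = {"text": inp.prompt, "sampling_params": {"max_new_tokens": 16}}
    # Only add the key when image data is present, mirroring the diff above.
    if inp.image_data:
        payload["image_data"] = inp.image_data
    return payload


print(build_payload(FakeInput("describe the image")))
print(build_payload(FakeInput("describe the image", "data:image/jpeg;base64,AAAA")))
```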
@@ -510,6 +516,13 @@ def get_dataset(args, tokenizer):
             tokenizer=tokenizer,
             args=args,
         )
+    elif args.dataset_name == "mmmu":
+        input_requests = sample_mmmu_requests(
+            num_requests=args.num_prompts,
+            tokenizer=tokenizer,
+            fixed_output_len=args.random_output_len,
+            random_sample=True,
+        )
     else:
         raise ValueError(f"Unknown dataset: {args.dataset_name}")
     return input_requests
@@ -597,6 +610,121 @@ def download_and_cache_file(url: str, filename: Optional[str] = None):
     return filename


+def sample_mmmu_requests(
+    num_requests: int,
+    tokenizer: PreTrainedTokenizerBase,
+    fixed_output_len: Optional[int] = None,
+    random_sample: bool = True,
+) -> List[Tuple[str, int, int]]:
+    """
+    Sample requests from the MMMU dataset using HuggingFace datasets.
+
+    Args:
+        num_requests: Number of requests to sample.
+        tokenizer: Tokenizer to use for token counting.
+        fixed_output_len: If provided, use this fixed output length for all requests.
+        random_sample: Whether to randomly sample or take the first N.
+
+    Returns:
+        List of tuples (prompt, prompt_token_len, output_token_len).
+    """
+    try:
+        import base64
+        import io
+
+        from datasets import load_dataset
+    except ImportError:
+        raise ImportError("Please install datasets: pip install datasets")
+
+    print("Loading MMMU dataset from HuggingFace...")
+
+    try:
+        print("Attempting to load MMMU Math dataset...")
+        mmmu_dataset = load_dataset("MMMU/MMMU", "Math", split="test")
+        print(
+            f"Successfully loaded MMMU Math dataset from HuggingFace with {len(mmmu_dataset)} examples"
+        )
+    except Exception as e:
+        print(f"Failed to load MMMU Math dataset: {e}")
+        raise ValueError(f"Failed to load MMMU dataset: {e}")
+
+    # Sample from the dataset
+    if len(mmmu_dataset) > num_requests:
+        if random_sample:
+            # Random sample
+            indices = random.sample(range(len(mmmu_dataset)), num_requests)
+            sample_dataset = mmmu_dataset.select(indices)
+        else:
+            # Take first N
+            sample_dataset = mmmu_dataset.select(
+                range(min(num_requests, len(mmmu_dataset)))
+            )
+    else:
+        print(f"Dataset has less than {num_requests} examples, using all examples")
+        sample_dataset = mmmu_dataset
+
+    print(f"Selected {len(sample_dataset)} examples for benchmarking")
+
+    # Create prompts
+    filtered_dataset = []
+
+    for i, example in enumerate(sample_dataset):
+        try:
+            # Extract image_1
+            image = example.get("image_1")
+
+            if image is not None:
+                if hasattr(image, "save"):
+                    # Convert RGBA images to RGB before encoding
+                    if image.mode == "RGBA":
+                        image = image.convert("RGB")
+
+                    # Encode image to base64
+                    buffered = io.BytesIO()
+                    image.save(buffered, format="JPEG")
+                    img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
+                    image_path = f"data:image/jpeg;base64,{img_str}"
+                else:
+                    continue
+
+                # Extract the question
+                question = example.get("question")
+
+                # Create the prompt with image, question
+                prompt = f"Question: {question}\n\nAnswer: "
+                prompt = tokenizer.apply_chat_template(
+                    [
+                        {
+                            "role": "user",
+                            "content": [
+                                {"type": "image_url", "image_url": {"url": image_path}},
+                                {"type": "text", "text": prompt},
+                            ],
+                        }
+                    ],
+                    add_generation_prompt=True,
+                    tokenize=False,
+                )
+                prompt = f"<image>{image_path}</image>{prompt}"
+
+                # Calculate token lengths
+                # Note: This is approximate since we're not rendering the actual image tokens
+                prompt_token_ids = tokenizer.encode(prompt)
+                prompt_len = (
+                    len(prompt_token_ids) + 512
+                )  # Add estimate for image tokens
+
+                output_len = fixed_output_len if fixed_output_len is not None else 256
+
+                filtered_dataset.append((prompt, prompt_len, output_len))
+
+        except Exception as e:
+            print(f"Error processing example {i}: {e}")
+
+    print(f"\nCreated {len(filtered_dataset)} MMMU prompts")
+    return filtered_dataset
+
+
 def sample_sharegpt_requests(
     dataset_path: str,
     num_requests: int,
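The heart of sample_mmmu_requests is turning each example's PIL image into a base64 JPEG data URL that can be embedded in the prompt. A self-contained sketch of just that step (requires Pillow; the helper name is mine, not sglang's):

```python
import base64
import io

from PIL import Image


def to_jpeg_data_url(image: Image.Image) -> str:
    # JPEG has no alpha channel, so RGBA images are converted first,
    # mirroring the RGBA -> RGB handling in sample_mmmu_requests.
    if image.mode == "RGBA":
        image = image.convert("RGB")
    buffered = io.BytesIO()
    image.save(buffered, format="JPEG")
    encoded = base64.b64encode(buffered.getvalue()).decode("utf-8")
    return f"data:image/jpeg;base64,{encoded}"


# Tiny synthetic image just to show the shape of the resulting data URL.
print(to_jpeg_data_url(Image.new("RGBA", (8, 8), (255, 0, 0, 255)))[:40] + "...")
```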
@@ -1004,6 +1132,15 @@ async def benchmark(
     else:
         lora_name = None

+    if "<image>" in test_prompt:
+        import re
+
+        image_match = re.search(r"<image>(.*?)</image>(.*)", test_prompt)
+        image_data = image_match.group(1) if image_match else None
+        test_prompt = image_match.group(2) if image_match else test_prompt
+    else:
+        image_data = None
+
     # Create the test input once
     test_input = RequestFuncInput(
         model=model_id,
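Prompts produced by sample_mmmu_requests embed the encoded image as "<image>...</image>" in front of the text, and the hunk above splits that back into (image_data, text) before building the warm-up request. A standalone sketch of the same regex split (the sample strings are illustrative):

```python
import re

prompt = "<image>data:image/jpeg;base64,AAAA</image>Question: what is shown? Answer: "

# Same pattern as the diff: group(1) is the image payload, group(2) the remaining text.
image_match = re.search(r"<image>(.*?)</image>(.*)", prompt)
image_data = image_match.group(1) if image_match else None
text = image_match.group(2) if image_match else prompt

print(image_data)  # data:image/jpeg;base64,AAAA
print(text)        # Question: what is shown? Answer:
```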
@@ -1012,6 +1149,7 @@ async def benchmark(
         prompt_len=test_prompt_len,
         output_len=min(test_output_len, 32),
         lora_name=lora_name,
+        image_data=image_data,
         extra_request_body=extra_request_body,
     )
@@ -1063,6 +1201,15 @@ async def benchmark(
         else:
             lora_name = None

+        if "<image>" in prompt:
+            import re
+
+            image_match = re.search(r"<image>(.*?)</image>(.*)", prompt)
+            image_data = image_match.group(1) if image_match else None
+            prompt = image_match.group(2) if image_match else prompt
+        else:
+            image_data = None
+
         request_func_input = RequestFuncInput(
             model=model_id,
             prompt=prompt,
@@ -1070,6 +1217,7 @@ async def benchmark(
             prompt_len=prompt_len,
             output_len=output_len,
             lora_name=lora_name,
+            image_data=image_data,
             extra_request_body=extra_request_body,
         )
         tasks.append(
@@ -1444,7 +1592,7 @@ if __name__ == "__main__":
         "--dataset-name",
         type=str,
         default="sharegpt",
-        choices=["sharegpt", "random", "random-ids", "generated-shared-prefix"],
+        choices=["sharegpt", "random", "random-ids", "generated-shared-prefix", "mmmu"],
         help="Name of the dataset to benchmark on.",
     )
     parser.add_argument(
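With "mmmu" registered as a dataset choice, the benchmark can also be driven directly against a running sglang server. A hedged example invocation; everything other than --dataset-name mmmu uses pre-existing bench_serving options, and the host/port values are illustrative, not something this commit prescribes:

```python
import subprocess

# Assumes an sglang server is already serving a VLM on 127.0.0.1:30000.
subprocess.run(
    [
        "python3", "-m", "sglang.bench_serving",
        "--backend", "sglang",
        "--dataset-name", "mmmu",
        "--num-prompts", "50",
        "--host", "127.0.0.1",
        "--port", "30000",
    ],
    check=True,
)
```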
python/sglang/test/test_utils.py

@@ -79,7 +79,8 @@ DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Ins
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4,hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4"
 DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN = "Qwen/Qwen2.5-1.5B-Instruct"
-DEFAULT_SMALL_VLM_MODEL_NAME = "Qwen/Qwen2-VL-2B"
+DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST = "Qwen/Qwen2.5-VL-3B-Instruct"
+DEFAULT_VLM_CHAT_TEMPLATE_FOR_TEST = "qwen2-vl"
 DEFAULT_IMAGE_URL = "https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true"
 DEFAULT_VIDEO_URL = "https://raw.githubusercontent.com/EvolvingLMMs-Lab/sglang/dev/onevision_local/assets/jobs.mp4"
test/srt/test_bench_serving.py

@@ -7,6 +7,8 @@ from sglang.test.test_utils import (
     DEFAULT_MODEL_NAME_FOR_TEST,
     DEFAULT_MODEL_NAME_FOR_TEST_FP8,
     DEFAULT_MOE_MODEL_NAME_FOR_TEST,
+    DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST,
+    DEFAULT_VLM_CHAT_TEMPLATE_FOR_TEST,
     CustomTestCase,
     is_in_ci,
     run_bench_serving,
@@ -148,6 +150,58 @@ class TestBenchServing(CustomTestCase):
             self.assertLess(res["median_ttft_ms"], 86)
             self.assertLess(res["median_itl_ms"], 10)

+    def test_vlm_offline_throughput(self):
+        res = run_bench_serving(
+            model=DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST,
+            num_prompts=200,
+            request_rate=float("inf"),
+            other_server_args=[
+                "--chat-template",
+                DEFAULT_VLM_CHAT_TEMPLATE_FOR_TEST,
+                "--mem-fraction-static",
+                "0.7",
+            ],
+            dataset_name="mmmu",
+        )
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_vlm_offline_throughput\n"
+                f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
+            )
+            if os.getenv("SGLANG_AMD_CI") == "1":
+                self.assertGreater(res["output_throughput"], 2000)
+                # TODO: not set yet, need AMD machine
+            else:
+                self.assertGreater(res["output_throughput"], 2500)
+
+    def test_vlm_online_latency(self):
+        res = run_bench_serving(
+            model=DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST,
+            num_prompts=50,
+            request_rate=1,
+            other_server_args=[
+                "--chat-template",
+                DEFAULT_VLM_CHAT_TEMPLATE_FOR_TEST,
+                "--mem-fraction-static",
+                "0.7",
+            ],
+            dataset_name="mmmu",
+        )
+
+        if is_in_ci():
+            write_github_step_summary(
+                f"### test_vlm_online_latency\n"
+                f'median_e2e_latency_ms: {res["median_e2e_latency_ms"]:.2f} ms\n'
+            )
+            self.assertLess(res["median_e2e_latency_ms"], 16000)
+            if os.getenv("SGLANG_AMD_CI") == "1":
+                self.assertLess(res["median_ttft_ms"], 150)
+                # TODO: not set yet, need AMD machine
+            else:
+                self.assertLess(res["median_ttft_ms"], 90)
+            self.assertLess(res["median_itl_ms"], 8)
+
     def test_online_latency_eagle(self):
         res = run_bench_serving(
             model=DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST,
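Both new tests only assert thresholds when running inside CI, and the step-summary lines they emit are plain f-strings over the result dictionary returned by run_bench_serving. A tiny illustration of that formatting (the numbers below are made-up stand-ins, not benchmark results):

```python
# res is a stand-in for the dict run_bench_serving returns; values are invented.
res = {"output_throughput": 2731.4567, "median_e2e_latency_ms": 10421.9}

summary = (
    f"### test_vlm_offline_throughput\n"
    f'Output throughput: {res["output_throughput"]:.2f} token/s\n'
)
print(summary)  # Output throughput: 2731.46 token/s
```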
test/srt/test_skip_tokenizer_init.py

@@ -16,7 +16,7 @@ from sglang.srt.utils import kill_process_tree
 from sglang.test.test_utils import (
     DEFAULT_IMAGE_URL,
     DEFAULT_SMALL_MODEL_NAME_FOR_TEST,
-    DEFAULT_SMALL_VLM_MODEL_NAME,
+    DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST,
     DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
     DEFAULT_URL_FOR_TEST,
     CustomTestCase,
@@ -195,7 +195,7 @@ class TestSkipTokenizerInitVLM(TestSkipTokenizerInit):
         cls.image_url = DEFAULT_IMAGE_URL
         response = requests.get(cls.image_url)
         cls.image = Image.open(BytesIO(response.content))
-        cls.model = DEFAULT_SMALL_VLM_MODEL_NAME
+        cls.model = DEFAULT_SMALL_VLM_MODEL_NAME_FOR_TEST
         cls.tokenizer = AutoTokenizer.from_pretrained(cls.model, use_fast=False)
         cls.processor = AutoProcessor.from_pretrained(cls.model, trust_remote_code=True)
         cls.base_url = DEFAULT_URL_FOR_TEST