sglang · Commit 7d9679b7 (unverified)

Add MMMU benchmark results (#4491)

Authored Apr 25, 2025 by Ravi Theja; committed by GitHub, Apr 25, 2025.
Co-authored-by: Ravi Theja Desetty <ravitheja@Ravis-MacBook-Pro.local>
Parent: b5be5694
Showing 4 changed files with 189 additions and 7 deletions:

  benchmark/mmmu/README.md             +0    -7
  scripts/ci_install_dependency.sh     +4    -0
  test/srt/models/test_vlm_models.py   +184  -0
  test/srt/run_suite.py                +1    -0
benchmark/mmmu/README.md

@@ -21,10 +21,3 @@ It's recommended to reduce the memory usage by appending something like `--mem-fr...
 ```
 python benchmark/mmmu/bench_hf.py --model-path Qwen/Qwen2-VL-7B-Instruct
 ```
-Some popular model results:
-1. Qwen/Qwen2-VL-2B-Instruct: 0.241
-2. Qwen/Qwen2-VL-7B-Instruct: 0.255
-3. Qwen/Qwen2.5-VL-3B-Instruct: 0.245
-4. Qwen/Qwen2.5-VL-7B-Instruct: 0.242
scripts/ci_install_dependency.sh

@@ -27,3 +27,7 @@ pip install transformers==4.51.0 sentence_transformers accelerate peft pandas da...
 # For compiling xgrammar kernels
 pip install cuda-python nvidia-cuda-nvrtc-cu12
+
+# For lmms_eval evaluating MMMU
+git clone --branch v0.3.3 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git
+pip install -e lmms-eval/
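Once lmms-eval v0.3.3 is installed, it can be pointed at any OpenAI-compatible endpoint. The following is a minimal sketch mirroring the command that the new test file below builds; the endpoint URL, port, API key, and model choice are illustrative assumptions and must match however the SGLang server was actually launched.

# Sketch: run lmms-eval's MMMU validation task against an SGLang server that is
# assumed to already be serving Qwen/Qwen2.5-VL-3B-Instruct with --api-key sk-123456
# on 127.0.0.1:30000 (URL, port, and key are placeholders, not values from this commit).
import os
import subprocess

os.environ["OPENAI_API_KEY"] = "sk-123456"
os.environ["OPENAI_API_BASE"] = "http://127.0.0.1:30000/v1"

subprocess.run(
    [
        "python3", "-m", "lmms_eval",
        "--model", "openai_compatible",
        "--model_args",
        'model_version="Qwen/Qwen2.5-VL-3B-Instruct",chat_template="qwen2-vl",tp=1',
        "--tasks", "mmmu_val",
        "--batch_size", "1",
        "--log_samples",
        "--log_samples_suffix", "openai_compatible",
        "--output_path", "./logs",
    ],
    check=True,
    timeout=3600,
)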
test/srt/models/test_vlm_models.py (new file, mode 100644)
import argparse
import glob
import json
import os
import random
import subprocess
import sys
import unittest
from types import SimpleNamespace

from sglang.srt.utils import kill_process_tree
from sglang.test.test_utils import (
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    CustomTestCase,
    is_in_ci,
    popen_launch_server,
)

# VLM models for testing
MODELS = [
    SimpleNamespace(
        model="google/gemma-3-27b-it", chat_template="gemma-it", mmmu_accuracy=0.45
    ),
    SimpleNamespace(
        model="Qwen/Qwen2.5-VL-3B-Instruct",
        chat_template="qwen2-vl",
        mmmu_accuracy=0.4,
    ),
    SimpleNamespace(
        model="openbmb/MiniCPM-V-2_6", chat_template="minicpmv", mmmu_accuracy=0.4
    ),
]


class TestVLMModels(CustomTestCase):
    parsed_args = None  # Class variable to store args

    @classmethod
    def setUpClass(cls):
        # Removed argument parsing from here
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.api_key = "sk-123456"
        cls.time_out = DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH

        # Set OpenAI API key and base URL environment variables. Needed for lmms-eval to work.
        os.environ["OPENAI_API_KEY"] = cls.api_key
        os.environ["OPENAI_API_BASE"] = f"{cls.base_url}/v1"

    def run_mmmu_eval(
        self,
        model_version: str,
        chat_template: str,
        output_path: str,
        *,
        env: dict | None = None,
    ):
        """
        Evaluate a VLM on the MMMU validation set with lmms-eval.
        Only `model_version` (checkpoint) and `chat_template` vary;
        we focus only on the validation set due to resource constraints.
        """
        # -------- fixed settings --------
        model = "openai_compatible"
        tp = 1
        tasks = "mmmu_val"
        batch_size = 1
        log_suffix = "openai_compatible"
        os.makedirs(output_path, exist_ok=True)

        # -------- compose --model_args --------
        model_args = (
            f'model_version="{model_version}",'
            f'chat_template="{chat_template}",'
            f"tp={tp}"
        )

        # -------- build command list --------
        cmd = [
            "python3",
            "-m",
            "lmms_eval",
            "--model",
            model,
            "--model_args",
            model_args,
            "--tasks",
            tasks,
            "--batch_size",
            str(batch_size),
            "--log_samples",
            "--log_samples_suffix",
            log_suffix,
            "--output_path",
            str(output_path),
        ]

        subprocess.run(
            cmd,
            check=True,
            timeout=3600,
        )

    def test_vlm_mmmu_benchmark(self):
        """Test VLM models against the MMMU benchmark."""
        models_to_test = MODELS

        if is_in_ci():
            models_to_test = [random.choice(MODELS)]

        for model in models_to_test:
            print(f"\nTesting model: {model.model}")
            process = None
            mmmu_accuracy = 0  # Initialize to handle potential exceptions

            try:
                # Launch server for testing
                process = popen_launch_server(
                    model.model,
                    base_url=self.base_url,
                    timeout=self.time_out,
                    api_key=self.api_key,
                    other_args=[
                        "--chat-template",
                        model.chat_template,
                        "--trust-remote-code",
                        "--mem-fraction-static",
                        str(self.parsed_args.mem_fraction_static),  # Use class variable
                    ],
                )

                # Run evaluation
                self.run_mmmu_eval(model.model, model.chat_template, "./logs")

                # Get the result file
                result_file_path = glob.glob("./logs/*.json")[0]

                with open(result_file_path, "r") as f:
                    result = json.load(f)
                print(f"Result\n: {result}")

                # Process the result
                mmmu_accuracy = result["results"]["mmmu_val"]["mmmu_acc,none"]
                print(f"Model {model.model} achieved accuracy: {mmmu_accuracy:.4f}")

                # Assert performance meets expected threshold
                self.assertGreaterEqual(
                    mmmu_accuracy,
                    model.mmmu_accuracy,
                    f"Model {model.model} accuracy ({mmmu_accuracy:.4f}) below expected threshold ({model.mmmu_accuracy:.4f})",
                )

            except Exception as e:
                print(f"Error testing {model.model}: {e}")
                self.fail(f"Test failed for {model.model}: {e}")

            finally:
                # Ensure process cleanup happens regardless of success/failure
                if process is not None and process.poll() is None:
                    print(f"Cleaning up process {process.pid}")
                    try:
                        kill_process_tree(process.pid)
                    except Exception as e:
                        print(f"Error killing process: {e}")


if __name__ == "__main__":
    # Define and parse arguments here, before unittest.main
    parser = argparse.ArgumentParser(description="Test VLM models")
    parser.add_argument(
        "--mem-fraction-static",
        type=float,
        help="Static memory fraction for the model",
        default=0.6,
    )

    # Parse args intended for unittest
    args = parser.parse_args()

    # Store the parsed args object on the class
    TestVLMModels.parsed_args = args

    # Pass args to unittest
    unittest.main(argv=[sys.argv[0]])
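For a quick local run outside the CI suite, the test module can also be executed directly through its own argument parser. A minimal sketch follows, assuming a local GPU and the lmms-eval install from scripts/ci_install_dependency.sh; 0.6 is simply the parser default shown above.

# Sketch: invoke the new test file directly; --mem-fraction-static is its own CLI flag.
import subprocess

subprocess.run(
    ["python3", "test/srt/models/test_vlm_models.py", "--mem-fraction-static", "0.6"],
    check=True,
)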
test/srt/run_suite.py

@@ -24,6 +24,7 @@ suites = {
         TestFile("models/test_reward_models.py", 83),
         TestFile("models/test_gme_qwen_models.py", 45),
         TestFile("models/test_clip_models.py", 100),
+        TestFile("models/test_vlm_models.py", 100),
         TestFile("test_abort.py", 51),
         TestFile("test_block_int8.py", 22),
         TestFile("test_chunked_prefill.py", 336),