gaoqiong / lm-evaluation-harness · Commits

Commit 8bce8cf6
authored Jul 02, 2024 by lintangsutawika

add mmmu tasks

parent 34a079e3
Showing 9 changed files with 560 additions and 0 deletions.
lm_eval/tasks/mmmu/_template_yaml                        +20  -0
lm_eval/tasks/mmmu/art_and_design.yaml                   +19  -0
lm_eval/tasks/mmmu/business.yaml                         +23  -0
lm_eval/tasks/mmmu/health_and_medicine.yaml              +23  -0
lm_eval/tasks/mmmu/humanities_and_social_sciences.yaml   +19  -0
lm_eval/tasks/mmmu/mmmu.yaml                             +8   -0
lm_eval/tasks/mmmu/science.yaml                          +23  -0
lm_eval/tasks/mmmu/tech_and_engineering.yaml             +31  -0
lm_eval/tasks/mmmu/utils.py                              +394 -0
lm_eval/tasks/mmmu/_template_yaml · new file (mode 100644)
dataset_path: MMMU/MMMU
validation_split: validation
output_type: generate_until
doc_to_image: !function utils.doc_to_image
doc_to_text: !function utils.doc_to_text
doc_to_target: "answer"
process_results: !function utils.process_results
generation_kwargs:
  until:
    - "<|endoftext|>"
  temperature: 1.0
  do_sample: false
  max_gen_toks: 512
  top_p: 0.94
  repetition_penalty: 1.0
  image_aspect_ratio: original
metric_list:
  - metric: mmmu_acc
    aggregation: mean
    higher_is_better: true
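The per-subject YAML files below reuse this template via `include: _template_yaml`, overriding only `task`, `task_alias`, and `dataset_name`. As a quick sanity check of the prompt format the template wires up (a minimal sketch; the `doc` dict below is an invented sample, not a real MMMU record, and `utils` is assumed to be imported from this task directory):

    # run from lm_eval/tasks/mmmu/
    import utils

    # hypothetical multiple-choice record; MMMU stores "options" as a stringified list
    doc = {
        "question": "What is shown in <image 1>?",
        "question_type": "multiple-choice",
        "options": "['a cat', 'a dog']",
    }
    print(utils.doc_to_text(doc))
    # Options are lettered and the <image 1> token is collapsed to <image>:
    #   What is shown in <image>?
    #   A. a cat
    #   B. a dog
    #   Answer with the option letter from the given choices directly.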
lm_eval/tasks/mmmu/art_and_design.yaml · new file (mode 100644)
group: mmmu_art_and_design
group_alias: Art and Design
task:
  - task: mmmu_art
    include: _template_yaml
    task_alias: Art
    dataset_name: Art
  - task: mmmu_art_theory
    include: _template_yaml
    task_alias: Art Theory
    dataset_name: Art_Theory
  - task: mmmu_design
    include: _template_yaml
    task_alias: Design
    dataset_name: Design
  - task: mmmu_music
    include: _template_yaml
    task_alias: Music
    dataset_name: Music
lm_eval/tasks/mmmu/business.yaml · new file (mode 100644)
group: mmmu_business
group_alias: Business
task:
  # - task: mmmu_accounting
  #   include: _template_yaml
  #   task_alias: Accounting
  #   dataset_name: Accounting
  - task: mmmu_economics
    include: _template_yaml
    task_alias: Economics
    dataset_name: Economics
  # - task: mmmu_finance
  #   include: _template_yaml
  #   task_alias: Finance
  #   dataset_name: Finance
  # - task: mmmu_manage
  #   include: _template_yaml
  #   task_alias: Manage
  #   dataset_name: Manage
  # - task: mmmu_marketing
  #   include: _template_yaml
  #   task_alias: Marketing
  #   dataset_name: Marketing
lm_eval/tasks/mmmu/health_and_medicine.yaml · new file (mode 100644)
group: mmmu_health_and_medicine
group_alias: Health and Medicine
task:
  - task: mmmu_basic_medical_science
    include: _template_yaml
    task_alias: Basic Medical Science
    dataset_name: Basic_Medical_Science
  - task: mmmu_clinical_medicine
    include: _template_yaml
    task_alias: Clinical Medicine
    dataset_name: Clinical_Medicine
  - task: mmmu_diagnostics_and_laboratory_medicine
    include: _template_yaml
    task_alias: Diagnostics and Laboratory Medicine
    dataset_name: Diagnostics_and_Laboratory_Medicine
  - task: mmmu_pharmacy
    include: _template_yaml
    task_alias: Pharmacy
    dataset_name: Pharmacy
  - task: mmmu_public_health
    include: _template_yaml
    task_alias: Public Health
    dataset_name: Public_Health
lm_eval/tasks/mmmu/humanities_and_social_sciences.yaml · new file (mode 100644)
group: mmmu_humanities_and_social_science
group_alias: Humanities and Social Science
task:
  - task: mmmu_history
    include: _template_yaml
    task_alias: History
    dataset_name: History
  - task: mmmu_literature
    include: _template_yaml
    task_alias: Literature
    dataset_name: Literature
  - task: mmmu_sociology
    include: _template_yaml
    task_alias: Sociology
    dataset_name: Sociology
  - task: mmmu_psychology
    include: _template_yaml
    task_alias: Psychology
    dataset_name: Psychology
lm_eval/tasks/mmmu/mmmu.yaml · new file (mode 100644)
group: mmmu
task:
  - mmmu_art_and_design
  - mmmu_business
  - mmmu_health_and_medicine
  - mmmu_humanities_and_social_science
  - mmmu_science
  - mmmu_tech_and_engineering
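With this top-level group in place, the whole suite (or any single subject group) can be requested by name. A minimal sketch via the harness's Python entry point, assuming a multimodal-capable model backend is available (this commit only adds the task definitions; the model name and arguments here are placeholders, not part of the commit):

    import lm_eval

    # "mmmu" resolves to the group defined above, which pulls in all six
    # subject groups and their per-subject tasks
    results = lm_eval.simple_evaluate(
        model="hf",                             # placeholder backend name
        model_args="pretrained=<some-vlm-id>",  # placeholder model id
        tasks=["mmmu"],
    )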
lm_eval/tasks/mmmu/science.yaml · new file (mode 100644)
group: mmmu_science
group_alias: Science
task:
  - task: mmmu_biology
    include: _template_yaml
    task_alias: Biology
    dataset_name: Biology
  - task: mmmu_chemistry
    include: _template_yaml
    task_alias: Chemistry
    dataset_name: Chemistry
  - task: mmmu_geography
    include: _template_yaml
    task_alias: Geography
    dataset_name: Geography
  - task: mmmu_math
    include: _template_yaml
    task_alias: Math
    dataset_name: Math
  - task: mmmu_physics
    include: _template_yaml
    task_alias: Physics
    dataset_name: Physics
lm_eval/tasks/mmmu/tech_and_engineering.yaml · new file (mode 100644)
group: mmmu_tech_and_engineering
group_alias: Tech and Engineering
task:
  - task: mmmu_agriculture
    include: _template_yaml
    task_alias: Agriculture
    dataset_name: Agriculture
  - task: mmmu_architecture_and_engineering
    include: _template_yaml
    task_alias: Architecture and Engineering
    dataset_name: Architecture_and_Engineering
  - task: mmmu_computer_science
    include: _template_yaml
    task_alias: Computer Science
    dataset_name: Computer_Science
  - task: mmmu_electronics
    include: _template_yaml
    task_alias: Electronics
    dataset_name: Electronics
  - task: mmmu_energy_and_power
    include: _template_yaml
    task_alias: Energy and Power
    dataset_name: Energy_and_Power
  - task: mmmu_materials
    include: _template_yaml
    task_alias: Materials
    dataset_name: Materials
  - task: mmmu_mechanical_engineering
    include: _template_yaml
    task_alias: Mechanical Engineering
    dataset_name: Mechanical_Engineering
lm_eval/tasks/mmmu/utils.py · new file (mode 100644)
import ast
import random
import re

import numpy as np

MULTI_CHOICE_PROMPT = "Answer with the option letter from the given choices directly."
OPEN_ENDED_PROMPT = "Answer the question using a single word or phrase."


def replace_images_tokens(input_string):
    # MMMU questions reference up to seven images as <image 1> ... <image 7>;
    # collapse them all to a single generic <image> placeholder
    for i in range(1, 8):
        question_text = f"<image {i}>"
        query_text = "<image>"
        if question_text in input_string:
            input_string = input_string.replace(question_text, query_text)
    return input_string


def parse_options(options):
    option_letters = [chr(ord("A") + i) for i in range(len(options))]
    choices_str = "\n".join(
        [f"{option_letter}. {option}" for option_letter, option in zip(option_letters, options)]
    )
    return choices_str


def construct_prompt(doc):
    question = doc["question"]
    if doc["question_type"] == "multiple-choice":
        # Weirdly, doc["options"] is a string in the MMMU Hugging Face dataset
        parsed_options = parse_options(ast.literal_eval(doc["options"]))
        # parsed_options already prepends a newline so no need to add space here
        question = f"{question}\n{parsed_options}\n{MULTI_CHOICE_PROMPT}"
    else:
        question = f"{question}\n{OPEN_ENDED_PROMPT}"
    return question


def doc_to_text(doc):
    question = construct_prompt(doc)
    return replace_images_tokens(question)


def doc_to_image(doc):
    prompt = construct_prompt(doc)
    image_tokens = re.findall(r"<image \d+>", prompt)
    # strip the angle brackets and replace the space with an underscore,
    # e.g. "<image 1>" -> "image_1", which is the dataset column name
    image_tokens = [image_token.strip("<>").replace(" ", "_") for image_token in image_tokens]
    visual = [doc[image_token].convert("RGB") for image_token in image_tokens]
    return visual


def process_results(doc, results):
    pred = results[0]
    if doc["question_type"] == "multiple-choice":
        index2ans, all_choices = get_multi_choice_info(ast.literal_eval(doc["options"]))
        parsed_pred = parse_multi_choice_response(pred, all_choices, index2ans)
    else:
        parsed_pred = parse_open_response(pred)

    sample_dict = {
        "id": doc["id"],
        "subdomain": extract_subset_name(doc["id"]),
        "question_type": doc["question_type"],
        "answer": doc["answer"],
        "parsed_pred": parsed_pred,
    }

    _, result_dict = evaluate_mmmu([sample_dict])

    return result_dict


def extract_subset_name(input_string):
    # Match the split prefix (e.g. "validation_") at the beginning and
    # "_<number>" at the end, returning the subject name in between
    split = input_string.split("_")[0]
    pattern = re.compile(rf"^{split}_(.+?)_\d+$")
    match = pattern.search(input_string)
    if match:
        return match.group(1)
    else:
        raise ValueError(f'No match found in "{input_string}"')


##################
# Helper functions written by the official MMMU repo.
##################


def calculate_ins_level_acc(results):
    """Calculate the instruction level accuracy for given Subject results
    https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L246
    """
    acc = 0
    ins_num = 0
    for cat_results in results.values():
        acc += cat_results["mmmu_acc"] * cat_results["num_example"]
        ins_num += cat_results["num_example"]
    if ins_num == 0:
        return 0
    return acc / ins_num


def eval_multi_choice(gold_i, pred_i):
    """
    Evaluate a multiple choice instance.
    https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L175
    """
    correct = False
    # the prediction is correct only if it exactly matches a gold answer
    if isinstance(gold_i, list):
        for answer in gold_i:
            if answer == pred_i:
                correct = True
                break
    else:  # gold_i is a string
        if gold_i == pred_i:
            correct = True
    return correct


def eval_open(gold_i, pred_i):
    """
    Evaluate an open question instance.
    https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L191
    """
    correct = False
    if isinstance(gold_i, list):
        # use float to avoid trivial matches
        norm_answers = []
        for answer in gold_i:
            norm_answers.extend(normalize_str(answer))
    else:
        norm_answers = normalize_str(gold_i)
    for pred in pred_i:  # pred is already normalized in the parse response phase
        if isinstance(pred, str):  # check whether any answer is a substring of the pred
            for norm_ans in norm_answers:
                # only compare string answers against string preds
                if isinstance(norm_ans, str) and norm_ans in pred:
                    if not correct:
                        correct = True
                    break
        else:  # it's a float number
            if pred in norm_answers:
                if not correct:
                    correct = True
                break
    return correct


def evaluate_mmmu(samples):
    """
    Batch evaluation for multiple choice and open questions.
    https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L219
    """
    pred_correct = 0
    judge_dict = dict()
    for sample in samples:
        gold_i = sample["answer"]
        pred_i = sample["parsed_pred"]
        if sample["question_type"] == "multiple-choice":
            correct = eval_multi_choice(gold_i, pred_i)
        else:  # open question
            correct = eval_open(gold_i, pred_i)

        if correct:
            judge_dict[sample["id"]] = "Correct"
            pred_correct += 1
        else:
            judge_dict[sample["id"]] = "Wrong"

    if len(samples) == 0:
        # keep the return shape consistent for callers that unpack two values
        return {}, {"mmmu_acc": 0}
    return judge_dict, {"mmmu_acc": pred_correct / len(samples)}


def parse_multi_choice_response(response, all_choices, index2ans):
    """
    Parse the prediction from the generated response.
    Return the predicted index, e.g., A, B, C, D.
    https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L10
    """
    for char in [",", ".", "!", "?", ";", ":", "'"]:
        response = response.strip(char)
    response = " " + response + " "  # add spaces to avoid partial matches

    index_ans = True
    ans_with_brack = False
    candidates = []
    for choice in all_choices:  # e.g., (A) (B) (C) (D)
        if f"({choice})" in response:
            candidates.append(choice)
            ans_with_brack = True

    if len(candidates) == 0:
        for choice in all_choices:  # e.g., A B C D
            if f" {choice} " in response:
                candidates.append(choice)

    if len(candidates) == 0:
        for choice in all_choices:  # e.g., A. B. C. D.
            if f"{choice}." in response:
                candidates.append(choice)

    # if none of the above yields candidates and the response is longer than
    # 5 tokens, try to match the full answer text instead of the letter
    if len(candidates) == 0 and len(response.split()) > 5:
        for index, ans in index2ans.items():
            if ans.lower() in response.lower():
                candidates.append(index)
                index_ans = False  # it's a content answer, not a letter

    if len(candidates) == 0:
        # still no answer found, randomly choose one
        pred_index = random.choice(all_choices)
    elif len(candidates) > 1:
        start_indexes = []
        if index_ans:
            if ans_with_brack:
                for can in candidates:
                    index = response.rfind(f"({can})")
                    start_indexes.append(index)  # -1 will be ignored anyway
            else:
                for can in candidates:
                    index = response.rfind(f" {can} ")
                    start_indexes.append(index)
        else:
            for can in candidates:
                index = response.lower().rfind(index2ans[can].lower())
                start_indexes.append(index)
        # keep the candidate that appears last in the response
        pred_index = candidates[np.argmax(start_indexes)]
    else:
        # if only one candidate, use it
        pred_index = candidates[0]

    return pred_index


def extract_numbers(string):
    """
    Extract all forms of numbers from a string with regex.
    https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L100
    """
    # Pattern for numbers with commas
    pattern_commas = r"-?\b\d{1,3}(?:,\d{3})+\b"
    # Pattern for scientific notation
    pattern_scientific = r"-?\d+(?:\.\d+)?[eE][+-]?\d+"
    # Pattern for simple numbers without commas
    pattern_simple = r"-?(?:\d+\.\d+|\.\d+|\d+\b)(?![eE][+-]?\d+)(?![,\d])"

    # Extract numbers with commas
    numbers_with_commas = re.findall(pattern_commas, string)
    # Extract numbers in scientific notation
    numbers_scientific = re.findall(pattern_scientific, string)
    # Extract simple numbers without commas
    numbers_simple = re.findall(pattern_simple, string)

    # Combine all extracted numbers
    all_numbers = numbers_with_commas + numbers_scientific + numbers_simple
    return all_numbers


def check_is_number(string):
    """
    Check if the given string is a number.
    https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L65
    """
    try:
        float(string.replace(",", ""))
        return True
    except ValueError:
        # not parseable as a number even with commas removed
        return False


def normalize_str(string):
    """
    Normalize the str to lower case and make them float numbers if possible.
    https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L76
    """
    # if the string is a number, convert it to a float
    string = string.strip()

    is_number = check_is_number(string)

    if is_number:
        string = string.replace(",", "")
        string = float(string)
        # leave 2 decimals
        string = round(string, 2)
        return [string]
    else:
        # it's likely to be a string; lower it
        string = string.lower()
        if len(string) == 1:
            # pad single characters to avoid trivial matches
            return [" " + string, string + " "]
        return [string]


def parse_open_response(response):
    """
    Parse the prediction from the generated response.
    Return a list of predicted strings or numbers.
    https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L122
    """

    def get_key_subresponses(response):
        response = response.strip().strip(".").lower()
        sub_responses = re.split(r"\.\s(?=[A-Z])|\n", response)
        indicators_of_keys = [
            "could be ",
            "so ",
            "is ",
            "thus ",
            "therefore ",
            "final ",
            "answer ",
            "result ",
        ]
        key_responses = []
        for index, resp in enumerate(sub_responses):
            # for the last sub-response, also accept an equation (the entire
            # response can be a single sentence containing an equation)
            if index == len(sub_responses) - 1:
                indicators_of_keys.extend(["="])
            # the shortest tail that may contain the answer
            shortest_key_response = None
            for indicator in indicators_of_keys:
                if indicator in resp:
                    if not shortest_key_response:
                        shortest_key_response = resp.split(indicator)[-1].strip()
                    else:
                        if len(resp.split(indicator)[-1].strip()) < len(shortest_key_response):
                            shortest_key_response = resp.split(indicator)[-1].strip()

            if shortest_key_response:
                # keep it only if it is not trivial punctuation
                if shortest_key_response.strip() not in [
                    ":",
                    ",",
                    ".",
                    "!",
                    "?",
                    ";",
                    "'",
                ]:
                    key_responses.append(shortest_key_response)
        if len(key_responses) == 0:
            # did not find any indicator; fall back to the whole response
            return [response]
        return key_responses

    key_responses = get_key_subresponses(response)

    pred_list = key_responses.copy()  # keep the original string responses
    for resp in key_responses:
        pred_list.extend(extract_numbers(resp))

    tmp_pred_list = []
    for i in range(len(pred_list)):
        tmp_pred_list.extend(normalize_str(pred_list[i]))
    pred_list = tmp_pred_list

    # remove duplicates
    pred_list = list(set(pred_list))

    return pred_list


def get_multi_choice_info(options):
    """
    Given the list of options for a multiple choice question,
    return index2ans (letter -> option text) and all_choices (list of letters).
    https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/data_utils.py#L54
    """
    start_chr = "A"
    all_choices = []
    index2ans = {}
    for i, option in enumerate(options):
        index2ans[chr(ord(start_chr) + i)] = option
        all_choices.append(chr(ord(start_chr) + i))

    return index2ans, all_choices
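To make the parsing behavior above concrete, a few examples of what these helpers return (a minimal sketch; the response strings are invented model outputs, and `utils` is assumed to be imported from this task directory):

    # run from lm_eval/tasks/mmmu/
    import utils

    # multiple-choice: "(B)" matches on the bracketed-letter pass
    utils.parse_multi_choice_response(
        "The correct option is (B).",
        ["A", "B", "C", "D"],
        {"A": "red", "B": "blue", "C": "green", "D": "yellow"},
    )  # -> "B"

    # open-ended: the "is " indicator isolates the tail, which is then
    # number-extracted and normalized to a float
    utils.parse_open_response("Therefore the answer is 42.")  # -> [42.0]

    # normalization: numbers are rounded to two decimals, strings lowercased
    utils.normalize_str("3.14159")  # -> [3.14]
    utils.normalize_str("Paris")    # -> ["paris"]

    # subdomain extraction from MMMU ids
    utils.extract_subset_name("validation_Art_Theory_12")  # -> "Art_Theory"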