gaoqiong / lm-evaluation-harness · Commits · 0d187eda

Commit 0d187eda, authored Sep 17, 2024 by Baber

add mathvista

Parent: fb963f0f

Showing 2 changed files with 142 additions and 0 deletions:

lm_eval/tasks/mathvista/mathvista.yaml   +18   -0
lm_eval/tasks/mathvista/utils.py         +124  -0
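With this branch installed (for example via pip install -e .), the new task should be selectable as lm_eval --tasks mathvista_mcq, presumably together with a multimodal backend such as --model hf-multimodal; the backend choice is an assumption here, since this commit only adds the task definition and touches no model code.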
lm_eval/tasks/mathvista/mathvista.yaml · 0 → 100644 (new file)

dataset_path: AI4Math/MathVista
task: mathvista_mcq
test_split: testmini
output_type: "greedy_until"
process_docs: !function utils.process_docs
doc_to_image: !function utils.doc_to_image
doc_to_text: "<image>{{query}}"
#doc_to_choice: '{{ ["A", "B", "C", "D", "E", "F"][:choices.length] }}'
doc_to_target: answer
process_results: !function utils.process_results
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
dataset_kwargs:
  trust_remote_code: true
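Note that the config above references utils.process_docs and utils.doc_to_image, but the utils.py introduced below defines neither yet, so the task would not load as committed. A minimal sketch of what the two helpers might look like, assuming the AI4Math/MathVista field names (question_type, decoded_image) and inferring from the task name mathvista_mcq that docs are filtered to multiple-choice questions:

def process_docs(dataset):
    # hypothetical helper: keep only the multiple-choice questions
    return dataset.filter(lambda doc: doc["question_type"] == "multi_choice")


def doc_to_image(doc):
    # hypothetical helper: MathVista ships a pre-decoded PIL image per example
    return [doc["decoded_image"]]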
lm_eval/tasks/mathvista/utils.py · 0 → 100644 (new file)

import re

from Levenshtein import distance


# taken from https://github.com/lupantech/MathVista/blob/main/evaluation/calculate_score.py
def get_most_similar(prediction: str, choices: list):
    """
    Use the Levenshtein distance (or edit distance) to determine which of the choices is most similar to the given prediction
    """
    distances = [distance(prediction, choice) for choice in choices]
    ind = distances.index(min(distances))
    return choices[ind]
    # return min(choices, key=lambda choice: distance(prediction, choice))
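For illustration only, a standalone check of the nearest-choice logic above (assumes the Levenshtein package that utils.py imports is installed):

from Levenshtein import distance

choices = ["red", "green", "blue"]
prediction = "gren"
# pick the choice with the smallest edit distance, exactly as get_most_similar does
distances = [distance(prediction, choice) for choice in choices]
print(choices[distances.index(min(distances))])  # -> "green"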
def normalize_extracted_answer(
    extraction,
    choices: list,
    question_type: str,
    answer_type: str,
    precision,
    ignore_empty_extractions=False,
):
    """
    Normalize the extracted answer to match the answer type
    """
    if question_type == "multi_choice":
        # make sure the extraction is a string
        if isinstance(extraction, str):
            extraction = extraction.strip()
        else:
            try:
                extraction = str(extraction)
            except Exception:
                extraction = ""

        # if the extraction is empty, return None
        if ignore_empty_extractions and not extraction:
            return None

        # extract "A" from "(A) text"
        letter = re.findall(r"\(([a-zA-Z])\)", extraction)
        if len(letter) > 0:
            extraction = letter[0].upper()

        sequential_characters = [chr(ord("A") + i) for i in range(len(choices))]

        # if model output a character, use it as index of available choices
        if extraction in sequential_characters:
            option_index = sequential_characters.index(extraction)
            normalized_extraction = choices[option_index]
        else:
            # select the most similar option
            normalized_extraction = get_most_similar(extraction, choices)
        assert normalized_extraction in choices

    elif answer_type == "integer":
        try:
            normalized_extraction = str(int(float(extraction)))
        except Exception:
            normalized_extraction = None

    elif answer_type == "float":
        try:
            normalized_extraction = str(round(float(extraction), precision))
        except Exception:
            normalized_extraction = None

    elif answer_type == "list":
        try:
            normalized_extraction = str(extraction)
        except Exception:
            normalized_extraction = None

    return normalized_extraction
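For illustration, two calls showing how normalization behaves on the main branches (assuming the module is importable under the path shown in the diff header; that import path is an inference, not documented anywhere in this commit):

from lm_eval.tasks.mathvista.utils import normalize_extracted_answer

# "(b) 12" -> letter "B" -> the second listed choice
print(normalize_extracted_answer("(b) 12", ["10", "12", "14"], "multi_choice", "text", None))  # "12"
# a free-form float is rounded to the requested precision
print(normalize_extracted_answer("3.14159", [], "free_form", "float", 2))  # "3.14"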
def safe_equal(prediction, answer):
    """
    Check if the prediction is equal to the answer, even if they are of different types
    """
    try:
        if prediction == answer:
            return True
        return False
    except Exception:
        return False
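A quick note on behavior: Python's == across mismatched types returns False rather than raising, so the try/except mainly guards against exotic __eq__ implementations (e.g. array-like answers). For example:

print(safe_equal("3", 3))      # False: values must already be normalized to strings
print(safe_equal("12", "12"))  # True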
def get_acc_with_contion(res_pd, key, value):
    if key == "skills":
        # the "skills" column holds a list of tags per row, so test membership
        total_pd = res_pd[res_pd[key].apply(lambda x: value in x)]
    else:
        total_pd = res_pd[res_pd[key] == value]
    correct_pd = total_pd[total_pd["true_false"] == True]  # noqa: E712
    acc = len(correct_pd) / len(total_pd)
    return len(correct_pd), len(total_pd), acc
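For illustration, a sketch of the DataFrame shape this helper appears to expect (the column names true_false and skills come from the function body itself; the helper is not called anywhere else in this diff, so the example assumes it is in scope):

import pandas as pd

res_pd = pd.DataFrame(
    {
        "true_false": [True, False, True],
        "question_type": ["multi_choice", "multi_choice", "free_form"],
        "skills": [["algebra"], ["algebra", "geometry"], ["geometry"]],
    }
)
# 1 of the 2 multi-choice rows is correct -> (1, 2, 0.5)
print(get_acc_with_contion(res_pd, "question_type", "multi_choice"))
# 1 of the 2 rows tagged "algebra" is correct -> (1, 2, 0.5)
print(get_acc_with_contion(res_pd, "skills", "algebra"))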
# adapted from https://github.com/lupantech/MathVista/blob/main/evaluation/extract_answer.py
def process_results(doc, results):
    response = results[0]
    choices = doc["choices"]
    question_type = doc["question_type"]
    answer_type = doc["answer_type"]
    precision = doc["precision"]  # noqa: F841
    extraction = doc["extraction"]  # noqa: F841
    if question_type == "multi_choice" and response in choices:
        return {"acc": 1.0}
    if answer_type == "integer":
        try:
            extraction = int(response)
            return str(extraction)
        except Exception:
            pass
    if answer_type == "float":
        try:
            extraction = str(float(response))
            return extraction
        except Exception:
            pass
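As committed, process_results returns the {"acc": ...} dict that lm-eval expects only on the exact multi-choice match path; the integer/float branches return bare strings, and every other path falls through to None. A minimal sketch of how the remaining paths might be closed out with the helpers defined above (hypothetical, not part of this commit):

def process_results_completed(doc, results):
    # hypothetical completion reusing normalize_extracted_answer and safe_equal
    response = results[0].strip()
    if doc["question_type"] == "multi_choice" or doc["answer_type"] in ("integer", "float", "list"):
        prediction = normalize_extracted_answer(
            response,
            doc["choices"],
            doc["question_type"],
            doc["answer_type"],
            doc["precision"],
        )
    else:
        prediction = response  # free-form text answers are compared as-is
    return {"acc": 1.0 if safe_equal(prediction, doc["answer"]) else 0.0}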