gaoqiong / lm-evaluation-harness / Commits / b27bc18e
Commit b27bc18e, authored Sep 18, 2024 by Baber (parent 0d187eda)

add processing code
Showing 2 changed files with 46 additions and 27 deletions:
- lm_eval/tasks/mathvista/mathvista.yaml (+10, -4)
- lm_eval/tasks/mathvista/utils.py (+36, -23)
lm_eval/tasks/mathvista/mathvista.yaml @ b27bc18e
```diff
 dataset_path: AI4Math/MathVista
-task: mathvista_mcq
+task: mathvista
 test_split: testmini
-output_type: "greedy_until"
-process_docs: !function utils.process_docs
-doc_to_image: !function utils.doc_to_image
+output_type: "generate_until"
+# process_docs: !function utils.process_docs
+doc_to_image: decoded_image
 doc_to_text: "<image>{{query}}"
 #doc_to_choice: '{{ ["A", "B", "C", "D", "E", "F"][:choices.length] }}'
 doc_to_target: answer
 process_results: !function utils.process_results
 generation_kwargs:
   until:
     - "<|endoftext|>"
   temperature: 0.0
   do_sample: false
   max_gen_toks: 64
 metric_list:
   - metric: acc
     aggregation: mean
```
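The config wires its hooks in through `!function`, which the harness resolves to callables in the task's `utils.py` at load time. Neither `process_docs` nor `doc_to_image` appears in this diff, so the sketch below is only an illustration of what such hooks usually look like for MathVista; the dataset field names used here (`decoded_image`, `question_type`) are assumptions, not code from this commit.

```python
# Hypothetical sketch of the hooks referenced by mathvista.yaml; the actual
# utils.process_docs / utils.doc_to_image implementations are not in this diff.
import datasets


def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
    # e.g. keep only multiple-choice questions for an "_mcq" task variant
    return dataset.filter(lambda doc: doc["question_type"] == "multi_choice")


def doc_to_image(doc: dict):
    # Return the image(s) shown to the model; MathVista ships a decoded PIL
    # image per document (assumed field name).
    return [doc["decoded_image"]]
```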
lm_eval/tasks/mathvista/utils.py @ b27bc18e
```diff
 import re
+from typing import Optional

 from Levenshtein import distance


 # taken from https://github.com/lupantech/MathVista/blob/main/evaluation/calculate_score.py
-def get_most_similar(prediction: str, choices: list):
+def get_most_similar(prediction: str, choices: list) -> float:
     """
     Use the Levenshtein distance (or edit distance) to determine which of the choices is most similar to the given prediction
     """
```
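The body of `get_most_similar` is collapsed in this view; the commented-out line visible in the next hunk suggests it is essentially a nearest-choice lookup under edit distance. A standalone sketch of that technique (inputs invented for illustration, not the function's exact body):

```python
# Nearest-choice matching by Levenshtein (edit) distance, the idea behind
# get_most_similar.
from Levenshtein import distance

prediction = "(B) 42"
choices = ["12", "42", "52", "62"]

# Score every option by edit distance to the raw prediction and keep the closest.
closest = min(choices, key=lambda choice: distance(prediction, choice))
print(closest)  # "42"
```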
```diff
@@ -14,17 +15,19 @@ def get_most_similar(prediction: str, choices: list):
     # return min(choices, key=lambda choice: distance(prediction, choice))


 # taken from https://github.com/lupantech/MathVista/blob/main/evaluation/extract_answer.py
 def normalize_extracted_answer(
-    extraction,
+    extraction: str,
     choices: list,
     question_type: str,
     answer_type: str,
     precision,
-    ignore_empty_extractions=False,
-):
+    ignore_empty_extractions=True,
+) -> Optional[str]:
     """
     Normalize the extracted answer to match the answer type
     """
     if question_type == "multi_choice":
         # make sure the extraction is a string
         if isinstance(extraction, str):
```
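Most of `normalize_extracted_answer` is collapsed, but the signature shows its job: map the raw extraction into the same space as the gold answer, using the question's `choices`, `question_type`, `answer_type`, and `precision`. A simplified stand-in under those assumptions (illustrative only, not the repository's implementation) might read:

```python
from typing import Optional


def normalize_extracted_answer_sketch(
    extraction: str,
    choices: list,
    question_type: str,
    answer_type: str,
    precision,
) -> Optional[str]:
    # Multiple choice: accept either an option letter ("B") or the option text.
    if question_type == "multi_choice":
        letters = [chr(ord("A") + i) for i in range(len(choices))]
        if extraction in letters:
            return str(choices[letters.index(extraction)])
        return extraction if extraction in [str(c) for c in choices] else None
    # Numeric answers: coerce to the requested type, rounding floats to `precision`.
    try:
        if answer_type == "integer":
            return str(int(float(extraction)))
        if answer_type == "float":
            return str(round(float(extraction), int(precision)))
    except (ValueError, TypeError):
        return None
    return extraction
```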
```diff
@@ -88,37 +91,47 @@ def safe_equal(prediction, answer):
         return False


-def get_acc_with_contion(res_pd, key, value):
-    if key == "skills":
-        total_pd = res_pd[res_pd[key].apply(lambda x: value in x)]
-    else:
-        total_pd = res_pd[res_pd[key] == value]
-    correct_pd = total_pd[total_pd["true_false"] == True]  # noqa: E712
-    acc = len(correct_pd) / len(total_pd)
-    return len(correct_pd), len(total_pd), acc
+def extract_answer(response: str, problem: dict) -> str:
+    question_type = problem["question_type"]
+    answer_type = problem["answer_type"]
+    choices = problem["choices"]
+    # query = problem["query"]
+    # pid = problem['pid']
+
+    if response == "":
+        return ""


-# adapted from https://github.com/lupantech/MathVista/blob/main/evaluation/extract_answer.py
-def process_results(doc, results):
-    response = results[0]
-    choices = doc["choices"]
-    question_type = doc["question_type"]
-    answer_type = doc["answer_type"]
-    precision = doc["precision"]  # noqa: F841
-    extraction = doc["extraction"]  # noqa: F841
     if question_type == "multi_choice" and response in choices:
-        return {"acc": 1.0}
+        return response

     if answer_type == "integer":
         try:
             extraction = int(response)
             return str(extraction)
         except Exception:
             pass

     if answer_type == "float":
         try:
             extraction = str(float(response))
             return extraction
         except Exception:
             pass

     return ""


+# adapted from https://github.com/lupantech/MathVista/blob/main/evaluation/extract_answer.py
+def process_results(doc: dict, results: list[str]):
+    response = results[0]  # noqa: F841
+    choices = doc["choices"]
+    question_type = doc["question_type"]
+    answer_type = doc["answer_type"]
+    precision = doc["precision"]  # noqa: F841
+    answer = doc["answer"]
+    extracted_answer = extract_answer(response, doc)
+    normalized_extraction = normalize_extracted_answer(
+        extracted_answer, choices, question_type, answer_type, precision
+    )
+    res = safe_equal(normalized_extraction, answer)
+    return {"acc": 1.0} if res else {"acc": 0.0}
```
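With the YAML above, the harness calls `process_results(doc, results)` once per document, passing the model's generation as `results[0]`; the returned `acc` values are then averaged by the `mean` aggregation in `metric_list`. A self-contained usage sketch with a made-up document (field values invented for the example, keys taken from the code above):

```python
# Fabricated MathVista-style document; real docs carry the same keys.
doc = {
    "question_type": "free_form",
    "answer_type": "integer",
    "choices": [],
    "precision": None,
    "answer": "42",
}
results = ["42"]  # single generation produced by the model for this doc

# Expected to yield {"acc": 1.0}: extract_answer returns "42", which
# normalize_extracted_answer and safe_equal should accept as matching "42".
print(process_results(doc, results))
```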