ModelZoo / mPLUG-DocOwl_pytorch · Commits

Commit 58d33d4c, authored Nov 13, 2024 by wanglch

    Initial commit

Pipeline #1904 canceled with stages · Changes: 108 · Pipelines: 1

Showing 20 changed files with 2123 additions and 0 deletions (+2123 −0)
docowl_doclocal4k_evaluate.py                         +90   -0
docowl_infer.py                                       +99   -0
evaluation/benchmarks_eval.py                         +256  -0
evaluation/due_benchmarks_eval.py                     +164  -0
evaluation/due_evaluator/__init__.py                  +4    -0
evaluation/due_evaluator/__main__.py                  +96   -0
evaluation/due_evaluator/__version__.py               +4    -0
evaluation/due_evaluator/due_evaluator.py             +181  -0
evaluation/due_evaluator/py.typed                     +0    -0
evaluation/due_evaluator/scorers/__init__.py          +9    -0
evaluation/due_evaluator/scorers/accuracy_scorer.py   +53   -0
evaluation/due_evaluator/scorers/anls_scorer.py       +76   -0
evaluation/due_evaluator/scorers/base_scorer.py       +22   -0
evaluation/due_evaluator/scorers/fscorer.py           +201  -0
evaluation/due_evaluator/scorers/geval_scorer.py      +57   -0
evaluation/due_evaluator/scorers/group_anls.py        +112  -0
evaluation/due_evaluator/scorers/mean_fscorer.py      +25   -0
evaluation/due_evaluator/scorers/wtq_scorer.py        +310  -0
evaluation/due_evaluator/utils.py                     +74   -0
evaluation/evaluator.py                               +290  -0
docowl_doclocal4k_evaluate.py  (new file, 0 → 100644)

import json
import jsonlines
from docowl_infer import DocOwlInfer
from tqdm import tqdm
import os
from icecream import ic
from evaluation.benchmarks_eval import llm_text_localization_eval
import argparse

def read_jsonl(filename):
    lines = []
    with open(filename, 'r', encoding='utf-8') as f:
        for line in jsonlines.Reader(f):
            lines.append(line)
    return lines

def save_jsonl(data, filename, print_log=True):
    """data is a list"""
    with open(filename, "w") as f:
        f.write("\n".join([json.dumps(e, ensure_ascii=False) for e in data]))
    if print_log:
        print('save %d samples to %s' % (len(data), filename))

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='docowl1.5 doclocal4k evaluation')
    parser.add_argument('--model_path', type=str, help='the directory path of model')
    parser.add_argument('--task', type=str, choices=['text_grounding', 'text_recognition'])
    parser.add_argument('--doclocal4k_dir', type=str, help='the directory path of DocLocal4K')
    parser.add_argument('--save_dir', type=str, help='the directory to save predictions of the model')
    args = parser.parse_args()

    model_path = args.model_path
    task = args.task
    doclocal4k_dir = args.doclocal4k_dir
    save_dir = args.save_dir

    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    test_path = os.path.join(doclocal4k_dir, task + '.jsonl')
    save_path = os.path.join(save_dir, task + '_test_pred.jsonl')

    if os.path.exists(save_path):
        print(save_path + ' exists, skip inference. ')
    else:
        docowl = DocOwlInfer(ckpt_path=model_path, anchors='grid_9', add_global_img=False)
        print('load model from ', model_path)
        # infer the test samples one by one
        test_samples = read_jsonl(test_path)
        infer_results = []
        for sample in tqdm(test_samples):
            image = os.path.join(doclocal4k_dir, sample['image'][0])
            assert os.path.exists(image)
            question = sample['messages'][0]
            answer = sample['messages'][1]
            assert question['role'] == 'user'
            assert answer['role'] == 'assistant'
            query = question['content'].replace('<|image|>', '')
            gt_answer = answer['content']
            model_answer = docowl.inference(image, query)
            sample['model_answer'] = model_answer
            sample['gt_answer'] = gt_answer
            ic(model_answer, gt_answer)
            infer_results.append(sample)
        save_jsonl(infer_results, save_path)

    # calculate metrics
    pred_path = save_path
    if not os.path.exists(pred_path):
        print('not exists:', pred_path)
        exit(0)

    if task == 'text_recognition':
        llm_text_localization_eval(metric_names=['BLEU1', 'BLEU2', 'BLEU3', 'BLEU4'],
                                   result_path=pred_path, save_each_eval=True)
    elif task == 'text_grounding':
        llm_text_localization_eval(metric_names=['IOU@0.5'],
                                   result_path=pred_path, save_each_eval=True)

    print('==============================================')
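Note: the script above assumes each line of the DocLocal4K jsonl file carries an image list and a two-turn user/assistant message pair; the sketch below illustrates that assumed shape (field values are invented, not taken from the real dataset) based on how this script and parser_ground_line() in evaluation/benchmarks_eval.py read each line.

# Hypothetical DocLocal4K-style sample, for illustration only.
example_sample = {
    "image": ["imgs/page_0001.png"],          # joined with --doclocal4k_dir before inference
    "task_name": "line_t2bbox_sft",           # the object type is the prefix before the first '_'
    "messages": [
        {"role": "user", "content": "<|image|>Locate the line <ocr> Total revenue </ocr>."},
        {"role": "assistant", "content": "<bbox>102, 334, 897, 361</bbox>"},  # coords on a 0-999 grid
    ],
}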
docowl_infer.py  (new file, 0 → 100644)

import torch
from PIL import Image
from transformers import TextStreamer
import os
from mplug_docowl.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
from mplug_docowl.conversation import conv_templates, SeparatorStyle
from mplug_docowl.model.builder import load_pretrained_model
from mplug_docowl.mm_utils import process_images, tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria
from mplug_docowl.processor import DocProcessor
from icecream import ic
import time

class DocOwlInfer():
    def __init__(self, ckpt_path, anchors='grid_9', add_global_img=True, load_8bit=False, load_4bit=False):
        model_name = get_model_name_from_path(ckpt_path)
        ic(model_name)
        self.tokenizer, self.model, _, _ = load_pretrained_model(
            ckpt_path, None, model_name, load_8bit=load_8bit, load_4bit=load_4bit, device="cuda")
        self.doc_image_processor = DocProcessor(
            image_size=448, anchors=anchors, add_global_img=add_global_img, add_textual_crop_indicator=True)
        self.streamer = TextStreamer(self.tokenizer, skip_prompt=True, skip_special_tokens=True)

    def inference(self, image, query):
        image_tensor, patch_positions, text = self.doc_image_processor(images=image, query='<|image|>' + query)
        image_tensor = image_tensor.to(self.model.device, dtype=torch.float16)
        patch_positions = patch_positions.to(self.model.device)
        # ic(image_tensor.shape, patch_positions.shape, text)

        conv = conv_templates["mplug_owl2"].copy()
        roles = conv.roles  # ("USER", "ASSISTANT")

        conv.append_message(conv.roles[0], text)
        conv.append_message(conv.roles[1], None)
        prompt = conv.get_prompt()
        # ic(prompt)

        input_ids = tokenizer_image_token(
            prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(self.model.device)
        # ic(input_ids)

        stop_str = conv.sep2
        keywords = [stop_str]
        stopping_criteria = KeywordsStoppingCriteria(keywords, self.tokenizer, input_ids)

        with torch.inference_mode():
            output_ids = self.model.generate(
                input_ids,
                images=image_tensor,
                patch_positions=patch_positions,
                do_sample=False,
                temperature=1.0,
                max_new_tokens=512,
                streamer=self.streamer,
                use_cache=True,
                stopping_criteria=[stopping_criteria])

        outputs = self.tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
        return outputs.replace('</s>', '')

if __name__ == '__main__':
    model_path = '/home/wanglch/mPLUG-DocOwl/DocOwl1.5-Omni-base/'
    docowl = DocOwlInfer(ckpt_path=model_path, anchors='grid_9', add_global_img=True)
    print('load model from ', model_path)
    # exit(0)
    qas = [
        # docvqa case
        {"image_path": "/home/wanglch/mPLUG-DocOwl/image/hp.jpg", "question": "详细描述这张图片"},
        {"image_path": "/home/wanglch/mPLUG-DocOwl/image/R-C.jpg", "question": "详细描述这张图片"},
    ]
    for qa in qas:
        image = qa['image_path']
        query = qa['question']
        start_time = time.time()
        ## give relatively longer answer
        answer = docowl.inference(image, query)
        end_time = time.time()
        cost_seconds = end_time - start_time
        ## answer with detailed explanation
        # query = qa['question']+'Answer the question with detailed explanation.'
        # answer = docowl.inference(image, query)
        ic(image)
        ic(query, answer)
        ic(cost_seconds)
        # ic(query_simple, answer_simple)
        print('==================')
evaluation/benchmarks_eval.py  (new file, 0 → 100644)

import jsonlines
import json
from icecream import ic
import re
from evaluator import doc_evaluate
import os
from tqdm import tqdm
import random
from pathlib import Path

def parser_line(line):
    image = line['image'][0]
    assert len(line['messages']) == 2
    assert line['messages'][0]['role'] == 'user'
    question = line['messages'][0]['content'].replace('<|image|>', '')
    predicted_answer = line['model_answer'].replace('\n', '').strip()
    gt_answer = line['gt_answer'].replace('\n', '').strip()
    return image, question, predicted_answer, gt_answer

def parser_ground_line(line):
    task_name = line['task_name']  # e.g. paragraph_bbox2t_sft
    obj = task_name.split('_')[0]
    image = line['image'][0]
    assert 'messages' in line
    assert len(line['messages']) == 2
    assert line['messages'][0]['role'] == 'user'
    question = line['messages'][0]['content'].replace('<|image|>', '')
    task_name = line['task_name']
    if 't2bbox' in task_name:
        gt_answer = line['gt_answer'].strip().replace('<bbox>', '').replace('</bbox>', '')
        gt_answer = [max(min(int(x) / 999, 1.0), 0.0) for x in gt_answer.split(',')]
        model_answer = line['model_answer'].strip().replace('<bbox>', '').replace('</bbox>', '')
        try:
            model_answer = [max(min(int(x) / 999, 1.0), 0.0) for x in model_answer.split(',')]
        except Exception as e:
            model_answer = [0.0, 0.0, 0.0, 0.0]
        try:
            assert len(model_answer) == 4
        except AssertionError as e:
            # ic(line)
            model_answer = [0.0, 0.0, 0.0, 0.0]
            # exit(0)
    else:
        assert 'bbox2t' in task_name
        model_answer = line['model_answer'].strip().replace('<ocr>', '').replace('</ocr>', '')
        model_answer = model_answer.strip()
        gt_answer = line['gt_answer'].strip().replace('<ocr>', '').replace('</ocr>', '')
        gt_answer = gt_answer.strip()
    return image, question, model_answer, gt_answer, obj

def save_jsonl(data, filename):
    """data is a list"""
    with open(filename, "w") as f:
        f.write("\n".join([json.dumps(e, ensure_ascii=False) for e in data]))
    print('save %d samples to %s' % (len(data), filename))

def llm_benchmark_eval(metric_names=['ContainAccuracy'], result_path='', save_each_eval=True):
    if not Path(result_path).exists():
        ic('not exists', result_path)
        return
    ic(result_path)
    gts = []
    preds = []
    imgs = []
    ques = []
    with open(result_path, 'r', encoding='utf-8') as f:
        for line in jsonlines.Reader(f):
            img, question, model_answer, gt_answer = parser_line(line)
            if gt_answer.endswith('.'):
                gt_answer = gt_answer[:-1]
            imgs.append(img)
            gts.append([gt_answer])
            preds.append(model_answer)
            ques.append(question)
    ic(len(gts), len(preds))
    metric2scores = {}
    for metric_name in metric_names:
        score, scores = doc_evaluate(metric=metric_name, targets=gts, predictions=preds)
        ic(metric_name, score)
        metric2scores[metric_name] = scores
    if save_each_eval:
        save_path = result_path.replace('.jsonl', '_metrics.jsonl')
        eval_result = []
        for i in range(len(imgs)):
            # assert len(scores) == len(imgs)
            eval_result.append({
                'metric2score': [{'metric': metric, 'score': scores[i]} for metric, scores in metric2scores.items()],
                'image': imgs[i],
                'question': ques[i],
                'gt': gts[i][0],
                'pred': preds[i]})
        save_jsonl(eval_result, save_path)

def llm_text_localization_eval(metric_names=['BLEU1', 'BLEU2', 'BLEU3', 'BLEU4'], result_path='', save_each_eval=True):
    if not Path(result_path).exists():
        ic('not exists', result_path)
        return
    ic(result_path)
    gts = []
    preds = []
    imgs = []
    ques = []
    objs = []
    with open(result_path, 'r', encoding='utf-8') as f:
        for line in jsonlines.Reader(f):
            img, question, model_answer, gt_answer, obj = parser_ground_line(line)
            # model_answer = model_answer.strip()
            if isinstance(gt_answer, str) and isinstance(model_answer, str):
                if gt_answer.endswith('.'):
                    gt_answer = gt_answer[:-1]
            imgs.append(img)
            gts.append([gt_answer])
            preds.append(model_answer)
            ques.append(question)
            objs.append(obj)
    ic(len(gts), len(preds))
    metric2scores = {}
    metric2score = {}
    for metric_name in metric_names:
        score, scores = doc_evaluate(metric=metric_name, targets=gts, predictions=preds)
        # ic(metric_name, score)
        metric2scores[metric_name] = scores
        metric2score[metric_name] = str(round(score, 2))
    # calculate metric of each type of object (word, phrase, line, paragraph)
    obj2metrics = {}
    for metric_name in metric_names:
        scores = metric2scores[metric_name]
        obj2scores = {}
        for i, obj in enumerate(objs):
            score = scores[i]
            if obj not in obj2scores:
                obj2scores[obj] = []
            obj2scores[obj].append(score)
        for obj, scores in obj2scores.items():
            num = len(scores)
            if metric_name == 'IOU@0.5':
                score = round(100 * sum(scores) / len(scores), 2)
            else:
                score = round(sum(scores) / len(scores), 2)
            # ic(metric_name, obj, num, score)
            if obj == 'word' and metric_name in ['BLEU2', 'BLEU3', 'BLEU4']:
                continue
            if obj == 'phrase' and metric_name in ['BLEU1', 'BLEU3', 'BLEU4']:
                continue
            if obj == 'line' and metric_name in ['BLEU1', 'BLEU2', 'BLEU4']:
                continue
            if obj == 'paragraph' and metric_name in ['BLEU1', 'BLEU2', 'BLEU3']:
                continue
            obj2metrics[obj + '_' + metric_name] = score
        # print('---------------------------')
    ic(obj2metrics)
    if 'BLEU1' in metric_names:
        # recognition evaluation
        ave = round(sum(obj2metrics.values()) / len(obj2metrics.values()), 2)
        ic(ave)
    else:
        # grounding evaluation
        ave = metric2score['IOU@0.5']
        ic(ave)
    if save_each_eval:
        save_path = result_path.replace('.jsonl', '_metrics.jsonl')
        eval_result = []
        for i in range(len(imgs)):
            # assert len(scores) == len(imgs)
            eval_result.append({
                'metric2score': [{'metric': metric, 'score': scores[i]} for metric, scores in metric2scores.items()],
                'image': imgs[i],
                'question': ques[i],
                'gt': gts[i][0],
                'pred': preds[i]})
        save_jsonl(eval_result, save_path)

def llm_textcaps_textvqa_eval(result_path, dataset='TextVQA', split='test', meta_dir=''):
    if dataset == 'TextVQA':
        question_ids_path = os.path.join(meta_dir, dataset, split + '_q_ids.json')
        if not os.path.exists(question_ids_path):
            qa_path = os.path.join(meta_dir, dataset, 'TextVQA_0.5.1_' + split + '.json')
            raw_qa_data = json.load(open(qa_path, 'r', encoding='utf-8'))
            raw_qa_data = raw_qa_data['data']
            # collect QAs of an identical image
            print('collecting QAs......')
            img2qas = {}
            que_num = 0
            for qa in tqdm(raw_qa_data):
                if dataset == 'TextVQA':
                    imgid = qa['image_id']
                    question = qa['question']
                    q_id = qa['question_id']
                    if imgid not in img2qas:
                        img2qas[imgid] = {}
                    img2qas[imgid][question] = q_id
                    que_num += 1
            ic(que_num)
            json.dump(img2qas, open(question_ids_path, 'w', encoding='utf-8'))
            print('save question ids to ', question_ids_path)
        q_ids = json.load(open(question_ids_path, 'r', encoding='utf-8'))
        llm_results = []
        with open(result_path, 'r', encoding='utf-8') as f:
            for line in jsonlines.Reader(f):
                img = line['image'][0]
                imgid = img.split('/')[-1].replace('.jpg', '')
                assert line['messages'][0]['role'] == 'user'
                question = line['messages'][0]['content'].replace('<|image|>', '')
                if dataset == 'TextVQA':
                    q_id = q_ids[imgid][question]
                # gt_answer = str(line['gt_answer']).replace('\n', '')
                model_answer = str(line['model_answer'].strip()).replace('\n', '')
                # ic(imgid, question, model_answer)
                if model_answer.endswith('.'):
                    model_answer = model_answer[:-1]
                llm_results.append({'question_id': q_id, 'answer': model_answer})
    else:
        llm_results = []
        img2captions = {}
        with open(result_path, 'r', encoding='utf-8') as f:
            for line in jsonlines.Reader(f):
                img = line['image'][0]
                imgid = img.split('/')[-1].replace('.jpg', '')
                model_answer = str(line['model_answer']).replace('\n', '')
                # ic(imgid, model_answer)
                if imgid not in img2captions:
                    img2captions[imgid] = []
                img2captions[imgid].append(model_answer)
        for imgid, captions in img2captions.items():
            llm_results.append({'image_id': imgid, 'caption': random.choice(captions)})
    ic(len(llm_results))
    save_path = result_path.replace('.jsonl', '_official_eval.json')
    json.dump(llm_results, open(save_path, 'w', encoding='utf-8'))
    print('save LLM predictions in the official format to ', save_path)
    if split == 'test':
        print('!!!!!! upload this file to official website for evaluation !!!!!')
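Note: a minimal usage sketch of the evaluation entry point above, assuming a prediction file already produced by docowl_doclocal4k_evaluate.py (the path is a placeholder); each jsonl line must carry 'model_answer' and 'gt_answer' fields.

# Hypothetical call, for illustration only.
from evaluation.benchmarks_eval import llm_text_localization_eval

llm_text_localization_eval(
    metric_names=['IOU@0.5'],                               # grounding; use BLEU1-4 for recognition
    result_path='outputs/text_grounding_test_pred.jsonl',   # placeholder path
    save_each_eval=True,                                    # also writes a *_metrics.jsonl with per-sample scores
)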
evaluation/due_benchmarks_eval.py  (new file, 0 → 100644)

import json
from icecream import ic
import jsonlines
import copy
import random
import os
from due_evaluator.due_evaluator import DueEvaluator

def dataset2metrics(dataset_name):
    if dataset_name in ['DocVQA', 'InfographicsVQA']:
        return ['ANLS']
    elif dataset_name in ['KleisterCharity', 'DeepForm']:
        return ['F1']
    elif dataset_name in ['TabFact']:
        return ['F1']
    elif dataset_name in ['PWC']:
        return ['GROUP-ANLS']
    elif dataset_name in ['WikiTableQuestions']:
        return ['WTQ']
    else:
        print('unsupported dataset:', dataset_name)

def eval_due(dataset_name, pred_path, gt_path):
    metrics = dataset2metrics(dataset_name)
    preds = read_jsonl(pred_path)
    gts = read_jsonl(gt_path)
    print('pred %d, gt %d' % (len(preds), len(gts)))
    for metric in metrics:
        evaluator = DueEvaluator(reference=gts, answers=preds, ignore_case=True, metric=metric)
        general_scorer, label_scorers = evaluator._evalute()
        ic('Overall %s:%.4f' % (metric, general_scorer.score()))
        """for label, scorer in label_scorers.items():
            print('%s %s:%.4f' % (label, metric, scorer.score()))"""

def read_jsonl(path):
    data = []
    with open(path, 'r', encoding='utf-8') as f:
        for line in jsonlines.Reader(f):
            data.append(line)
    return data

def save_jsonl(data, path):
    with open(path, 'w') as f:
        for line in data:
            f.write(json.dumps(line, ensure_ascii=False) + '\n')
    print('save %d samples(imgs) to %s ' % (len(data), path))

def add_tabfact_missing_img(due_preds, meta_dir):
    ref_path = meta_dir + 'TabFact/test/document.jsonl'
    new_due_preds = []
    i = -1
    with open(ref_path, 'r', encoding='utf-8') as f:
        for line in jsonlines.Reader(f):
            i += 1
            if due_preds[i]['name'] == line['name']:
                """# copy raw statement from anno file, avoid small revisions
                img = {'name':line['name'], 'annotations':[]}
                for i, anno in enumerate(line['annotations']):
                    pred_value = due_preds[i]['annotations']['values'][0]['value']
                    img['annotations'].append({'key':anno['key'], 'values':[{'value':pred_value}]})
                new_due_preds.append(img)"""
                new_due_preds.append(due_preds[i])
                continue
            else:
                print('add random prediction for missing img:', line['name'])
                img = {'name': line['name'], 'annotations': []}
                for anno in line['annotations']:
                    img['annotations'].append({'key': anno['key'], 'values': [{'value': random.choice(['0', '1'])}]})
                new_due_preds.append(img)
                i -= 1
    return new_due_preds

def llm_duebenchmark_eval(dataset_name, split, llm_pred_path, meta_dir):
    """
    reformat results by LLM for due-benchmark evaluation
    """
    assert dataset_name in ['DocVQA', 'InfographicsVQA', 'WikiTableQuestions', 'DeepForm', 'KleisterCharity', 'TabFact']
    ic(dataset_name)
    if dataset_name == 'DeepForm':
        dataset_categories = ['advertiser', 'flight_from', 'flight_to', 'gross_amount', 'contract_num']
    elif dataset_name == 'KleisterCharity':
        dataset_categories = ['address__post_town', 'address__postcode', 'address__street_line',
                              'charity_name', 'charity_number', 'income_annually_in_british_pounds',
                              'report_date', 'spending_annually_in_british_pounds']
    preds = []
    with open(llm_pred_path, 'r', encoding='utf-8') as f:
        for line in jsonlines.Reader(f):
            assert len(line['messages']) == 2
            assert line['messages'][0]['role'] == 'user'
            question = line['messages'][0]['content'].replace('<|image|>', '')
            preds.append({'name': line['image'][0],
                          'question': question,
                          'answer': str(line['model_answer']).strip().replace('\n', '')})

    meta_path = os.path.join(meta_dir, dataset_name, split, 'metadata.jsonl')
    meta_data = read_jsonl(meta_path)
    ic(len(meta_data), len(preds))
    assert len(meta_data) == len(preds)
    for i in range(len(meta_data)):
        preds[i]['name'] = meta_data[i]['file_name'].split('/')[-1].split('.pdf')[0]
        # for ie task, convert the category question to the category
        if dataset_name in ['DeepForm', 'KleisterCharity']:
            cate_question = json.loads(meta_data[i]['ground_truth'])['gt_parses'][0]['question']
            for cate in dataset_categories:
                if cate in cate_question:
                    preds[i]['question'] = cate
                    break
        # for qa task, copying the question is necessary: questions in preds can have some minor revisions,
        # and keeping the question consistent with the gt file is necessary for due evaluation
        else:
            preds[i]['question'] = json.loads(meta_data[i]['ground_truth'])['gt_parses'][0]['question']
        if dataset_name == 'TabFact':
            if preds[i]['answer'].lower() == 'true':
                preds[i]['answer'] = '1'
            else:
                assert preds[i]['answer'].lower() == 'false'
                preds[i]['answer'] = '0'

    # reorganize preds so that 1 line holds the QA pairs or category-value pairs of 1 image
    due_preds = []
    img = {}
    for i in range(len(preds)):
        pred = preds[i]
        if 'name' not in img:
            # start img
            img['name'] = pred['name']
            img['annotations'] = []
        elif pred['name'] != img['name']:
            # save previous img results and init a new one
            due_preds.append(copy.deepcopy(img))
            img = {}
            img['name'] = pred['name']
            img['annotations'] = []
        # for ie task, if the answer is none, drop the category-value pair
        if dataset_name not in ['DeepForm', 'KleisterCharity'] or pred['answer'] != 'None':
            img['annotations'].append({'key': pred['question'], 'values': [{'value': pred['answer']}]})
        if i == len(preds) - 1:
            due_preds.append(copy.deepcopy(img))

    if dataset_name == 'TabFact':
        due_preds = add_tabfact_missing_img(due_preds, meta_dir)

    save_path = llm_pred_path.replace('.jsonl', '_due.jsonl')
    save_jsonl(due_preds, save_path)

    gt_path = os.path.join(meta_dir, dataset_name, split, 'document.jsonl')
    eval_due(dataset_name, save_path, gt_path)
evaluation/due_evaluator/__init__.py  (new file, 0 → 100644)

from .__main__ import cli_main
from .due_evaluator import DueEvaluator

__all__ = ['DueEvaluator', 'cli_main']
evaluation/due_evaluator/__main__.py  (new file, 0 → 100644)

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse
import sys
from typing import Optional, Set
import json

from due_evaluator.due_evaluator import DueEvaluator
from due_evaluator.utils import property_scores_to_string

def parse_args():
    """Parse CLI arguments.

    Returns:
        namespace: namespace with parsed variables.
    """
    parser = argparse.ArgumentParser('Document Understanding Evaluator')
    parser.add_argument(
        '--out-files',
        '-o',
        type=argparse.FileType('r', encoding='utf-8'),
        required=True,
        nargs='+',
        help='Out file to evaluate',
    )
    parser.add_argument(
        '--reference',
        '-r',
        type=argparse.FileType('r', encoding='utf-8'),
        required=True,
        help='Reference file',
    )
    parser.add_argument('--metric', '-m', type=str, default='F1',
                        choices=['F1', 'MEAN-F1', 'ANLS', 'WTQ', 'GROUP-ANLS'])
    parser.add_argument(
        '--return-score',
        default='F1',
        choices=['F1', 'mean-F1', 'ANLS', 'mean-Precision', 'mean-Recall', 'WTQ'],
        help='Return WR-like mean-F1 score',
    )
    parser.add_argument('--line-by-line', action='store_true', default=False,
                        help='Return example-based results')
    parser.add_argument(
        '--columns',
        type=str,
        nargs='+',
        default=['Precision', 'Recall', 'F1'],
        help='Columns',
    )
    parser.add_argument(
        '--print-format',
        default='text',
        type=str,
        choices=['text', 'latex', 'json'],
        help='Print feature table in the given format',
    )
    parser.add_argument('--properties', nargs='+', type=str, help='Property set to be limited to')
    parser.add_argument(
        '--ignore-case',
        '-i',
        action='store_true',
        default=False,
        help='Compute scores ignoring letter case',
    )
    return parser.parse_args()

def cli_main(args: argparse.Namespace):
    """CLI main.

    Args:
        args: cli arguments
    """
    reference = [json.loads(line) for line in args.reference]
    evaluators = []
    for out_file in args.out_files:
        predictions = [json.loads(line) for line in out_file]
        property_set: Optional[Set[str]]
        if args.properties:
            property_set = args.properties
        else:
            property_set = None
        evaluators.append(
            DueEvaluator(reference, predictions, property_set, args.ignore_case, out_file.name, args.metric)
        )

    prop_str = property_scores_to_string(evaluators, args.print_format, args.columns)
    if args.print_format != 'json':
        print(prop_str, file=sys.stderr)

    if args.line_by_line:
        for idx, score in enumerate(evaluators[0].line_by_line()):
            print(f'{idx}: {score}', file=sys.stderr)

    return prop_str

def main() -> None:
    """Main."""
    args = parse_args()
    cli_main(args)

if __name__ == '__main__':
    main()
evaluation/due_evaluator/__version__.py  (new file, 0 → 100644)

"""Version specification."""

VERSION = (0, 0, 8)
__version__ = '.'.join(map(str, VERSION))
evaluation/due_evaluator/due_evaluator.py  (new file, 0 → 100644)

import sys
from collections import defaultdict
from typing import Callable, DefaultDict, List, Optional, Set, Tuple, TypeVar, Union, Generic
from copy import deepcopy

from due_evaluator.scorers import AnlsScorer, BaseScorer, FScorer, MeanFScorer, WtqScorer, GevalScorer, GroupAnlsScorer

TScorer = TypeVar("TScorer", bound=BaseScorer)

class DueEvaluator:
    """Due Evaluator."""

    def __init__(
        self,
        reference: List[List[dict]],
        answers: List[List[dict]],
        property_set: Optional[Set[str]] = None,
        ignore_case: bool = False,
        path: Optional[str] = None,
        metric: Optional[str] = 'F1',
    ):
        """Initialize DueEvaluator.

        Arguments:
            reference: reference
            answers: answers to be evaluated
            separator: property name and property value separator
            property_set: if given, the score will be computed taking into account only these properties.
            ignore_case: if true, compute scores ignoring casing.
            path: Optional, the path to the evaluated files.
        """
        self.reference = reference
        self.answers = answers
        self.property_set = property_set
        self.ignore_case = ignore_case
        self.metric = metric
        self.__path = path
        self.__general_scorer, self.__property_scorers = self._evalute()

    @property
    def general_scorer(self) -> BaseScorer:
        """Get general scorer.

        Returns:
            FScorer: the general scorer.
        """
        return self.__general_scorer

    @property
    def property_scorers(self) -> DefaultDict[str, BaseScorer]:
        """Get a scorer for each property.

        Returns:
            Fscorer: the general scorer.
        """
        return self.__property_scorers

    @property
    def path(self) -> Optional[str]:
        """Return the path of the evaluated file, or None in case we are not evaluating a file.

        Returns:
            Optional[str]: the path of the evaluated file or None.
        """
        return self.__path

    def create_scorer(self) -> BaseScorer:
        scorer: BaseScorer
        if self.metric == 'F1':
            scorer = FScorer()
        elif self.metric == 'ANLS':
            scorer = AnlsScorer()
        elif self.metric == 'MEAN-F1':
            scorer = MeanFScorer()
        elif self.metric == 'WTQ':
            scorer = WtqScorer()
        elif self.metric == 'GROUP-ANLS':
            scorer = GroupAnlsScorer()
        elif self.metric == 'GEVAL':
            scorer = GevalScorer()
        else:
            raise ValueError(self.metric)
        return scorer

    def filter_properties(self, doc: dict, values: Union[str, List[str], Set[str]]) -> List[str]:
        """Filter the list of properties by provided property name(s).

        Args:
            doc: document with annotations
            values: a property name(s)

        Returns:
            doc: with filtered annotations
        """
        if isinstance(values, str):
            values = [values]
        doc_copy = deepcopy(doc)
        doc_copy['annotations'] = [a for a in doc_copy['annotations'] if a['key'] in values]
        return doc_copy

    def _evalute(self) -> Tuple[BaseScorer, DefaultDict[str, BaseScorer]]:
        """Evaluate the output file.

        Returns:
            tuple: general fscorer and a dict with fscorer per label.
        """
        label_scorers: DefaultDict[str, BaseScorer] = defaultdict(self.create_scorer)
        general_scorer = self.create_scorer()
        reference_labels: Set[str] = set()
        for ans_items, ref_items in zip(self.answers, self.reference):
            if self.ignore_case:
                ans_items = self.uppercase_items(ans_items)
                ref_items = self.uppercase_items(ref_items)

            if general_scorer.support_feature_scores():
                reference_labels |= set(a['key'] for a in ref_items['annotations'])
                for label in set(item['key'] for item in ref_items['annotations'] + ans_items['annotations']):
                    if self.property_set and label not in self.property_set:
                        continue
                    label_out = self.filter_properties(ans_items, label)
                    label_ref = self.filter_properties(ref_items, label)
                    label_scorers[label].add(label_out, label_ref)

            if general_scorer.support_feature_scores() and self.property_set:
                ans_items = self.filter_properties(ans_items, self.property_set)
                ref_items = self.filter_properties(ref_items, self.property_set)

            general_scorer.add(ans_items, ref_items)

        for label in list(label_scorers.keys()):
            if label not in reference_labels:
                del label_scorers[label]

        return general_scorer, label_scorers

    def uppercase_items(self, document: dict) -> List[str]:
        """Uppercase annotation values.

        Args:
            document: document with annotations that should be uppercased.

        Returns:
            document: with uppercased annotations.
        """
        for item in document['annotations']:
            for value_dict in item['values']:
                if 'value' in value_dict:
                    value_dict['value'] = value_dict['value'].upper()
                if 'value_variants' in value_dict:
                    value_dict['value_variants'] = [variant.upper() for variant in value_dict['value_variants']]
                if 'children' in value_dict:
                    value_dict['children'] = self.uppercase_items({'annotations': value_dict['children']})['annotations']
        return document

    def line_by_line(self):
        """Compute scores line by line.

        Returns:
            List: list with scorers.
        """
        scores = []
        for ans_items, ref_items in zip(self.answers, self.reference):
            fscorer = self.create_scorer()
            if self.ignore_case:
                ans_items = self.uppercase_items(ans_items)
                ref_items = self.uppercase_items(ref_items)
            fscorer.add(ans_items, ref_items)
            scores.append(fscorer.score())
        return scores
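Note: a minimal usage sketch of DueEvaluator on two toy documents; the data below is invented for illustration (real inputs are the due-benchmark document.jsonl records produced by due_benchmarks_eval.py).

from due_evaluator.due_evaluator import DueEvaluator

# One reference document and one answer document, in the annotation format the scorers expect.
reference = [{'name': 'doc1', 'annotations': [
    {'key': 'What is the total?', 'values': [{'value': '42', 'value_variants': ['42', '42.0']}]}]}]
answers = [{'name': 'doc1', 'annotations': [
    {'key': 'What is the total?', 'values': [{'value': '42'}]}]}]

evaluator = DueEvaluator(reference=reference, answers=answers, ignore_case=True, metric='F1')
print(evaluator.general_scorer.score())  # 1.0 for this exact match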
evaluation/due_evaluator/py.typed  (new file, 0 → 100644; empty marker file)
evaluation/due_evaluator/scorers/__init__.py  (new file, 0 → 100644)

from .anls_scorer import AnlsScorer
from .base_scorer import BaseScorer
from .fscorer import FScorer
from .mean_fscorer import MeanFScorer
from .wtq_scorer import WtqScorer
from .group_anls import GroupAnlsScorer
from .geval_scorer import GevalScorer

__all__ = ['AnlsScorer', 'BaseScorer', 'FScorer', 'MeanFScorer', 'WtqScorer', 'GevalScorer', 'GroupAnlsScorer']
evaluation/due_evaluator/scorers/accuracy_scorer.py  (new file, 0 → 100644)

import logging
from typing import List
from operator import itemgetter

from .base_scorer import BaseScorer

logger = logging.getLogger(__name__)

class AccuracyScorer(BaseScorer):
    """Accuracy Scorer."""

    def __init__(self, threshold: float = 0.5):
        self.__scores: List[float] = []
        self.threshold = threshold

    @property
    def scores(self):
        return self.__scores

    def check_denotation(self, out: list, ref: list) -> bool:
        return out == ref

    def add(self, out_items: List[dict], ref_items: List[dict]):
        """Add more items for computing corpus level scores.

        Args:
            out_items: outs from a single document (line)
            ref_items: reference of the evaluated document (line)
        """
        out_ann = sorted(out_items['annotations'], key=itemgetter('key'))
        ref_ann = sorted(ref_items['annotations'], key=itemgetter('key'))
        assert [a['key'] for a in out_ann] == [a['key'] for a in ref_ann]
        for out, ref in zip(out_ann, ref_ann):
            o_values = [v['value'] for v in out['values']]
            r_values = [v['value'] for v in ref['values']]
            score = int(self.check_denotation(o_values, r_values))
            self.__scores.append(score)

    def score(self) -> float:
        if self.__scores:
            return sum(self.__scores) / len(self.__scores)
        return 0.0

    @classmethod
    def support_feature_scores(cls) -> bool:
        return False

    @classmethod
    def metric_name(cls) -> str:
        return "Accuracy"
evaluation/due_evaluator/scorers/anls_scorer.py  (new file, 0 → 100644)

import logging
from typing import List
from operator import itemgetter

import textdistance

from due_evaluator.scorers.base_scorer import BaseScorer

logger = logging.getLogger(__name__)

class AnlsScorer(BaseScorer):
    """ANLS Scorer."""

    def __init__(self, threshold: float = 0.5):
        self.__scores: List[float] = []
        self.threshold = threshold

    @property
    def scores(self):
        return self.__scores

    def add(self, out_items: List[dict], ref_items: List[dict]):
        """Add more items for computing corpus level scores.

        Args:
            out_items: outs from a single document (line)
            ref_items: reference of the evaluated document (line)
        """
        out_ann = sorted(out_items['annotations'], key=itemgetter('key'))
        ref_ann = sorted(ref_items['annotations'], key=itemgetter('key'))
        assert [a['key'][:100] for a in out_ann] == [a['key'][:100] for a in ref_ann]
        """try:
            # assert [a['key'][:100] for a in out_ann] == [a['key'][:100] for a in ref_ann]
            out_keys = [a['key'][:100] for a in out_ann]
            ref_keys = [a['key'][:100] for a in ref_ann]
            # assert out_keys == ref_keys
            for i in range(len(out_keys)):
                try:
                    assert out_keys[i] == ref_keys[i]
                except AssertionError as e:
                    print(out_keys[i])
                    print(ref_keys[i])
                    print('==============')
                    # exit(0)
        except AssertionError as e:
            print('key of pred and gt unmatched:')
            # print('pred:', out_keys)
            # print('gt:', ref_keys)
            exit(0)"""
        for out, ref in zip(out_ann, ref_ann):
            assert len(out['values']) == 1
            val = out['values'][0]['value']
            possible_vals = ref['values'][0]['value_variants']
            best_score = max([textdistance.levenshtein.normalized_similarity(val, pos) for pos in possible_vals])
            if 1 - self.threshold >= best_score:
                best_score = 0.0
            self.__scores.append(best_score)

    def score(self) -> float:
        if self.__scores:
            return sum(self.__scores) / len(self.__scores)
        return 0.0

    @classmethod
    def support_feature_scores(cls) -> bool:
        return False

    @classmethod
    def metric_name(cls) -> str:
        return "ANLS"
evaluation/due_evaluator/scorers/base_scorer.py  (new file, 0 → 100644)

import abc
from typing import List

class BaseScorer(abc.ABC):
    """Abstract class for scorers."""

    @abc.abstractmethod
    def add(self, out_items: List[dict], ref_items: List[dict]):
        pass

    @abc.abstractmethod
    def score(self):
        pass

    @abc.abstractclassmethod
    def support_feature_scores(cls) -> bool:
        pass

    @abc.abstractclassmethod
    def metric_name(cls) -> str:
        pass
evaluation/due_evaluator/scorers/fscorer.py  (new file, 0 → 100644)

# -*- coding: utf-8 -*-
"""F1 Scorer."""
from dataclasses import dataclass, field
from typing import Any, Dict, List

from due_evaluator.scorers.base_scorer import BaseScorer

@dataclass(eq=False, frozen=True)
class Annotation:
    key: str
    value: str
    value_variants: List[str] = field(default_factory=list)

    def __eq__(self, other):
        if self.key == other.key:
            if self.value == other.value:
                return True
            elif self.value in other.value_variants:
                return True
            elif other.value in self.value_variants:
                return True
        return False

class FScorer(BaseScorer):
    """Corpus level F1 Score evaluator."""

    def __init__(self):
        """Initialize class."""
        self.__precision = []
        self.__recall = []

    @classmethod
    def from_scorers(cls, scorers: List['FScorer']) -> 'FScorer':
        """Get a new scorer that is the ensemble of the given scorers.

        Args:
            scorers: list of scorers

        Returns:
            FScorer: a new FScorer
        """
        new_scorer = cls()
        for scorer in scorers:
            new_scorer.__precision.extend(scorer.__precision)
            new_scorer.__recall.extend(scorer.__recall)
        return new_scorer

    def flatten_annotations(self, annotations: List[Dict[str, Any]]) -> List[Annotation]:
        flatten_items = []
        for annotation in annotations:
            for value in annotation['values']:
                flatten_items.append(
                    Annotation(
                        key=annotation['key'],
                        value=value['value'],
                        value_variants=value['value_variants'] if 'value_variants' in value else []))
        return flatten_items

    def add(self, out_items: Dict[str, Any], ref_items: Dict[str, Any]):
        """Add more items for computing corpus level scores.

        Args:
            out_items: outs from a single document (line)
            ref_items: reference of the evaluated document (line)
        """
        prediction_annotations = self.flatten_annotations(out_items['annotations'])
        ref_annotations = self.flatten_annotations(ref_items['annotations'])

        ref_annotations_copy = ref_annotations.copy()
        indicators = []
        for prediction in prediction_annotations:
            if prediction in ref_annotations_copy:
                indicators.append(1)
                ref_annotations_copy.remove(prediction)
            else:
                indicators.append(0)
        self.__add_to_precision(indicators)

        indicators = []
        prediction_annotations_copy = prediction_annotations.copy()
        for ref in ref_annotations:
            if ref in prediction_annotations_copy:
                indicators.append(1)
                prediction_annotations_copy.remove(ref)
            else:
                indicators.append(0)
        self.__add_to_recall(indicators)

    def __add_to_precision(self, item: List[int]):
        if isinstance(item, list):
            self.__precision.extend(item)
        else:
            self.__precision.append(item)

    def __add_to_recall(self, item: List[int]):
        if isinstance(item, list):
            self.__recall.extend(item)
        else:
            self.__recall.append(item)

    def precision(self) -> float:
        """Compute precision.

        Returns:
            float: corpus level precision
        """
        if self.__precision:
            precision = sum(self.__precision) / len(self.__precision)
        else:
            precision = 0.0
        return precision

    @property
    def precision_support(self):
        return self.__precision

    @property
    def recall_support(self):
        return self.__recall

    def recall(self) -> float:
        """Compute recall.

        Returns:
            float: corpus level recall
        """
        if self.__recall:
            recall = sum(self.__recall) / len(self.__recall)
        else:
            recall = 0.0
        return recall

    def f_score(self) -> float:
        """Compute F1 score.

        Returns:
            float: corpus level F1 score.
        """
        precision = self.precision()
        recall = self.recall()
        if precision or recall:
            fscore = 2 * precision * recall / (precision + recall)
        else:
            fscore = 0.0
        return fscore

    def false_negative(self) -> int:
        """Return the number of false negatives.

        Returns:
            int: number of false negatives.
        """
        return len(self.__recall) - sum(self.__recall)

    def false_positive(self) -> int:
        """Return the number of false positives.

        Returns:
            int: number of false positives.
        """
        return len(self.__precision) - sum(self.__precision)

    def true_positive(self) -> int:
        """Return number of true positives.

        Returns:
            int: number of true positives.
        """
        return sum(self.__precision)

    def condition_positive(self) -> int:
        """Return number of condition positives.

        Returns:
            int: number of condition positives.
        """
        return len(self.__precision)

    def score(self):
        return self.f_score()

    @classmethod
    def support_feature_scores(cls) -> bool:
        return True

    @classmethod
    def metric_name(cls) -> str:
        return "F1"
evaluation/due_evaluator/scorers/geval_scorer.py  (new file, 0 → 100644)

from typing import List
import tempfile
from collections import defaultdict
import os

from due_evaluator.scorers.fscorer import FScorer
from due_evaluator.scorers.base_scorer import BaseScorer

GEVAL_BINARY = os.getenv('GEVAL_BINARY', '/data/shared/bin/geval')
GEVAL_METRIC = os.getenv('GEVAL_METRIC', 'MultiLabel-F1:cN')

class GevalScorer(BaseScorer):
    def __init__(self):
        self.__ref = tempfile.NamedTemporaryFile('w+t')
        self.__out = tempfile.NamedTemporaryFile('w+t')
        self.__ref_data = defaultdict(set)
        self.__out_data = defaultdict(set)

    @staticmethod
    def add_to_geval_data(data, line):
        name = line['name']
        for annotation in line['annotations']:
            for idx, val in enumerate(annotation['values'], 1):
                for child in val['children']:
                    new_name = child['key'] + '__' + str(idx) if '__' in child['key'] else child['key']
                    if child['values'] and child['values'] != ['']:
                        new_value = '|'.join([v['value'].replace(' ', '_') for v in child['values']])
                        data[name].add(f'{new_name}={new_value}')

    def save_geval_files(self):
        for name in sorted(self.__ref_data.keys()):
            self.__ref.write(' '.join(self.__ref_data[name]) + '\n')
            self.__out.write(' '.join(self.__out_data[name]) + '\n')

    def add(self, out_items: List[str], ref_items: List[str]):
        self.add_to_geval_data(self.__out_data, out_items)
        self.add_to_geval_data(self.__ref_data, ref_items)

    def support_feature_scores(cls) -> bool:
        return False

    def metric_name(cls) -> str:
        return "GEVAL"

    def run_geval(self):
        self.__ref.flush()
        self.__out.flush()
        try:
            return float(os.popen(
                f'{GEVAL_BINARY} -o {self.__out.name} -e {self.__ref.name} --metric {GEVAL_METRIC}').read())
        except:
            return -1

    def score(self) -> float:
        self.save_geval_files()
        return self.run_geval()
evaluation/due_evaluator/scorers/group_anls.py  (new file, 0 → 100644)

from typing import Any, List, Dict
import itertools
from dataclasses import dataclass, field

import numpy as np
from scipy.optimize import linear_sum_assignment
import textdistance

from due_evaluator.scorers.fscorer import FScorer
from due_evaluator.scorers.base_scorer import BaseScorer

@dataclass(eq=False, frozen=True)
class FuzzyAnnotation:
    key: str
    value: str
    value_variants: List[str] = field(default_factory=list)

    def __eq__(self, other):
        def _is_float(val):
            try:
                float(val)
            except ValueError:
                return False
            return True

        def _comp(val, pos) -> float:
            if _is_float(val) or _is_float(pos):
                return float(val == pos)
            return textdistance.levenshtein.normalized_similarity(val, pos)

        def _is_acceptable(val, possible_vals, threshold=.5):
            best_score = max([_comp(val, pos) for pos in possible_vals] + [0.])
            return best_score >= threshold

        if self.key == other.key:
            if _is_acceptable(other.value, [self.value]):
                return True
            elif _is_acceptable(self.value, other.value_variants):
                return True
            elif _is_acceptable(other.value, self.value_variants):
                return True
        return False

class FuzzyFScorer(FScorer):
    def flatten_annotations(self, annotations: List[Dict[str, Any]]) -> List[FuzzyAnnotation]:
        flatten_items = []
        for annotation in annotations:
            for value in annotation['values']:
                flatten_items.append(
                    FuzzyAnnotation(
                        key=annotation['key'],
                        value=value['value'],
                        value_variants=value['value_variants'] if 'value_variants' in value else []))
        return flatten_items

class GroupAnlsScorer(BaseScorer):
    def __init__(self):
        self.__inner_scorer = FuzzyFScorer()

    def pseudo_documents(self, doc: dict) -> List[dict]:
        docs = []
        for ann in doc['annotations']:
            for val in ann['values']:
                assert 'children' in val
                docs.append({
                    'name': '',
                    'annotations': val['children']
                })
        return docs

    def best_permutation(self, out_items: List[dict], ref_items: List[dict]):
        out_items = self.pseudo_documents(out_items)
        ref_items = self.pseudo_documents(ref_items)
        target_length = max(len(out_items), len(ref_items))
        out_items = self.pad(out_items, target_length)
        ref_items = self.pad(ref_items, target_length)

        matrix = []
        for o in out_items:
            row = []
            for ri, r in enumerate(ref_items):
                fscorer = FuzzyFScorer()
                fscorer.add(o, r)
                row.append(1 - fscorer.f_score())
            matrix.append(row)
        row_ind, col_ind = linear_sum_assignment(np.array(matrix))
        best_out = [out_items[i] for i in row_ind]
        best_ref = [ref_items[i] for i in col_ind]
        return (best_out, best_ref)

    def pad(self, items: List[dict], target_length: int):
        for _ in range(target_length - len(items)):
            items.append({'name': '', 'annotations': []})
        return items

    def add(self, out_items: List[str], ref_items: List[str]):
        if len(self.pseudo_documents(out_items)) == 0 and len(self.pseudo_documents(ref_items)) == 0:
            return
        out_perm, ref_perm = self.best_permutation(out_items, ref_items)
        for o, r in zip(out_perm, ref_perm):
            self.__inner_scorer.add(o, r)

    def support_feature_scores(cls) -> bool:
        return False

    def metric_name(cls) -> str:
        return "GROUP-ANLS"

    def score(self) -> float:
        return self.__inner_scorer.score()
evaluation/due_evaluator/scorers/mean_fscorer.py  (new file, 0 → 100644)

from typing import List

from due_evaluator.scorers.fscorer import FScorer
from due_evaluator.scorers.base_scorer import BaseScorer

class MeanFScorer(BaseScorer):
    def __init__(self):
        self.__scores: List[float] = []

    def add(self, out_items: List[str], ref_items: List[str]):
        fscorer = FScorer()
        fscorer.add(out_items, ref_items)
        self.__scores.append(fscorer.f_score())

    def support_feature_scores(cls) -> bool:
        return False

    def metric_name(cls) -> str:
        return "MEAN-F1"

    def score(self) -> float:
        if self.__scores:
            return sum(self.__scores) / len(self.__scores)
        return 0.0
evaluation/due_evaluator/scorers/wtq_scorer.py  (new file, 0 → 100644)

"""
Based on the official implementation from:
https://github.com/ppasupat/WikiTableQuestions/blob/master/evaluator.py
"""
import logging
from typing import List
from operator import itemgetter
import re
from math import isnan, isinf
from abc import ABCMeta, abstractmethod
import unicodedata

from due_evaluator.scorers.accuracy_scorer import AccuracyScorer

logger = logging.getLogger(__name__)

def normalize(x):
    # Remove diacritics
    x = ''.join(c for c in unicodedata.normalize('NFKD', x) if unicodedata.category(c) != 'Mn')
    # Normalize quotes and dashes
    x = re.sub(r"[‘’´`]", "'", x)
    x = re.sub(r"[“”]", "\"", x)
    x = re.sub(r"[‐‑‒–—−]", "-", x)
    while True:
        old_x = x
        # Remove citations
        x = re.sub(r"((?<!^)\[[^\]]*\]|\[\d+\]|[•♦†‡*#+])*$", "", x.strip())
        # Remove details in parenthesis
        x = re.sub(r"(?<!^)( \([^)]*\))*$", "", x.strip())
        # Remove outermost quotation mark
        x = re.sub(r'^"([^"]*)"$', r'\1', x.strip())
        if x == old_x:
            break
    # Remove final '.'
    if x and x[-1] == '.':
        x = x[:-1]
    # Collapse whitespaces and convert to lower case
    x = re.sub(r'\s+', ' ', x, flags=re.U).lower().strip()
    return x

class Value(object):
    __metaclass__ = ABCMeta

    # Should be populated with the normalized string
    _normalized = None

    @abstractmethod
    def match(self, other):
        """Return True if the value matches the other value.

        Args:
            other (Value)
        Returns:
            a boolean
        """
        pass

    @property
    def normalized(self):
        return self._normalized

class StringValue(Value):

    def __init__(self, content):
        assert isinstance(content, str)
        self._normalized = normalize(content)
        self._hash = hash(self._normalized)

    def __eq__(self, other):
        return isinstance(other, StringValue) and self.normalized == other.normalized

    def __hash__(self):
        return self._hash

    def __str__(self):
        return 'S' + str([self.normalized])
    __repr__ = __str__

    def match(self, other):
        assert isinstance(other, Value)
        return self.normalized == other.normalized

class NumberValue(Value):

    def __init__(self, amount, original_string=None):
        assert isinstance(amount, (int, float))
        if abs(amount - round(amount)) < 1e-6:
            self._amount = int(amount)
        else:
            self._amount = float(amount)
        if not original_string:
            self._normalized = str(self._amount)
        else:
            self._normalized = normalize(original_string)
        self._hash = hash(self._amount)

    @property
    def amount(self):
        return self._amount

    def __eq__(self, other):
        return isinstance(other, NumberValue) and self.amount == other.amount

    def __hash__(self):
        return self._hash

    def __str__(self):
        return ('N(%f)' % self.amount) + str([self.normalized])
    __repr__ = __str__

    def match(self, other):
        assert isinstance(other, Value)
        if self.normalized == other.normalized:
            return True
        if isinstance(other, NumberValue):
            return abs(self.amount - other.amount) < 1e-6
        return False

    @staticmethod
    def parse(text):
        """Try to parse into a number.

        Return:
            the number (int or float) if successful; otherwise None.
        """
        try:
            return int(text)
        except:
            try:
                amount = float(text)
                assert not isnan(amount) and not isinf(amount)
                return amount
            except:
                return None

class DateValue(Value):

    def __init__(self, year, month, day, original_string=None):
        """Create a new DateValue. Placeholders are marked as -1."""
        assert isinstance(year, int)
        assert isinstance(month, int) and (month == -1 or 1 <= month <= 12)
        assert isinstance(day, int) and (day == -1 or 1 <= day <= 31)
        assert not (year == month == day == -1)
        self._year = year
        self._month = month
        self._day = day
        if not original_string:
            self._normalized = '{}-{}-{}'.format(
                year if year != -1 else 'xx',
                month if month != -1 else 'xx',
                day if day != -1 else 'xx')
        else:
            self._normalized = normalize(original_string)
        self._hash = hash((self._year, self._month, self._day))

    @property
    def ymd(self):
        return (self._year, self._month, self._day)

    def __eq__(self, other):
        return isinstance(other, DateValue) and self.ymd == other.ymd

    def __hash__(self):
        return self._hash

    def __str__(self):
        return (('D(%d,%d,%d)' % (self._year, self._month, self._day)) + str([self._normalized]))
    __repr__ = __str__

    def match(self, other):
        assert isinstance(other, Value)
        if self.normalized == other.normalized:
            return True
        if isinstance(other, DateValue):
            return self.ymd == other.ymd
        return False

    @staticmethod
    def parse(text):
        """Try to parse into a date.

        Return:
            tuple (year, month, date) if successful; otherwise None.
        """
        try:
            ymd = text.lower().split('-')
            assert len(ymd) == 3
            year = -1 if ymd[0] in ('xx', 'xxxx') else int(ymd[0])
            month = -1 if ymd[1] == 'xx' else int(ymd[1])
            day = -1 if ymd[2] == 'xx' else int(ymd[2])
            assert not (year == month == day == -1)
            assert month == -1 or 1 <= month <= 12
            assert day == -1 or 1 <= day <= 31
            return (year, month, day)
        except:
            return None

class WtqScorer(AccuracyScorer):
    """WTQ Scorer."""

    def __init__(self, threshold: float = 0.5):
        self.__scores: List[float] = []
        self.threshold = threshold

    @property
    def scores(self):
        return self.__scores

    def to_value(self, original_string, corenlp_value=None):
        """Convert the string to Value object.

        Args:
            original_string (str): Original string
            corenlp_value (str): Optional value returned from CoreNLP
        Returns:
            Value
        """
        if isinstance(original_string, Value):
            # Already a Value
            return original_string
        if not corenlp_value:
            corenlp_value = original_string
        # Number?
        amount = NumberValue.parse(corenlp_value)
        if amount is not None:
            return NumberValue(amount, original_string)
        # Date?
        ymd = DateValue.parse(corenlp_value)
        if ymd is not None:
            if ymd[1] == ymd[2] == -1:
                return NumberValue(ymd[0], original_string)
            else:
                return DateValue(ymd[0], ymd[1], ymd[2], original_string)
        # String.
        return StringValue(original_string)

    def to_value_list(self, original_strings, corenlp_values=None):
        """Convert a list of strings to a list of Values

        Args:
            original_strings (list[str])
            corenlp_values (list[str or None])
        Returns:
            list[Value]
        """
        assert isinstance(original_strings, (list, tuple, set))
        if corenlp_values is not None:
            assert isinstance(corenlp_values, (list, tuple, set))
            assert len(original_strings) == len(corenlp_values)
            return list(set(self.to_value(x, y) for (x, y) in zip(original_strings, corenlp_values)))
        else:
            return list(set(self.to_value(x) for x in original_strings))

    def check_denotation(self, predicted_values: list, target_values: list):
        """Return True if the predicted denotation is correct.

        Args:
            predicted_values (list[Value])
            target_values (list[Value])
        Returns:
            bool
        """
        target_values = self.to_value_list(target_values)
        predicted_values = self.to_value_list(predicted_values)
        # Check size
        if len(target_values) != len(predicted_values):
            return False
        # Check items
        for target in target_values:
            if not any(target.match(pred) for pred in predicted_values):
                return False
        return True

    def add(self, out_items: List[dict], ref_items: List[dict]):
        """Add more items for computing corpus level scores.

        Args:
            out_items: outs from a single document (line)
            ref_items: reference of the evaluated document (line)
        """
        out_ann = sorted(out_items['annotations'], key=itemgetter('key'))
        ref_ann = sorted(ref_items['annotations'], key=itemgetter('key'))
        assert [a['key'][:100] for a in out_ann] == [a['key'][:100] for a in ref_ann]
        for out, ref in zip(out_ann, ref_ann):
            o_values = [v['value'] for v in out['values']]
            r_values = [v['value'] for v in ref['values']]
            score = int(self.check_denotation(o_values, r_values))
            self.__scores.append(score)

    def score(self) -> float:
        if self.__scores:
            return sum(self.__scores) / len(self.__scores)
        return 0.0

    @classmethod
    def support_feature_scores(cls) -> bool:
        return False

    @classmethod
    def metric_name(cls) -> str:
        return "WTQ"
evaluation/due_evaluator/utils.py
0 → 100644
View file @
58d33d4c
from
due_evaluator.scorers.fscorer
import
FScorer
from
typing
import
Dict
,
List
,
Optional
,
Sequence
,
Union
import
pandas
as
pd
from
due_evaluator.due_evaluator
import
DueEvaluator
def
dataframe_to_print
(
df
:
pd
.
DataFrame
,
print_format
:
Optional
[
str
]
=
'text'
)
->
str
:
"""Export dataframe to json or plain text.
Args:
df (pd.DataFrame): data
print_format (str, optional): Print format. Defaults to 'text'.
Raises:
ValueError: unknown print_format
Returns:
str: printed version of dataframe
"""
out
:
str
if
print_format
==
'latex'
:
out
=
df
.
reset_index
().
to_latex
(
index
=
False
)
elif
print_format
==
'text'
:
out
=
df
.
reset_index
().
to_string
(
index
=
False
)
elif
print_format
==
'json'
:
out
=
df
.
to_json
(
orient
=
'index'
)
else
:
raise
ValueError
()
return
out
def property_scores_to_string(
    dues: List[DueEvaluator],
    print_format: str = 'text',
    columns: Sequence[str] = ('Precision', 'Recall', 'F-1'),
) -> str:
    """Print out scores per property.

    Args:
        dues: List of DueEvaluators
        print_format: output format: text or latex
        columns: a list of metrics to print

    Returns:
        str: string table with feature scores.
    """
    data = []
    for property_name in sorted(dues[0].property_scorers.keys()) + ['ALL']:
        row_data: Dict[str, Union[str, float]] = {}
        row_data['Label'] = property_name
        for due in dues:
            if len(dues) == 1:
                suffix = ''
            else:
                suffix = f' ({due.path})'
            if property_name == 'ALL':
                scorer = due.general_scorer
            else:
                scorer = due.property_scorers[property_name]
            row_data[scorer.metric_name() + suffix] = scorer.score()
            if isinstance(scorer, FScorer):
                if 'Precision' in columns:
                    row_data['Precision' + suffix] = scorer.precision()
                if 'Recall' in columns:
                    row_data['Recall' + suffix] = scorer.recall()
        data.append(row_data)
    df = pd.DataFrame(data)
    df.set_index('Label', drop=True, inplace=True)
    return dataframe_to_print(df, print_format)
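# Illustrative output shape (note added for clarity, not in the original file): the
# returned string renders a table indexed by 'Label', with one row per property plus
# an 'ALL' row, and columns named after each scorer's metric_name(); for F-score based
# scorers, 'Precision' and 'Recall' columns are added when requested, each suffixed
# with the evaluator path whenever more than one DueEvaluator is passed.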
evaluation/evaluator.py
0 → 100644
View file @
58d33d4c
import collections
import itertools
from typing import Any, Callable, Dict, List, Mapping, Optional, Sequence, Tuple, Union
from icecream import ic
import re
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
from pycocoevalcap.cider.cider import Cider
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.meteor.meteor import Meteor
import editdistance

"""
This script supports:
    ANLS for DocVQA
    RelaxedAccuracy for ChartQA
    ContainAccuracy for MultimodalOCR LLM zero-shot text recognition
"""
def anls_metric(target: str, prediction: str, theta: float = 0.5):
    """Calculates ANLS for DocVQA.

    There does not seem to be an official evaluation script.
    Public implementation on which this implementation is based:
    https://github.com/herobd/layoutlmv2/blob/main/eval_docvqa.py#L92
    Original paper (see Eq 1): https://arxiv.org/pdf/1907.00490.pdf

    Args:
        target: Target string.
        prediction: Predicted string.
        theta: Filter threshold set to 0.5 for DocVQA.

    Returns:
        ANLS score.
    """
    edit_distance = editdistance.eval(target, prediction)
    normalized_ld = edit_distance / max(len(target), len(prediction))
    return 1.0 - normalized_ld if normalized_ld < theta else 0.0
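# Worked example (illustrative, not part of the original file):
#   anls_metric("receipt", "receipts") -> edit distance 1, normalized 1/8 = 0.125,
#   which is below theta = 0.5, so the score is 1 - 0.125 = 0.875; any pair whose
#   normalized distance reaches theta scores 0.0.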
def relaxed_correctness(target: str, prediction: str, max_relative_change: float = 0.05) -> bool:
    """Calculates relaxed correctness.

    The correctness tolerates certain error ratio defined by max_relative_change.
    See https://arxiv.org/pdf/2203.10244.pdf, end of section 5.1:
    “Following Methani et al. (2020), we use a relaxed accuracy measure for the
    numeric answers to allow a minor inaccuracy that may result from the automatic
    data extraction process. We consider an answer to be correct if it is within
    5% of the gold answer. For non-numeric answers, we still need an exact match
    to consider an answer to be correct.”

    Args:
        target: Target string.
        prediction: Predicted string.
        max_relative_change: Maximum relative change.

    Returns:
        Whether the prediction was correct given the specified tolerance.
    """
    def _to_float(text: str) -> Optional[float]:
        try:
            if text.endswith("%"):
                # Convert percentages to floats.
                return float(text.rstrip("%")) / 100.0
            else:
                return float(text)
        except ValueError:
            return None

    prediction_float = _to_float(prediction)
    target_float = _to_float(target)
    if prediction_float is not None and target_float:
        relative_change = abs(prediction_float - target_float) / abs(target_float)
        return float(relative_change <= max_relative_change)
    else:
        return float(prediction.lower() == target.lower())
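# Worked example (illustrative, not part of the original file):
#   relaxed_correctness("100", "104") -> |104 - 100| / 100 = 0.04 <= 0.05 -> 1.0
#   relaxed_correctness("100", "106") -> 0.06 > 0.05 -> 0.0
# Non-numeric answers (and a target that parses to 0) fall back to a
# case-insensitive exact string match.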
def exact_match(target: str, prediction: str):
    return float(target == prediction)
def iou_match(target: list, prediction: list, threshold=0.5):
    """
    target/prediction: normalized bbox (list(float)), xyxy
    """
    g_x1, g_y1, g_x2, g_y2 = target
    p_x1, p_y1, p_x2, p_y2 = prediction
    g_w = g_x2 - g_x1
    p_w = p_x2 - p_x1
    g_h = g_y2 - g_y1
    p_h = p_y2 - p_y1
    W = min(g_x2, p_x2) - max(g_x1, p_x1)
    H = min(g_y2, p_y2) - max(g_y1, p_y1)
    # No overlap if either side of the intersection is non-positive; checking the
    # sides separately avoids treating a negative-times-negative product as a
    # positive intersection.
    if W <= 0 or H <= 0:
        return 0.0
    Intersection = W * H
    Union = g_w * g_h + p_w * p_h - Intersection
    # ic(W, H, Intersection, Union)
    if Intersection / Union >= threshold:
        return 1.0
    else:
        return 0.0
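# Worked example (illustrative, not part of the original file): for
#   target = [0.0, 0.0, 0.5, 0.5] and prediction = [0.0, 0.0, 0.5, 1.0]
# the intersection is 0.5 * 0.5 = 0.25 and the union is 0.25 + 0.5 - 0.25 = 0.5,
# so IoU = 0.5, which meets the default threshold and returns 1.0.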
def remove_special_chars_and_lower(s):
    pattern = r"[^a-zA-Z0-9\s]"
    # print('raw:', s)
    s = re.sub(pattern, "", s)
    # print('new:', s)
    return s.lower()
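# Example (illustrative, not part of the original file):
#   remove_special_chars_and_lower("Hello, World!") -> "hello world"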
def contain_match(target: str, prediction: str):
    def has_word(sentence, word):
        pattern = r"\b" + re.escape(word) + r"\b"
        match = re.search(pattern, sentence)
        if match:
            return True
        else:
            return False

    # print(prediction, target, float(has_word(prediction, target)))
    return float(has_word(prediction, target))
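# Example (illustrative, not part of the original file): after normalization,
# contain_match("paris", "the answer is paris") returns 1.0 because "paris" occurs as a
# whole word, while contain_match("par", "the answer is paris") returns 0.0 since the
# \b word boundaries reject partial-word hits.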
def cider(targets: Sequence[Sequence[str]], predictions: Sequence[str]) -> float:
    """Compute CIDEr score."""
    coco_tokenizer = PTBTokenizer()
    scorer = Cider()
    score, scores = scorer.compute_score(
        gts=coco_tokenizer.tokenize({
            str(i): [{"caption": t} for t in target]
            for i, target in enumerate(targets)
        }),
        res=coco_tokenizer.tokenize({
            str(i): [{"caption": prediction}]
            for i, prediction in enumerate(predictions)
        }))
    score = float(score) * 100.0
    scores = [float(s) * 100.0 for s in scores.tolist()]
    return score, scores
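# Note (added for clarity, not in the original file): cider and the rouge/meteor/bleu
# helpers below all feed PTBTokenizer the same structure -- a dict keyed by the sample
# index as a string, mapping to [{"caption": text}] entries (multiple references per
# sample for gts, a single hypothesis for res) -- and return
# (corpus_score, per_sample_scores) on a 0-100 scale.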
def rouge(targets: Sequence[Sequence[str]], predictions: Sequence[str]) -> float:
    """Compute ROUGE-L score."""
    coco_tokenizer = PTBTokenizer()
    scorer = Rouge()
    score, scores = scorer.compute_score(
        gts=coco_tokenizer.tokenize({
            str(i): [{"caption": t} for t in target]
            for i, target in enumerate(targets)
        }),
        res=coco_tokenizer.tokenize({
            str(i): [{"caption": prediction}]
            for i, prediction in enumerate(predictions)
        }))
    score = float(score) * 100.0
    scores = [float(s) * 100.0 for s in scores.tolist()]
    return score, scores
def meteor(targets: Sequence[Sequence[str]], predictions: Sequence[str]) -> float:
    """Compute METEOR score."""
    coco_tokenizer = PTBTokenizer()
    scorer = Meteor()
    score, scores = scorer.compute_score(
        gts=coco_tokenizer.tokenize({
            str(i): [{"caption": t} for t in target]
            for i, target in enumerate(targets)
        }),
        res=coco_tokenizer.tokenize({
            str(i): [{"caption": prediction}]
            for i, prediction in enumerate(predictions)
        }))
    score = float(score) * 100.0
    scores = [float(s) * 100.0 for s in scores]
    return score, scores
def bleu(ngram: int, targets: Sequence[Sequence[str]], predictions: Sequence[str]) -> float:
    """Compute BLEU score."""
    assert ngram <= 4
    coco_tokenizer = PTBTokenizer()
    scorer = Bleu(4)
    score, scores = scorer.compute_score(
        gts=coco_tokenizer.tokenize({
            str(i): [{"caption": t} for t in target]
            for i, target in enumerate(targets)
        }),
        res=coco_tokenizer.tokenize({
            str(i): [{"caption": prediction}]
            for i, prediction in enumerate(predictions)
        }))
    score = score[ngram - 1]
    scores = scores[ngram - 1]
    # ic(score)
    # ic(scores)
    score = float(score) * 100.0
    scores = [float(s) * 100.0 for s in scores]
    return score, scores
def metric_calculate(
        targets: Sequence[Sequence[str]],
        predictions: Sequence[str],
        metric_fn: Callable[[str, str], Any],
        normalize_fn: Callable[[str], str] = lambda v: v):
    """Aggregate target-prediction pair metrics over a dataset."""
    assert len(targets) == len(predictions)
    total = 0
    scores = []
    for prediction, target in zip(predictions, targets):
        p = normalize_fn(prediction)
        score = max(metric_fn(normalize_fn(t), p) for t in target)
        scores.append(score)
        total += score
    score = (100.0 * total) / len(targets)
    return score, scores
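# Example (illustrative, not part of the original file): with
#   targets = [["blue", "navy blue"]] and predictions = ["navy blue"],
# metric_calculate(targets, predictions, metric_fn=exact_match) keeps the best score
# over the acceptable answers for each sample (here 1.0) and returns
# (100.0 * total / len(targets), per_sample_scores) = (100.0, [1.0]).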
def doc_evaluate(metric: str, targets: Sequence[Sequence[str]], predictions: Sequence[str]):
    """Calculates an evaluation metric over a dataset.

    Args:
        metric: metric name.
        targets: list of list of strings.
        predictions: list of strings.

    Returns:
        a tuple (corpus-level score, list of per-sample scores).
    """
    assert metric in [
        'ExactAccuracy', 'RelaxedAccuracy', 'ANLS', 'ContainAccuracy',
        'CIDEr', 'BLEU1', 'BLEU2', 'BLEU3', 'BLEU4', 'RougeL', 'Meteor', 'IOU@0.5'
    ]
    if metric == 'ExactAccuracy':
        # case sensitive
        score, scores = metric_calculate(targets, predictions, metric_fn=exact_match)
    elif metric == 'IOU@0.5':
        score, scores = metric_calculate(targets, predictions, metric_fn=iou_match)
    elif metric == 'ANLS':
        score, scores = metric_calculate(targets, predictions, metric_fn=anls_metric,
                                         normalize_fn=lambda v: v.lower())
    elif metric == 'RelaxedAccuracy':
        score, scores = metric_calculate(targets, predictions, metric_fn=relaxed_correctness)
    elif metric == 'ContainAccuracy':
        score, scores = metric_calculate(targets, predictions, metric_fn=contain_match,
                                         normalize_fn=remove_special_chars_and_lower)
    elif metric == 'CIDEr':
        score, scores = cider(targets, predictions)
    elif metric == 'BLEU1':
        score, scores = bleu(1, targets, predictions)
    elif metric == 'BLEU2':
        score, scores = bleu(2, targets, predictions)
    elif metric == 'BLEU3':
        score, scores = bleu(3, targets, predictions)
    elif metric == 'BLEU4':
        score, scores = bleu(4, targets, predictions)
    elif metric == 'RougeL':
        score, scores = rouge(targets, predictions)
    elif metric == 'Meteor':
        score, scores = meteor(targets, predictions)
    return score, scores
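# Usage sketch (illustrative, not part of the original file):
#   score, scores = doc_evaluate('ANLS', targets=[['receipt']], predictions=['receipts'])
#   # score == 87.5 on the 0-100 scale, scores == [0.875]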