ModelZoo / Qwen2-VL_pytorch / Commits

Commit bc5ebf0f, authored Dec 27, 2024 by luopl

Initial commit

Pipeline #2167 canceled with stages (Changes: 260, Pipelines: 1)
Showing 20 changed files with 6861 additions and 0 deletions (+6861, -0)
VLMEvalKit/vlmeval/api/taichu.py               +217   -0
VLMEvalKit/vlmeval/api/taiyi.py                +192   -0
VLMEvalKit/vlmeval/config.py                   +410   -0
VLMEvalKit/vlmeval/dataset/__init__.py         +233   -0
VLMEvalKit/vlmeval/dataset/cmmmu.py            +354   -0
VLMEvalKit/vlmeval/dataset/dude.py             +211   -0
VLMEvalKit/vlmeval/dataset/dynamath.py         +240   -0
VLMEvalKit/vlmeval/dataset/image_base.py       +172   -0
VLMEvalKit/vlmeval/dataset/image_caption.py    +75    -0
VLMEvalKit/vlmeval/dataset/image_mcq.py        +899   -0
VLMEvalKit/vlmeval/dataset/image_mt.py         +128   -0
VLMEvalKit/vlmeval/dataset/image_vqa.py        +1330  -0
VLMEvalKit/vlmeval/dataset/image_yorn.py       +95    -0
VLMEvalKit/vlmeval/dataset/longvideobench.py   +328   -0
VLMEvalKit/vlmeval/dataset/miabench.py         +167   -0
VLMEvalKit/vlmeval/dataset/mlvu.py             +455   -0
VLMEvalKit/vlmeval/dataset/mmbench_video.py    +256   -0
VLMEvalKit/vlmeval/dataset/mmgenbench.py       +69    -0
VLMEvalKit/vlmeval/dataset/mmlongbench.py      +584   -0
VLMEvalKit/vlmeval/dataset/mmmath.py           +446   -0
VLMEvalKit/vlmeval/api/taichu.py  (new file, 0 → 100644)
from vlmeval.smp import *
from vlmeval.api.base import BaseAPI
import os
import re
import json
from PIL import Image
import base64
from io import BytesIO


class ChatResponse(dict):

    def __getattr__(self, name):
        value = self.get(name)
        if isinstance(value, dict):
            return ChatResponse(value)  # if the value is a dict, recursively wrap it (DotDict-style)
        elif isinstance(value, list):
            return [ChatResponse(v) if isinstance(v, dict) else v for v in value]  # if the value is a list, wrap any dicts inside it
        return value

    def __setattr__(self, name, value):
        self[name] = value

    def __delattr__(self, name):
        del self[name]


from ..dataset import DATASET_TYPE


class TaichuVLWrapper(BaseAPI):

    is_api: bool = True

    def __init__(self,
                 model: str = 'Taichu-VL-2B',
                 retry: int = 5,
                 wait: int = 5,
                 verbose: bool = True,
                 temperature: float = 0.0,
                 system_prompt: str = None,
                 max_tokens: int = 4096,
                 key: str = None,
                 url: str = None,
                 **kwargs):
        self.model = model
        self.kwargs = kwargs
        self.max_tokens = max_tokens
        self.system_prompt = '[sys]You are a helpful assistant.[/sys]'
        self.hint_prompt = '|<Hint>|'
        self.mcq_prompt = '|<MCQ>|'
        self.datasets_use_system = ['MMVet']
        self.datasets_use_multichoice = ['MathVista', 'MathVision']
        openai_key = os.environ.get('OPENAI_API_KEY', None)
        use_openai = os.environ.get('USE_OPENAI_EVAL', True)
        self.use_openai_evaluate = (isinstance(openai_key, str) and openai_key.startswith('sk-') and use_openai)
        self.api_key = os.environ.get('TAICHU_API_KEY', key)
        self.api_url = url
        assert self.api_key is not None, 'Please set the API Key'
        super().__init__(wait=wait, retry=retry, system_prompt=self.system_prompt, verbose=verbose, **kwargs)

    def set_dump_image(self, dump_image_func):
        self.dump_image_func = dump_image_func

    def dump_image(self, line, dataset):
        return self.dump_image_func(line)

    def use_custom_prompt(self, dataset):
        if listinstr(['MCQ', 'VQA'], DATASET_TYPE(dataset)):
            return True
        elif dataset is not None and listinstr(['HallusionBench'], dataset):
            return True
        return False

    def clear_prompt(self, prompt):
        prompt = re.sub(r"Hint:.*?Question:", "", prompt, flags=re.S).strip()
        prompt = re.sub(r"\nChoices:\n.*", "", prompt, flags=re.S).strip()
        return prompt

    def encode_image(self, pil_image):
        buffer = BytesIO()
        pil_image.save(buffer, format='PNG')
        base64_str = base64.b64encode(buffer.getvalue()).decode("utf-8")
        return base64_str

    def build_prompt(self, line, dataset=None):
        if isinstance(line, int):
            line = self.data.iloc[line]

        tgt_path = self.dump_image(line, dataset)
        question = line['question']
        hint = None

        if listinstr(self.datasets_use_system, dataset):
            system_prompt = self.system_prompt
        else:
            system_prompt = ''

        mcq = False
        if DATASET_TYPE(dataset) == 'MCQ' or listinstr(self.datasets_use_multichoice, dataset):
            options = {
                cand: line[cand]
                for cand in string.ascii_uppercase
                if cand in line and not pd.isna(line[cand])
            }
            if listinstr(self.datasets_use_multichoice, dataset):
                options = {}
                if not pd.isna(line['choices']):
                    for i, c in enumerate(eval(line['choices'])):
                        options[string.ascii_uppercase[i]] = c
                question = self.clear_prompt(question)
            # support chinese
            if listinstr(['_CN', '_cn'], dataset):
                options_prompt = '\n选项:\n'
            else:
                options_prompt = '\nOPTIONS:\n'
            options_prompt += '\n'.join(f"{key}: {value}" for key, value in options.items())
            hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
            mcq = True if len(options) else False
            if len(options):
                prompt = question + options_prompt
            else:
                prompt = question
        else:
            prompt = question

        msgs = []
        if system_prompt:
            msgs.append(dict(type='text', value=system_prompt))
        if isinstance(tgt_path, list):
            msgs.extend([dict(type='image', value=p) for p in tgt_path])
        else:
            msgs.append(dict(type='image', value=tgt_path))
        if hint:
            prompt = 'Hint: ' + hint + '\n' + prompt
        msgs.append(dict(type='text', value=prompt))
        if mcq:
            msgs.append(dict(type='text', value=self.mcq_prompt))
        return msgs

    def prompt_to_request_messages(self, inputs):
        messages = [
            {'role': 'user', 'content': []}
        ]
        is_mcq = False
        for x in inputs:
            if x['type'] == 'text':
                if x['value'] == self.system_prompt:
                    messages = [{'role': 'system', 'content': [{"type": "text", "text": x['value']}]}] + messages
                elif self.mcq_prompt == x['value']:
                    is_mcq = True
                else:
                    messages[-1]['content'].append(
                        {"type": "text", "text": x['value']},
                    )
            if x['type'] == 'image':
                _url = self.encode_image(Image.open(x['value']))
                messages[-1]['content'].append(
                    {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{_url}"}},
                )
            else:
                continue
        return messages, is_mcq

    def generate_inner(self, inputs, **kwargs) -> str:
        messages, is_mcq = self.prompt_to_request_messages(inputs)

        data = {
            "model": self.model,
            "messages": messages,
            "max_tokens": self.max_tokens,
            "temperature": 0,
            "top_p": 0.8,
            "stream": False,
            "extra_body": {"repetition_penalty": 1}
        }
        headers = {'Authorization': self.api_key, 'Content-Type': 'application/json'}
        try:
            chat_response = requests.post(self.api_url, json=data, headers=headers)
            response = ChatResponse(json.loads(chat_response.content))
            result = response.choices[0].message.content
            # Extract the option index for exact matching when ChatGPT is unavailable.
            if self.use_openai_evaluate is False and is_mcq is True:
                try:
                    result = result[0]
                except:
                    result = 'A'
            return 0, result, 'Succeeded! '
        except Exception as err:
            if self.verbose:
                self.logger.error(f'{type(err)}: {err}')
                self.logger.error(f'The input messages are {inputs}.')
            return -1, '', ''


class TaichuVLAPI(TaichuVLWrapper):

    def generate(self, message, dataset=None):
        return super(TaichuVLAPI, self).generate(message, dataset=dataset)
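
Note: the snippet below is an illustrative sketch, not part of the committed file. It shows the behaviour the ChatResponse wrapper is written for, attribute-style access into a nested chat-completion payload, which is how generate_inner reads response.choices[0].message.content. The payload dict is a made-up example, and the sketch assumes the vlmeval package is importable.

    # Illustrative sketch (not part of the commit): ChatResponse wraps nested dicts/lists
    # so the JSON returned by the Taichu endpoint can be read with attribute access.
    from vlmeval.api.taichu import ChatResponse

    payload = {'choices': [{'message': {'role': 'assistant', 'content': 'B'}}]}   # hypothetical API reply
    resp = ChatResponse(payload)
    print(resp.choices[0].message.content)   # -> 'B'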
VLMEvalKit/vlmeval/api/taiyi.py  (new file, 0 → 100644)
from vlmeval.smp import *
from vlmeval.api.base import BaseAPI
from vlmeval.dataset import DATASET_TYPE, img_root_map


class TaiyiWrapper(BaseAPI):

    is_api: bool = True

    def __init__(self,
                 model: str = 'taiyi',
                 retry: int = 5,
                 wait: int = 5,
                 key: str = None,
                 verbose: bool = False,
                 system_prompt: str = None,
                 temperature: float = 0,
                 timeout: int = 60,
                 url: str = "https://taiyi.megvii.com/v1/chat/completions",
                 max_tokens: int = 1024,
                 **kwargs):

        self.model = model
        self.fail_msg = 'Failed to obtain answer via API. '
        self.max_tokens = max_tokens
        self.temperature = temperature

        if key is None:
            key = os.environ.get('TAIYI_API_KEY', None)
        assert key is not None, ('Please set the API Key ')
        self.key = key
        self.timeout = timeout

        super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)

        assert url is not None, ('Please set the url ')
        self.url = url
        self.logger.info(f'Using url: {self.url}; API Key: {self.key}')

    def use_custom_prompt(self, dataset):
        if DATASET_TYPE(dataset) == 'Y/N' or DATASET_TYPE(dataset) == 'MCQ' or DATASET_TYPE(dataset) == 'VQA':
            return True
        return False

    def prepare_inputs(self, inputs):
        input_msgs = []
        if self.system_prompt is not None:
            input_msgs.append(dict(role='system', content=self.system_prompt))
        has_images = np.sum([x['type'] == 'image' for x in inputs])
        if has_images:
            content_list = []
            for msg in inputs:
                if msg['type'] == 'text':
                    content_list.append(dict(type='text', text=msg['value']))
                elif msg['type'] == 'image':
                    imgbytes = open(msg['value'], 'rb').read()
                    b64 = base64.b64encode(imgbytes).decode('ascii')
                    img_struct = dict(url=f'data:image/jpeg;base64,{b64}')
                    content_list.append(dict(type='image_url', image_url=img_struct))
            input_msgs.append(dict(role='user', content=content_list))
        else:
            assert all([x['type'] == 'text' for x in inputs])
            text = '\n'.join([x['value'] for x in inputs])
            input_msgs.append(dict(role='user', content=text))
        return input_msgs

    def set_dump_image(self, dump_image_func):
        self.dump_image_func = dump_image_func

    def dump_image(self, line, dataset):
        return self.dump_image_func(line)

    def image_first(self, msgs):
        nr_img = 0
        for s in msgs:
            if s['type'] == 'image':
                nr_img += 1

        if nr_img == 1:
            new_msgs = []
            img_msg = None
            for s in msgs:
                if s['type'] == 'text':
                    new_msgs.append(s)
                else:
                    img_msg = s
            new_msgs.insert(0, img_msg)
        else:
            new_msgs = msgs

        return new_msgs

    def build_multi_choice_prompt(self, line, dataset=None):
        question = line['question']
        hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
        if hint is not None:
            question = hint + '\n' + question

        options = {
            cand: line[cand]
            for cand in string.ascii_uppercase
            if cand in line and not pd.isna(line[cand])
        }
        for key, item in options.items():
            question += f'\n{key}. {item}'
        prompt = question

        if len(options):
            prompt += '\n请直接回答选项字母。' if cn_string(prompt) else "\nAnswer with the option's letter from the given choices directly."
        else:
            prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.'

        return prompt

    def build_yorn_prompt(self, line, dataset=None):
        if listinstr(['HallusionBench'], dataset):
            pre_prompt = 'Read the following question carefully, think and solve it step by step.\n\n'
        else:
            pre_prompt = ''
        prompt = pre_prompt + line['question'] + ' Please answer yes or no as the final answer.'
        return prompt

    def build_vqa_prompt(self, line, dataset=None):
        if listinstr(['OCRBench'], dataset):
            pre_prompt = 'Carefully identify the text in the image and answer the question.\n\n'
        else:
            pre_prompt = ''
        if listinstr(['MMVet'], dataset):
            post_prompt = '\nAnswer this question in detail.'
        else:
            post_prompt = ''
        prompt = pre_prompt + line['question'] + post_prompt
        return prompt

    def build_prompt(self, line, dataset=None):
        assert self.use_custom_prompt(dataset)
        assert dataset is None or isinstance(dataset, str)
        tgt_path = self.dump_image(line, dataset)

        if DATASET_TYPE(dataset) == 'MCQ':
            prompt = self.build_multi_choice_prompt(line, dataset)
        elif DATASET_TYPE(dataset) == 'Y/N':
            prompt = self.build_yorn_prompt(line, dataset)
        elif DATASET_TYPE(dataset) == 'VQA':
            prompt = self.build_vqa_prompt(line, dataset)
        else:
            raise RuntimeError(f'Invalid dataset type: {DATASET_TYPE(dataset)}')
        message = []
        message.extend([dict(type='image', value=s) for s in tgt_path])
        message.extend([dict(type='text', value=prompt)])

        # interleave dataset
        if dataset.startswith('MMMU_'):
            from .. import MMMUDataset
            message = MMMUDataset.split_MMMU(message)
            message = self.image_first(message)

        return message

    def generate_inner(self, inputs, **kwargs) -> str:
        input_msgs = self.prepare_inputs(inputs)
        temperature = kwargs.pop('temperature', self.temperature)

        headers = {'Authorization': f'Bearer {self.key}'}
        payload = dict(
            model=self.model,
            messages=input_msgs,
            n=1,
            temperature=temperature,
            **kwargs)
        response = requests.post(self.url, headers=headers, data=json.dumps(payload), timeout=self.timeout * 1.1)
        ret_code = response.status_code
        ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code
        answer = self.fail_msg
        try:
            resp_struct = json.loads(response.text)
            answer = resp_struct['choices'][0]['message']['content'].strip()
        except:
            pass
        return ret_code, answer, response


class TaiyiAPI(TaiyiWrapper):

    def generate(self, message, dataset=None):
        return super(TaiyiAPI, self).generate(message)
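
Note: the snippet below is an illustrative sketch, not part of the committed file. It shows the OpenAI-style request messages that TaiyiWrapper.prepare_inputs assembles from an interleaved VLMEvalKit message. The API key and the generated placeholder image are made up, no request is sent, and the sketch assumes the vlmeval package (with PIL available) is importable.

    # Illustrative sketch (not part of the commit): inspect the request messages that
    # prepare_inputs builds for one image plus one text segment.
    import os, tempfile
    from PIL import Image
    from vlmeval.api.taiyi import TaiyiWrapper

    tmp = tempfile.NamedTemporaryFile(suffix='.jpg', delete=False)
    Image.new('RGB', (4, 4), color='white').save(tmp.name)      # tiny placeholder image

    wrapper = TaiyiWrapper(key='demo-key', verbose=False)        # 'demo-key' is a placeholder
    inputs = [
        dict(type='image', value=tmp.name),
        dict(type='text', value='What is shown in the picture?'),
    ]
    msgs = wrapper.prepare_inputs(inputs)
    # msgs[0]['content'] holds an 'image_url' part (base64 data URL) followed by a 'text' part.
    os.unlink(tmp.name)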
VLMEvalKit/vlmeval/config.py  (new file, 0 → 100644)
from vlmeval.vlm import *
from vlmeval.api import *
from functools import partial

PandaGPT_ROOT = None
MiniGPT4_ROOT = None
TransCore_ROOT = None
Yi_ROOT = None
OmniLMM_ROOT = None
Mini_Gemini_ROOT = None
VXVERSE_ROOT = None
VideoChat2_ROOT = None
VideoChatGPT_ROOT = None
PLLaVA_ROOT = None
RBDash_ROOT = None
LLAVA_V1_7B_MODEL_PTH = 'Please set your local path to LLaVA-7B-v1.1 here, the model weight is obtained by merging LLaVA delta weight based on vicuna-7b-v1.1 in https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md with vicuna-7b-v1.1. '

video_models = {
    'Video-LLaVA-7B': partial(VideoLLaVA, model_path='LanguageBind/Video-LLaVA-7B'),
    'Video-LLaVA-7B-HF': partial(VideoLLaVA_HF, model_path='LanguageBind/Video-LLaVA-7B-hf'),
    'VideoChat2-HD': partial(VideoChat2_HD, model_path='OpenGVLab/VideoChat2_HD_stage4_Mistral_7B',
                             root=VideoChat2_ROOT, config_file='./vlmeval/vlm/video_llm/configs/videochat2_hd.json'),
    'Chat-UniVi-7B': partial(Chatunivi, model_path="Chat-UniVi/Chat-UniVi"),
    'Chat-UniVi-7B-v1.5': partial(Chatunivi, model_path="Chat-UniVi/Chat-UniVi-7B-v1.5"),
    'LLaMA-VID-7B': partial(LLaMAVID, model_path='YanweiLi/llama-vid-7b-full-224-video-fps-1'),
    'Video-ChatGPT': partial(VideoChatGPT, model_path='MBZUAI/Video-ChatGPT-7B', dir_root=VideoChatGPT_ROOT),
    'PLLaVA-7B': partial(PLLaVA, model_path='ermu2001/pllava-7b', dir_root=PLLaVA_ROOT),
    'PLLaVA-13B': partial(PLLaVA, model_path='ermu2001/pllava-13b', dir_root=PLLaVA_ROOT),
    'PLLaVA-34B': partial(PLLaVA, model_path='ermu2001/pllava-34b', dir_root=PLLaVA_ROOT),
}

ungrouped = {
    'TransCore_M': partial(TransCoreM, root=TransCore_ROOT),
    'PandaGPT_13B': partial(PandaGPT, name='PandaGPT_13B', root=PandaGPT_ROOT),
    'flamingov2': partial(OpenFlamingo, name='v2', mpt_pth='anas-awadalla/mpt-7b', ckpt_pth='openflamingo/OpenFlamingo-9B-vitl-mpt7b'),
    'VisualGLM_6b': partial(VisualGLM, model_path='THUDM/visualglm-6b'),
    'mPLUG-Owl2': partial(mPLUG_Owl2, model_path='MAGAer13/mplug-owl2-llama2-7b'),
    'mPLUG-Owl3': partial(mPLUG_Owl3, model_path='mPLUG/mPLUG-Owl3-7B-240728'),
    'emu2_chat': partial(Emu, model_path='BAAI/Emu2-Chat'),
    'OmniLMM_12B': partial(OmniLMM12B, model_path='openbmb/OmniLMM-12B', root=OmniLMM_ROOT),
    'MGM_7B': partial(Mini_Gemini, model_path='YanweiLi/MGM-7B-HD', root=Mini_Gemini_ROOT),
    'Bunny-llama3-8B': partial(BunnyLLama3, model_path='BAAI/Bunny-v1_1-Llama-3-8B-V'),
    'VXVERSE': partial(VXVERSE, model_name='XVERSE-V-13B', root=VXVERSE_ROOT),
    'paligemma-3b-mix-448': partial(PaliGemma, model_path='google/paligemma-3b-mix-448'),
    '360VL-70B': partial(QH_360VL, model_path='qihoo360/360VL-70B'),
    'Llama-3-MixSenseV1_1': partial(LLama3Mixsense, model_path='Zero-Vision/Llama-3-MixSenseV1_1'),
    'Parrot': partial(Parrot, model_path='AIDC-AI/Parrot-7B'),
    'OmChat': partial(OmChat, model_path='omlab/omchat-v2.0-13B-single-beta_hf'),
    'RBDash_72b': partial(RBDash, model_path='RBDash-Team/RBDash-v1.5', root=RBDash_ROOT),
    'Pixtral-12B': partial(Pixtral, model_path='mistralai/Pixtral-12B-2409'),
    'Falcon2-VLM-11B': partial(Falcon2VLM, model_path='tiiuae/falcon-11B-vlm')
}

api_models = {
    # GPT
    'GPT4V': partial(GPT4V, model='gpt-4-1106-vision-preview', temperature=0, img_size=512, img_detail='low', retry=10, verbose=False),
    'GPT4V_HIGH': partial(GPT4V, model='gpt-4-1106-vision-preview', temperature=0, img_size=-1, img_detail='high', retry=10, verbose=False),
    'GPT4V_20240409': partial(GPT4V, model='gpt-4-turbo-2024-04-09', temperature=0, img_size=512, img_detail='low', retry=10, verbose=False),
    'GPT4V_20240409_HIGH': partial(GPT4V, model='gpt-4-turbo-2024-04-09', temperature=0, img_size=-1, img_detail='high', retry=10, verbose=False),
    'GPT4o': partial(GPT4V, model='gpt-4o-2024-05-13', temperature=0, img_size=512, img_detail='low', retry=10, verbose=False),
    'GPT4o_HIGH': partial(GPT4V, model='gpt-4o-2024-05-13', temperature=0, img_size=-1, img_detail='high', retry=10, verbose=False),
    'GPT4o_20240806': partial(GPT4V, model='gpt-4o-2024-08-06', temperature=0, img_size=-1, img_detail='high', retry=10, verbose=False),
    'GPT4o_20241120': partial(GPT4V, model='gpt-4o-2024-11-20', temperature=0, img_size=-1, img_detail='high', retry=10, verbose=False),
    'GPT4o_MINI': partial(GPT4V, model='gpt-4o-mini-2024-07-18', temperature=0, img_size=-1, img_detail='high', retry=10, verbose=False),
    # Gemini
    'GeminiPro1-0': partial(GeminiProVision, model='gemini-1.0-pro', temperature=0, retry=10),  # now GeminiPro1-0 is only supported by vertex backend
    'GeminiPro1-5': partial(GeminiProVision, model='gemini-1.5-pro', temperature=0, retry=10),
    'GeminiFlash1-5': partial(GeminiProVision, model='gemini-1.5-flash', temperature=0, retry=10),
    'GeminiFlash2-0': partial(GeminiProVision, model='gemini-2.0-flash-exp', temperature=0, retry=10),
    'GeminiPro1-5-002': partial(GPT4V, model='gemini-1.5-pro-002', temperature=0, retry=10),  # Internal Use Only
    'GeminiFlash1-5-002': partial(GPT4V, model='gemini-1.5-flash-002', temperature=0, retry=10),  # Internal Use Only
    # Qwen-VL
    'QwenVLPlus': partial(QwenVLAPI, model='qwen-vl-plus', temperature=0, retry=10),
    'QwenVLMax': partial(QwenVLAPI, model='qwen-vl-max', temperature=0, retry=10),
    # Reka
    'RekaEdge': partial(Reka, model='reka-edge-20240208'),
    'RekaFlash': partial(Reka, model='reka-flash-20240226'),
    'RekaCore': partial(Reka, model='reka-core-20240415'),
    # Step1V
    'Step1V': partial(GPT4V, model='step-1v-32k', api_base="https://api.stepfun.com/v1/chat/completions", temperature=0, retry=10, img_size=-1, img_detail='high'),
    'Step1.5V-mini': partial(GPT4V, model='step-1.5v-mini', api_base="https://api.stepfun.com/v1/chat/completions", temperature=0, retry=10, img_size=-1, img_detail='high'),
    # Yi-Vision
    'Yi-Vision': partial(GPT4V, model='yi-vision', api_base="https://api.lingyiwanwu.com/v1/chat/completions", temperature=0, retry=10),
    # Claude
    'Claude3V_Opus': partial(Claude3V, model='claude-3-opus-20240229', temperature=0, retry=10, verbose=False),
    'Claude3V_Sonnet': partial(Claude3V, model='claude-3-sonnet-20240229', temperature=0, retry=10, verbose=False),
    'Claude3V_Haiku': partial(Claude3V, model='claude-3-haiku-20240307', temperature=0, retry=10, verbose=False),
    'Claude3-5V_Sonnet': partial(Claude3V, model='claude-3-5-sonnet-20240620', temperature=0, retry=10, verbose=False),
    'Claude3-5V_Sonnet_20241022': partial(Claude3V, model='claude-3-5-sonnet-20241022', temperature=0, retry=10, verbose=False),
    # GLM4V
    'GLM4V': partial(GLMVisionAPI, model='glm4v-biz-eval', temperature=0, retry=10),
    'GLM4V_PLUS': partial(GLMVisionAPI, model='cogvlm-evaluation-241203', temperature=0, retry=10),
    # MiniMax abab
    'abab6.5s': partial(GPT4V, model='abab6.5s-chat', api_base='https://api.minimax.chat/v1/chat/completions', temperature=0, retry=10),
    'abab7-preview': partial(GPT4V, model='abab7-chat-preview', api_base='https://api.minimax.chat/v1/chat/completions', temperature=0, retry=10),
    # CongRong
    'CloudWalk': partial(CWWrapper, model='cw-congrong-v1.5', temperature=0, retry=10),
    # SenseChat-V
    'SenseChat-Vision': partial(SenseChatVisionAPI, model='SenseChat-Vision', temperature=0, retry=10),
    'HunYuan-Vision': partial(HunyuanVision, model='hunyuan-vision', temperature=0, retry=10),
    'bailingMM': partial(bailingMMAPI, model='bailingMM-mini', temperature=0, retry=10),
    # BlueLM-V
    "BlueLM_V": partial(BlueLM_V_API, model='BlueLM-VL-v3.0', temperature=0, retry=10),
    # JiuTian-VL
    "JTVL": partial(JTVLChatAPI, model='jt-vl-chat', temperature=0, retry=10),
    "Taiyi": partial(TaiyiAPI, model='taiyi', temperature=0, retry=10),
    # TeleMM
    'TeleMM': partial(TeleMMAPI, model='TeleAI/TeleMM', temperature=0, retry=10),
    # lmdeploy api
    'lmdeploy': partial(LMDeployAPI, api_base='http://0.0.0.0:23333/v1/chat/completions', temperature=0, retry=10),
    # Taichu-VL
    'Taichu-VL-2B': partial(TaichuVLAPI, model='Taichu-VL-2B', url='https://platform.wair.ac.cn/api/v1/infer/10381/v1/chat/completions'),
}

mmalaya_series = {
    'MMAlaya': partial(MMAlaya, model_path='DataCanvas/MMAlaya'),
    'MMAlaya2': partial(MMAlaya2, model_path='DataCanvas/MMAlaya2'),
}

minicpm_series = {
    'MiniCPM-V': partial(MiniCPM_V, model_path='openbmb/MiniCPM-V'),
    'MiniCPM-V-2': partial(MiniCPM_V, model_path='openbmb/MiniCPM-V-2'),
    'MiniCPM-Llama3-V-2_5': partial(MiniCPM_Llama3_V, model_path='openbmb/MiniCPM-Llama3-V-2_5'),
    'MiniCPM-V-2_6': partial(MiniCPM_V_2_6, model_path='openbmb/MiniCPM-V-2_6'),
}

xtuner_series = {
    'llava-internlm2-7b': partial(LLaVA_XTuner, llm_path='internlm/internlm2-chat-7b', llava_path='xtuner/llava-internlm2-7b', visual_select_layer=-2, prompt_template='internlm2_chat'),
    'llava-internlm2-20b': partial(LLaVA_XTuner, llm_path='internlm/internlm2-chat-20b', llava_path='xtuner/llava-internlm2-20b', visual_select_layer=-2, prompt_template='internlm2_chat'),
    'llava-internlm-7b': partial(LLaVA_XTuner, llm_path='internlm/internlm-chat-7b', llava_path='xtuner/llava-internlm-7b', visual_select_layer=-2, prompt_template='internlm_chat'),
    'llava-v1.5-7b-xtuner': partial(LLaVA_XTuner, llm_path='lmsys/vicuna-7b-v1.5', llava_path='xtuner/llava-v1.5-7b-xtuner', visual_select_layer=-2, prompt_template='vicuna'),
    'llava-v1.5-13b-xtuner': partial(LLaVA_XTuner, llm_path='lmsys/vicuna-13b-v1.5', llava_path='xtuner/llava-v1.5-13b-xtuner', visual_select_layer=-2, prompt_template='vicuna'),
    'llava-llama-3-8b': partial(LLaVA_XTuner, llm_path='xtuner/llava-llama-3-8b-v1_1', llava_path='xtuner/llava-llama-3-8b-v1_1', visual_select_layer=-2, prompt_template='llama3_chat'),
}

qwen_series = {
    'qwen_base': partial(QwenVL, model_path='Qwen/Qwen-VL'),
    'qwen_chat': partial(QwenVLChat, model_path='Qwen/Qwen-VL-Chat'),
    'monkey': partial(Monkey, model_path='echo840/Monkey'),
    'monkey-chat': partial(MonkeyChat, model_path='echo840/Monkey-Chat'),
    'minimonkey': partial(MiniMonkey, model_path='mx262/MiniMonkey')
}

llava_series = {
    'llava_v1.5_7b': partial(LLaVA, model_path='liuhaotian/llava-v1.5-7b'),
    'llava_v1.5_13b': partial(LLaVA, model_path='liuhaotian/llava-v1.5-13b'),
    'llava_v1_7b': partial(LLaVA, model_path=LLAVA_V1_7B_MODEL_PTH),
    'sharegpt4v_7b': partial(LLaVA, model_path='Lin-Chen/ShareGPT4V-7B'),
    'sharegpt4v_13b': partial(LLaVA, model_path='Lin-Chen/ShareGPT4V-13B'),
    'llava_next_vicuna_7b': partial(LLaVA_Next, model_path='llava-hf/llava-v1.6-vicuna-7b-hf'),
    'llava_next_vicuna_13b': partial(LLaVA_Next, model_path='llava-hf/llava-v1.6-vicuna-13b-hf'),
    'llava_next_mistral_7b': partial(LLaVA_Next, model_path='llava-hf/llava-v1.6-mistral-7b-hf'),
    'llava_next_yi_34b': partial(LLaVA_Next, model_path='llava-hf/llava-v1.6-34b-hf'),
    'llava_next_llama3': partial(LLaVA_Next, model_path='llava-hf/llama3-llava-next-8b-hf'),
    'llava_next_72b': partial(LLaVA_Next, model_path='llava-hf/llava-next-72b-hf'),
    'llava_next_110b': partial(LLaVA_Next, model_path='llava-hf/llava-next-110b-hf'),
    'llava_next_qwen_32b': partial(LLaVA_Next2, model_path='lmms-lab/llava-next-qwen-32b'),
    'llava_next_interleave_7b': partial(LLaVA_Next, model_path='llava-hf/llava-interleave-qwen-7b-hf'),
    'llava_next_interleave_7b_dpo': partial(LLaVA_Next, model_path='llava-hf/llava-interleave-qwen-7b-dpo-hf'),
    'llava-onevision-qwen2-0.5b-ov-hf': partial(LLaVA_OneVision_HF, model_path='llava-hf/llava-onevision-qwen2-0.5b-ov-hf'),
    'llava-onevision-qwen2-0.5b-si-hf': partial(LLaVA_OneVision_HF, model_path='llava-hf/llava-onevision-qwen2-0.5b-si-hf'),
    'llava-onevision-qwen2-7b-ov-hf': partial(LLaVA_OneVision_HF, model_path='llava-hf/llava-onevision-qwen2-7b-ov-hf'),
    'llava-onevision-qwen2-7b-si-hf': partial(LLaVA_OneVision_HF, model_path='llava-hf/llava-onevision-qwen2-7b-si-hf'),
    'llava_onevision_qwen2_0.5b_si': partial(LLaVA_OneVision, model_path='lmms-lab/llava-onevision-qwen2-0.5b-si'),
    'llava_onevision_qwen2_7b_si': partial(LLaVA_OneVision, model_path='lmms-lab/llava-onevision-qwen2-7b-si'),
    'llava_onevision_qwen2_72b_si': partial(LLaVA_OneVision, model_path='lmms-lab/llava-onevision-qwen2-72b-si'),
    'llava_onevision_qwen2_0.5b_ov': partial(LLaVA_OneVision, model_path='lmms-lab/llava-onevision-qwen2-0.5b-ov'),
    'llava_onevision_qwen2_7b_ov': partial(LLaVA_OneVision, model_path='lmms-lab/llava-onevision-qwen2-7b-ov'),
    'llava_onevision_qwen2_72b_ov': partial(LLaVA_OneVision, model_path='lmms-lab/llava-onevision-qwen2-72b-ov-sft'),
    'Aquila-VL-2B': partial(LLaVA_OneVision, model_path='BAAI/Aquila-VL-2B-llava-qwen'),
    'llava_video_qwen2_7b': partial(LLaVA_OneVision, model_path='lmms-lab/LLaVA-Video-7B-Qwen2'),
    'llava_video_qwen2_72b': partial(LLaVA_OneVision, model_path='lmms-lab/LLaVA-Video-72B-Qwen2'),
    'varco-vision-hf': partial(LLaVA_OneVision_HF, model_path='NCSOFT/VARCO-VISION-14B-HF'),
}

internvl_series = {
    'InternVL-Chat-V1-1': partial(InternVLChat, model_path='OpenGVLab/InternVL-Chat-V1-1', version='V1.1'),
    'InternVL-Chat-V1-2': partial(InternVLChat, model_path='OpenGVLab/InternVL-Chat-V1-2', version='V1.2'),
    'InternVL-Chat-V1-2-Plus': partial(InternVLChat, model_path='OpenGVLab/InternVL-Chat-V1-2-Plus', version='V1.2'),
    # InternVL1.5 series
    'InternVL-Chat-V1-5': partial(InternVLChat, model_path='OpenGVLab/InternVL-Chat-V1-5', version='V1.5'),
    'Mini-InternVL-Chat-2B-V1-5': partial(InternVLChat, model_path='OpenGVLab/Mini-InternVL-Chat-2B-V1-5', version='V1.5'),
    'Mini-InternVL-Chat-4B-V1-5': partial(InternVLChat, model_path='OpenGVLab/Mini-InternVL-Chat-4B-V1-5', version='V1.5'),
    # InternVL2 series
    'InternVL2-1B': partial(InternVLChat, model_path='OpenGVLab/InternVL2-1B', version='V2.0'),
    'InternVL2-2B': partial(InternVLChat, model_path='OpenGVLab/InternVL2-2B', version='V2.0'),
    'InternVL2-4B': partial(InternVLChat, model_path='OpenGVLab/InternVL2-4B', version='V2.0'),
    'InternVL2-8B': partial(InternVLChat, model_path='OpenGVLab/InternVL2-8B', version='V2.0'),
    'InternVL2-26B': partial(InternVLChat, model_path='OpenGVLab/InternVL2-26B', version='V2.0'),
    'InternVL2-40B': partial(InternVLChat, model_path='OpenGVLab/InternVL2-40B', version='V2.0'),
    'InternVL2-76B': partial(InternVLChat, model_path='OpenGVLab/InternVL2-Llama3-76B', version='V2.0'),
    # InternVL2 MPO series
    'InternVL2-8B-MPO': partial(InternVLChat, model_path='OpenGVLab/InternVL2-8B-MPO', version='V2.0'),
    'InternVL2-8B-MPO-CoT': partial(InternVLChat, model_path='OpenGVLab/InternVL2-8B-MPO', version='V2.0', use_mpo_prompt=True),
    # InternVL2.5 series
    'InternVL2_5-1B': partial(InternVLChat, model_path='OpenGVLab/InternVL2_5-1B', version='V2.0'),
    'InternVL2_5-2B': partial(InternVLChat, model_path='OpenGVLab/InternVL2_5-2B', version='V2.0'),
    'InternVL2_5-4B': partial(InternVLChat, model_path='OpenGVLab/InternVL2_5-4B', version='V2.0'),
    'InternVL2_5-8B': partial(InternVLChat, model_path='OpenGVLab/InternVL2_5-8B', version='V2.0'),
    'InternVL2_5-26B': partial(InternVLChat, model_path='OpenGVLab/InternVL2_5-26B', version='V2.0'),
    'InternVL2_5-38B': partial(InternVLChat, model_path='OpenGVLab/InternVL2_5-38B', version='V2.0'),
    'InternVL2_5-78B': partial(InternVLChat, model_path='OpenGVLab/InternVL2_5-78B', version='V2.0'),
}

sail_series = {
    'SAIL-VL-2B': partial(SailVL, model_path='BytedanceDouyinContent/SAIL-VL-2B')
}

yivl_series = {
    'Yi_VL_6B': partial(Yi_VL, model_path='01-ai/Yi-VL-6B', root=Yi_ROOT),
    'Yi_VL_34B': partial(Yi_VL, model_path='01-ai/Yi-VL-34B', root=Yi_ROOT),
}

xcomposer_series = {
    'XComposer': partial(XComposer, model_path='internlm/internlm-xcomposer-vl-7b'),
    'sharecaptioner': partial(ShareCaptioner, model_path='Lin-Chen/ShareCaptioner'),
    'XComposer2': partial(XComposer2, model_path='internlm/internlm-xcomposer2-vl-7b'),
    'XComposer2_1.8b': partial(XComposer2, model_path='internlm/internlm-xcomposer2-vl-1_8b'),
    'XComposer2_4KHD': partial(XComposer2_4KHD, model_path='internlm/internlm-xcomposer2-4khd-7b'),
    'XComposer2d5': partial(XComposer2d5, model_path='internlm/internlm-xcomposer2d5-7b'),
}

minigpt4_series = {
    'MiniGPT-4-v2': partial(MiniGPT4, mode='v2', root=MiniGPT4_ROOT),
    'MiniGPT-4-v1-7B': partial(MiniGPT4, mode='v1_7b', root=MiniGPT4_ROOT),
    'MiniGPT-4-v1-13B': partial(MiniGPT4, mode='v1_13b', root=MiniGPT4_ROOT),
}

idefics_series = {
    'idefics_9b_instruct': partial(IDEFICS, model_path='HuggingFaceM4/idefics-9b-instruct'),
    'idefics_80b_instruct': partial(IDEFICS, model_path='HuggingFaceM4/idefics-80b-instruct'),
    'idefics2_8b': partial(IDEFICS2, model_path='HuggingFaceM4/idefics2-8b'),
    # Idefics3 follows Idefics2 Pattern
    'Idefics3-8B-Llama3': partial(IDEFICS2, model_path='HuggingFaceM4/Idefics3-8B-Llama3'),
}

smolvlm_series = {
    'SmolVLM': partial(SmolVLM, model_path='HuggingFaceTB/SmolVLM-Instruct'),
    'SmolVLM-DPO': partial(SmolVLM, model_path='HuggingFaceTB/SmolVLM-Instruct-DPO'),
    'SmolVLM-Synthetic': partial(SmolVLM, model_path='HuggingFaceTB/SmolVLM-Instruct'),
}

instructblip_series = {
    'instructblip_7b': partial(InstructBLIP, name='instructblip_7b'),
    'instructblip_13b': partial(InstructBLIP, name='instructblip_13b'),
}

deepseekvl_series = {
    'deepseek_vl_7b': partial(DeepSeekVL, model_path='deepseek-ai/deepseek-vl-7b-chat'),
    'deepseek_vl_1.3b': partial(DeepSeekVL, model_path='deepseek-ai/deepseek-vl-1.3b-chat'),
}

janus_series = {
    'Janus-1.3B': partial(Janus, model_path='deepseek-ai/Janus-1.3B')
}

cogvlm_series = {
    'cogvlm-grounding-generalist': partial(CogVlm, model_path='THUDM/cogvlm-grounding-generalist-hf', tokenizer_name='lmsys/vicuna-7b-v1.5'),
    'cogvlm-chat': partial(CogVlm, model_path='THUDM/cogvlm-chat-hf', tokenizer_name='lmsys/vicuna-7b-v1.5'),
    'cogvlm2-llama3-chat-19B': partial(CogVlm, model_path='THUDM/cogvlm2-llama3-chat-19B'),
    'glm-4v-9b': partial(GLM4v, model_path='THUDM/glm-4v-9b')
}

wemm_series = {
    'WeMM': partial(WeMM, model_path='feipengma/WeMM'),
}

cambrian_series = {
    'cambrian_8b': partial(Cambrian, model_path='nyu-visionx/cambrian-8b'),
    'cambrian_13b': partial(Cambrian, model_path='nyu-visionx/cambrian-13b'),
    'cambrian_34b': partial(Cambrian, model_path='nyu-visionx/cambrian-34b'),
}

chameleon_series = {
    'chameleon_7b': partial(Chameleon, model_path='facebook/chameleon-7b'),
    'chameleon_30b': partial(Chameleon, model_path='facebook/chameleon-30b'),
}

vila_series = {
    'VILA1.5-3b': partial(VILA, model_path='Efficient-Large-Model/VILA1.5-3b'),
    'Llama-3-VILA1.5-8b': partial(VILA, model_path='Efficient-Large-Model/Llama-3-VILA1.5-8b'),
    'VILA1.5-13b': partial(VILA, model_path='Efficient-Large-Model/VILA1.5-13b'),
    'VILA1.5-40b': partial(VILA, model_path='Efficient-Large-Model/VILA1.5-40b'),
}

ovis_series = {
    'Ovis1.5-Llama3-8B': partial(Ovis, model_path='AIDC-AI/Ovis1.5-Llama3-8B'),
    'Ovis1.5-Gemma2-9B': partial(Ovis, model_path='AIDC-AI/Ovis1.5-Gemma2-9B'),
    'Ovis1.6-Gemma2-9B': partial(Ovis1_6, model_path='AIDC-AI/Ovis1.6-Gemma2-9B'),
    'Ovis1.6-Llama3.2-3B': partial(Ovis1_6, model_path='AIDC-AI/Ovis1.6-Llama3.2-3B'),
    'Ovis1.6-Gemma2-27B': partial(Ovis1_6_Plus, model_path='AIDC-AI/Ovis1.6-Gemma2-27B')
}

mantis_series = {
    'Mantis-8B-siglip-llama3': partial(Mantis, model_path='TIGER-Lab/Mantis-8B-siglip-llama3'),
    'Mantis-8B-clip-llama3': partial(Mantis, model_path='TIGER-Lab/Mantis-8B-clip-llama3'),
    'Mantis-8B-Idefics2': partial(Mantis, model_path='TIGER-Lab/Mantis-8B-Idefics2'),
    'Mantis-8B-Fuyu': partial(Mantis, model_path='TIGER-Lab/Mantis-8B-Fuyu')
}

phi3_series = {
    'Phi-3-Vision': partial(Phi3Vision, model_path='microsoft/Phi-3-vision-128k-instruct'),
    'Phi-3.5-Vision': partial(Phi3_5Vision, model_path='microsoft/Phi-3.5-vision-instruct')
}

xgen_mm_series = {
    'xgen-mm-phi3-interleave-r-v1.5': partial(XGenMM, model_path='Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5'),
    'xgen-mm-phi3-dpo-r-v1.5': partial(XGenMM, model_path='Salesforce/xgen-mm-phi3-mini-instruct-dpo-r-v1.5'),
}

qwen2vl_series = {
    'Qwen-VL-Max-0809': partial(Qwen2VLAPI, model='qwen-vl-max-0809', min_pixels=1280 * 28 * 28, max_pixels=16384 * 28 * 28),
    'Qwen-VL-Plus-0809': partial(Qwen2VLAPI, model='qwen-vl-plus-0809', min_pixels=1280 * 28 * 28, max_pixels=16384 * 28 * 28),
    'Qwen2-VL-72B-Instruct': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-72B-Instruct', min_pixels=1280 * 28 * 28, max_pixels=16384 * 28 * 28),
    'Qwen2-VL-7B-Instruct': partial(Qwen2VLChat, model_path='/home/luopl1/Qwen2-VL-7B-Instruct', min_pixels=1280 * 28 * 28, max_pixels=16384 * 28 * 28),
    'Qwen2-VL-7B-Instruct-AWQ': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-7B-Instruct-AWQ', min_pixels=1280 * 28 * 28, max_pixels=16384 * 28 * 28),
    'Qwen2-VL-7B-Instruct-GPTQ-Int4': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4', min_pixels=1280 * 28 * 28, max_pixels=16384 * 28 * 28),
    'Qwen2-VL-7B-Instruct-GPTQ-Int8': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8', min_pixels=1280 * 28 * 28, max_pixels=16384 * 28 * 28),
    'Qwen2-VL-2B-Instruct': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-2B-Instruct', min_pixels=1280 * 28 * 28, max_pixels=16384 * 28 * 28),
    'Qwen2-VL-2B-Instruct-AWQ': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-2B-Instruct-AWQ', min_pixels=1280 * 28 * 28, max_pixels=16384 * 28 * 28),
    'Qwen2-VL-2B-Instruct-GPTQ-Int4': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4', min_pixels=1280 * 28 * 28, max_pixels=16384 * 28 * 28),
    'Qwen2-VL-2B-Instruct-GPTQ-Int8': partial(Qwen2VLChat, model_path='Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8', min_pixels=1280 * 28 * 28, max_pixels=16384 * 28 * 28),
    'XinYuan-VL-2B-Instruct': partial(Qwen2VLChat, model_path='Cylingo/Xinyuan-VL-2B', min_pixels=1280 * 28 * 28, max_pixels=16384 * 28 * 28),
}

slime_series = {
    'Slime-7B': partial(SliME, model_path='yifanzhang114/SliME-vicuna-7B'),
    'Slime-8B': partial(SliME, model_path='yifanzhang114/SliME-Llama3-8B'),
    'Slime-13B': partial(SliME, model_path='yifanzhang114/SliME-vicuna-13B'),
}

eagle_series = {
    'Eagle-X4-8B-Plus': partial(Eagle, model_path='NVEagle/Eagle-X4-8B-Plus'),
    'Eagle-X4-13B-Plus': partial(Eagle, model_path='NVEagle/Eagle-X4-13B-Plus'),
    'Eagle-X5-7B': partial(Eagle, model_path='NVEagle/Eagle-X5-7B'),
    'Eagle-X5-13B': partial(Eagle, model_path='NVEagle/Eagle-X5-13B'),
    'Eagle-X5-13B-Chat': partial(Eagle, model_path='NVEagle/Eagle-X5-13B-Chat'),
    'Eagle-X5-34B-Chat': partial(Eagle, model_path='NVEagle/Eagle-X5-34B-Chat'),
    'Eagle-X5-34B-Plus': partial(Eagle, model_path='NVEagle/Eagle-X5-34B-Plus'),
}

moondream_series = {
    'Moondream1': partial(Moondream1, model_path='vikhyatk/moondream1'),
    'Moondream2': partial(Moondream2, model_path='vikhyatk/moondream2'),
}

llama_series = {
    'Llama-3.2-11B-Vision-Instruct': partial(llama_vision, model_path='meta-llama/Llama-3.2-11B-Vision-Instruct'),
    'LLaVA-CoT': partial(llama_vision, model_path='Xkev/Llama-3.2V-11B-cot'),
    'Llama-3.2-90B-Vision-Instruct': partial(llama_vision, model_path='meta-llama/Llama-3.2-90B-Vision-Instruct'),
}

molmo_series = {
    'molmoE-1B-0924': partial(molmo, model_path='allenai/MolmoE-1B-0924'),
    'molmo-7B-D-0924': partial(molmo, model_path='allenai/Molmo-7B-D-0924'),
    'molmo-7B-O-0924': partial(molmo, model_path='allenai/Molmo-7B-O-0924'),
    'molmo-72B-0924': partial(molmo, model_path='allenai/Molmo-72B-0924'),
}

kosmos_series = {
    'Kosmos2': partial(Kosmos2, model_path='microsoft/kosmos-2-patch14-224')
}

points_series = {
    'POINTS-Yi-1.5-9B-Chat': partial(POINTS, model_path='WePOINTS/POINTS-Yi-1-5-9B-Chat'),
    'POINTS-Qwen-2.5-7B-Chat': partial(POINTS, model_path='WePOINTS/POINTS-Qwen-2-5-7B-Chat'),
    'POINTSV15-Qwen-2.5-7B-Chat': partial(POINTSV15, model_path='WePOINTS/POINTS-1-5-Qwen-2-5-7B-Chat'),
}

nvlm_series = {
    'NVLM': partial(NVLM, model_path='nvidia/NVLM-D-72B'),
}

vintern_series = {
    'Vintern-3B-beta': partial(VinternChat, model_path='5CD-AI/Vintern-3B-beta'),
    'Vintern-1B-v2': partial(VinternChat, model_path='5CD-AI/Vintern-1B-v2'),
}

aria_series = {
    "Aria": partial(Aria, model_path='rhymes-ai/Aria')
}

h2ovl_series = {
    'h2ovl-mississippi-2b': partial(H2OVLChat, model_path='h2oai/h2ovl-mississippi-2b'),
    'h2ovl-mississippi-1b': partial(H2OVLChat, model_path='h2oai/h2ovl-mississippi-800m'),
}

valley_series = {
    'valley_eagle': partial(ValleyEagleChat, model_path='bytedance-research/Valley-Eagle-7B'),
}

supported_VLM = {}

model_groups = [
    ungrouped, api_models, xtuner_series, qwen_series, llava_series, internvl_series, yivl_series,
    xcomposer_series, minigpt4_series, idefics_series, instructblip_series, deepseekvl_series,
    janus_series, minicpm_series, cogvlm_series, wemm_series, cambrian_series, chameleon_series,
    video_models, ovis_series, vila_series, mantis_series, mmalaya_series, phi3_series,
    xgen_mm_series, qwen2vl_series, slime_series, eagle_series, moondream_series, llama_series,
    molmo_series, kosmos_series, points_series, nvlm_series, vintern_series, h2ovl_series,
    aria_series, smolvlm_series, sail_series, valley_series
]

for grp in model_groups:
    supported_VLM.update(grp)
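
Note: the snippet below is an illustrative sketch, not part of the committed file. It shows how the supported_VLM registry is meant to be consumed: each value is a functools.partial constructor, so instantiation is a lookup plus a call. It assumes the standard VLMEvalKit model interface generate(message, dataset=None) for Qwen2VLChat (not shown in this commit excerpt), that the model weights can be fetched or are available locally, and the prompt is a made-up example.

    # Illustrative sketch (not part of the commit): look up a registered model and query it.
    # Instantiating 'Qwen2-VL-2B-Instruct' loads the Qwen2VLChat weights, which may trigger a download.
    from vlmeval.config import supported_VLM

    model = supported_VLM['Qwen2-VL-2B-Instruct']()   # calls the partial with the preset pixel budget
    message = [dict(type='text', value='Introduce yourself in one sentence.')]   # hypothetical prompt
    print(model.generate(message))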
VLMEvalKit/vlmeval/dataset/__init__.py  (new file, 0 → 100644)
import warnings

from .image_base import img_root_map, ImageBaseDataset
from .image_caption import ImageCaptionDataset
from .image_yorn import ImageYORNDataset
from .image_mcq import (
    ImageMCQDataset, MMMUDataset, CustomMCQDataset, MUIRDataset,
    GMAIMMBenchDataset, MMERealWorld, HRBenchDataset, NaturalBenchDataset
)
from .image_mt import MMDUDataset
from .image_vqa import (
    ImageVQADataset, MathVision, OCRBench, MathVista, LLaVABench, MMVet, MTVQADataset,
    TableVQABench, CustomVQADataset, CRPE, MathVerse, OlympiadBench, QSpatial, VizWiz, MMNIAH
)
from .text_mcq import CustomTextMCQDataset, TextMCQDataset
from .vcr import VCRDataset
from .mmlongbench import MMLongBench
from .dude import DUDE
from .slidevqa import SlideVQA
from .mmbench_video import MMBenchVideo
from .videomme import VideoMME
from .mvbench import MVBench, MVBench_MP4
from .mlvu import MLVU, MLVU_MCQ, MLVU_OpenEnded
from .tempcompass import TempCompass, TempCompass_Captioning, TempCompass_MCQ, TempCompass_YorN
from .longvideobench import LongVideoBench
from .video_concat_dataset import ConcatVideoDataset
from .mmgenbench import MMGenBench
from .miabench import MIABench
from .cmmmu import CMMMU
from .wildvision import WildVision
from .mmmath import MMMath
from .dynamath import Dynamath
from .utils import *
from .video_dataset_config import *
from ..smp import *


class ConcatDataset(ImageBaseDataset):
    # This dataset takes multiple dataset names as input and aggregates them into a single dataset.
    # Each single dataset should not have a field named `SUB_DATASET`

    DATASET_SETS = {
        'MMMB': ['MMMB_ar', 'MMMB_cn', 'MMMB_en', 'MMMB_pt', 'MMMB_ru', 'MMMB_tr'],
        'MTL_MMBench_DEV': [
            'MMBench_dev_ar', 'MMBench_dev_cn', 'MMBench_dev_en',
            'MMBench_dev_pt', 'MMBench_dev_ru', 'MMBench_dev_tr'
        ]
    }

    def __init__(self, dataset):
        datasets = self.DATASET_SETS[dataset]
        self.dataset_map = {}
        # The name of the compilation
        self.dataset_name = dataset
        self.datasets = datasets
        for dname in datasets:
            dataset = build_dataset(dname)
            assert dataset is not None, dataset
            self.dataset_map[dname] = dataset
        TYPES = [x.TYPE for x in self.dataset_map.values()]
        MODALITIES = [x.MODALITY for x in self.dataset_map.values()]
        assert np.all([x == TYPES[0] for x in TYPES]), (datasets, TYPES)
        assert np.all([x == MODALITIES[0] for x in MODALITIES]), (datasets, MODALITIES)
        self.TYPE = TYPES[0]
        self.MODALITY = MODALITIES[0]
        data_all = []
        for dname in datasets:
            data = self.dataset_map[dname].data
            data['SUB_DATASET'] = [dname] * len(data)
            data_new = localize_df(data, dname, nproc=16)
            data_all.append(data_new)

        data = pd.concat(data_all)
        data['original_index'] = data.pop('index')
        data['index'] = np.arange(len(data))
        self.data = data

    def build_prompt(self, line):
        if isinstance(line, int):
            line = self.data.iloc[line]
        idx = line['original_index']
        dname = line['SUB_DATASET']
        org_data = self.dataset_map[dname].data
        org_line = cp.deepcopy(org_data[org_data['index'] == idx]).iloc[0]
        return self.dataset_map[dname].build_prompt(org_line)

    def dump_image(self, line):
        # Assert all images are pre-dumped
        assert 'image' not in line
        assert 'image_path' in line
        tgt_path = toliststr(line['image_path'])
        return tgt_path

    @classmethod
    def supported_datasets(cls):
        return list(cls.DATASET_SETS)

    def evaluate(self, eval_file, **judge_kwargs):
        suffix = eval_file.split('.')[-1]
        # First, split the eval_file by dataset
        data_all = load(eval_file)
        for dname in self.datasets:
            tgt = eval_file.replace(self.dataset_name, dname)
            data_sub = data_all[data_all['SUB_DATASET'] == dname]
            data_sub.pop('index')
            data_sub['index'] = data_sub.pop('original_index')
            data_sub.pop('SUB_DATASET')
            dump(data_sub, tgt)
        # Then, evaluate each dataset separately
        results_all = []
        for dname in self.datasets:
            tgt = eval_file.replace(self.dataset_name, dname)
            res = self.dataset_map[dname].evaluate(tgt, **judge_kwargs)
            assert isinstance(res, pd.DataFrame)
            res['DATASET'] = [dname] * len(res)
            results_all.append(res)
        result = pd.concat(results_all)
        score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
        dump(result, score_file)
        return result


# Add new supported dataset class here
IMAGE_DATASET = [
    ImageCaptionDataset, ImageYORNDataset, ImageMCQDataset, ImageVQADataset, MathVision,
    MMMUDataset, OCRBench, MathVista, LLaVABench, MMVet, MTVQADataset, TableVQABench,
    MMLongBench, VCRDataset, MMDUDataset, DUDE, SlideVQA, MUIRDataset, GMAIMMBenchDataset,
    MMERealWorld, HRBenchDataset, CRPE, MathVerse, NaturalBenchDataset, MIABench,
    OlympiadBench, WildVision, MMMath, QSpatial, Dynamath, MMGenBench, VizWiz, MMNIAH, CMMMU
]

VIDEO_DATASET = [
    MMBenchVideo, VideoMME, MVBench, MVBench_MP4, LongVideoBench,
    MLVU, MLVU_MCQ, MLVU_OpenEnded,
    TempCompass, TempCompass_MCQ, TempCompass_Captioning, TempCompass_YorN
]

TEXT_DATASET = [
    TextMCQDataset
]

CUSTOM_DATASET = [
    CustomMCQDataset, CustomVQADataset, CustomTextMCQDataset
]

DATASET_COLLECTION = [
    ConcatDataset, ConcatVideoDataset
]

DATASET_CLASSES = IMAGE_DATASET + VIDEO_DATASET + TEXT_DATASET + CUSTOM_DATASET + DATASET_COLLECTION
SUPPORTED_DATASETS = []
for DATASET_CLS in DATASET_CLASSES:
    SUPPORTED_DATASETS.extend(DATASET_CLS.supported_datasets())


def DATASET_TYPE(dataset, *, default: str = 'MCQ') -> str:
    for cls in DATASET_CLASSES:
        if dataset in cls.supported_datasets():
            if hasattr(cls, 'TYPE'):
                return cls.TYPE
    # Have to add specific routine to handle ConcatDataset
    if dataset in ConcatDataset.DATASET_SETS:
        dataset_list = ConcatDataset.DATASET_SETS[dataset]
        TYPES = [DATASET_TYPE(dname) for dname in dataset_list]
        assert np.all([x == TYPES[0] for x in TYPES]), (dataset_list, TYPES)
        return TYPES[0]

    if 'openended' in dataset.lower():
        return 'VQA'
    warnings.warn(f'Dataset {dataset} is a custom one and not annotated as `openended`, will treat as {default}. ')
    return default


def DATASET_MODALITY(dataset, *, default: str = 'IMAGE') -> str:
    if dataset is None:
        warnings.warn(f'Dataset is not specified, will treat modality as {default}. ')
        return default
    for cls in DATASET_CLASSES:
        if dataset in cls.supported_datasets():
            if hasattr(cls, 'MODALITY'):
                return cls.MODALITY
    # Have to add specific routine to handle ConcatDataset
    if dataset in ConcatDataset.DATASET_SETS:
        dataset_list = ConcatDataset.DATASET_SETS[dataset]
        MODALITIES = [DATASET_MODALITY(dname) for dname in dataset_list]
        assert np.all([x == MODALITIES[0] for x in MODALITIES]), (dataset_list, MODALITIES)
        return MODALITIES[0]

    if 'VIDEO' in dataset.lower():
        return 'VIDEO'
    elif 'IMAGE' in dataset.lower():
        return 'IMAGE'
    warnings.warn(f'Dataset {dataset} is a custom one, will treat modality as {default}. ')
    return default


def build_dataset(dataset_name, **kwargs):
    for cls in DATASET_CLASSES:
        if dataset_name in supported_video_datasets:
            return supported_video_datasets[dataset_name](**kwargs)
        elif dataset_name in cls.supported_datasets():
            return cls(dataset=dataset_name, **kwargs)

    warnings.warn(f'Dataset {dataset_name} is not officially supported. ')

    data_file = osp.join(LMUDataRoot(), f'{dataset_name}.tsv')
    if not osp.exists(data_file):
        warnings.warn(f'Data file {data_file} does not exist. Dataset building failed. ')
        return None

    data = load(data_file)
    if 'question' not in [x.lower() for x in data.columns]:
        warnings.warn(f'Data file {data_file} does not have a `question` column. Dataset building failed. ')
        return None

    if 'A' in data and 'B' in data:
        if 'image' in data or 'image_path' in data:
            warnings.warn(f'Will assume unsupported dataset {dataset_name} as a Custom MCQ dataset. ')
            return CustomMCQDataset(dataset=dataset_name, **kwargs)
        else:
            warnings.warn(f'Will assume unsupported dataset {dataset_name} as a Custom Text MCQ dataset. ')
            return CustomTextMCQDataset(dataset=dataset_name, **kwargs)
    else:
        warnings.warn(f'Will assume unsupported dataset {dataset_name} as a Custom VQA dataset. ')
        return CustomVQADataset(dataset=dataset_name, **kwargs)


__all__ = [
    'build_dataset', 'img_root_map', 'build_judge', 'extract_answer_from_item',
    'prefetch_answer', 'DEBUG_MESSAGE'
] + [cls.__name__ for cls in DATASET_CLASSES]
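
Note: the snippet below is an illustrative sketch, not part of the committed file. It shows how build_dataset and the DATASET_TYPE / DATASET_MODALITY helpers defined above are typically used, here with the CMMMU_VAL benchmark registered later in this commit; it assumes the dataset TSV can be fetched into LMUDataRoot() on first use.

    # Illustrative sketch (not part of the commit): resolve a benchmark by name.
    from vlmeval.dataset import build_dataset, DATASET_TYPE, DATASET_MODALITY

    print(DATASET_TYPE('CMMMU_VAL'))        # 'VQA', taken from CMMMU.TYPE
    print(DATASET_MODALITY('CMMMU_VAL'))    # expected to resolve to 'IMAGE'
    dataset = build_dataset('CMMMU_VAL')    # may download the TSV listed in CMMMU.DATASET_URL
    print(type(dataset).__name__)           # 'CMMMU'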
VLMEvalKit/vlmeval/dataset/cmmmu.py  (new file, 0 → 100644)
from .image_base import ImageBaseDataset
import random
from collections import Counter
import os
import re
import tempfile
from ..smp import *


def get_multi_choice_prediction(response, all_choices, index2ans):
    for char in [',', '.', '!', '?', ';', ':', "'"]:
        response = response.strip(char)
    response = " " + response + " "  # add space to avoid partial match

    candidates = []

    for choice in all_choices:  # (A) (B) (C) (D)
        # Add the choice to candidates each time it appears in the response
        candidates.extend([choice for _ in range(response.count(f'({choice})'))])

    if len(candidates) == 0:
        for choice in all_choices:  # A B C D
            # Similarly, add the choice for each occurrence
            candidates.extend([choice for _ in range(response.count(f'{choice}'))])

    if len(candidates) == 0 and len(response.split()) >= 1:
        for index, ans in index2ans.items():
            # Add index for each occurrence of ans in response
            candidates.extend([index for _ in range(response.count(ans))])

    # if all of the above fail to produce candidates, check whether the content contains the answer text and try to parse it
    if len(candidates) == 0 and len(response.split()) >= 1:
        for index, ans in index2ans.items():
            if ans in response:
                candidates.append(index)
                # index_ans = False  # it's content ans.

    if len(candidates) == 0:
        # still no answer found, randomly choose one.
        return random.choice(all_choices)
        # return ''
    else:
        # Count the occurrence of each candidate
        candidate_counts = Counter(candidates)

        # Select the most frequent candidates
        max_count = max(candidate_counts.values())
        most_frequent_candidates = [c for c in all_choices if candidate_counts.get(c, 0) == max_count]

        # Combine the most frequent candidates in ABCD order
        return ''.join(most_frequent_candidates)


def extract_numbers(string):
    # Pattern for numbers with Chinese commas
    pattern_commas = r'-?\d{1,3}(?:,\d{3})+'
    # Pattern for scientific notation
    pattern_scientific = r'-?\d+(?:\.\d+)?[eE][+-]?\d+'
    # Pattern for simple numbers without Chinese commas
    pattern_simple = r'-?(?:\d+\.\d+|\.\d+|\d+)(?![eE][+-]?\d+)(?!,\d)'

    # Extract numbers with Chinese commas
    numbers_with_commas = re.findall(pattern_commas, string)
    # Extract numbers in scientific notation
    numbers_scientific = re.findall(pattern_scientific, string)
    # Extract simple numbers without Chinese commas
    numbers_simple = re.findall(pattern_simple, string)

    # Combine all extracted numbers
    all_numbers = numbers_with_commas + numbers_scientific + numbers_simple
    return all_numbers


def check_is_number(string):
    try:
        float(string.replace(',', ''))
        return True
    except ValueError:
        # check if there's a comma inside
        return False


def count_letters(string):
    return sum(c.isalpha() and 'a' <= c <= 'z' or 'A' <= c <= 'Z' for c in string)


def normalize_str(string, answer):
    # check the characters in the string
    # if it is a number, numerize it.
    if string is None:
        return [string]
    string = string.strip()

    is_number = check_is_number(string)
    if is_number:
        string = string.replace(',', '')
        string = float(string)
        # leave 2 decimals
        string = round(string, 2)
        return [string]
    else:
        # it's likely to be a string
        if len(string) > len(answer) + 20 or count_letters(string) > count_letters(answer) + 2:
            return []
        return [string]


def get_fill_blank_prediction(response, answer):
    """get the prediction from the generated response,
    return a list of predicted strings or numbers"""

    def get_key_subresponses(response):
        response = response.strip("。").strip()
        sub_responses = re.split(r'。|\n', response)
        indicators_of_keys = ['是', '为', '所以', '等于', '方案', '选择', '正确答案', '因此', '最后', '答案', '结果']
        key_responses = []
        for index, resp in enumerate(sub_responses):
            # if it is the last one, accept an equation (the entire response can be just one sentence with an equation)
            if index == len(sub_responses) - 1:
                indicators_of_keys.extend(['='])
            shortest_key_response = None
            # the shortest response that may contain the answer (tail part of the response)
            for indicator in indicators_of_keys:
                if indicator in resp:
                    if not shortest_key_response:
                        shortest_key_response = resp.split(indicator)[-1].strip()
                    else:
                        if len(resp.split(indicator)[-1].strip()) < len(shortest_key_response):
                            shortest_key_response = resp.split(indicator)[-1].strip()

            if shortest_key_response:
                # and it's not trivial
                if shortest_key_response.strip() not in [":", ",", ".", "!", "?", ";", ":", "'"]:
                    key_responses.append(shortest_key_response)
        if len(key_responses) == 0:
            # did not find any
            return [response]
        return key_responses

    key_responses = get_key_subresponses(response)

    pred_list = key_responses.copy()  # keep the original string response
    for resp in key_responses:
        pred_list.extend(extract_numbers(resp))

    tmp_pred_list = []
    for i in range(len(pred_list)):
        tmp_pred_list.extend(normalize_str(pred_list[i], answer))
    pred_list = tmp_pred_list

    # remove duplicates
    pred_list = list(set(pred_list))

    return pred_list


def get_TF_prediction(response):
    """get the prediction from the generated response,
    return a list of predicted strings or numbers"""

    def get_key_subresponses(response):
        response = response.strip("。").strip()
        sub_responses = re.split(r'。|\n', response)
        indicators_of_keys = ['是', '为', '所以', '判断', '陈述', '说法', '表达', '答案', '结果']
        key_responses = []
        for index, resp in enumerate(sub_responses):
            shortest_key_response = None
            # the shortest response that may contain the answer (tail part of the response)
            for indicator in indicators_of_keys:
                if indicator in resp:
                    if not shortest_key_response:
                        shortest_key_response = resp.split(indicator)[-1].strip()
                    else:
                        if len(resp.split(indicator)[-1].strip()) < len(shortest_key_response):
                            shortest_key_response = resp.split(indicator)[-1].strip()

            if shortest_key_response:
                # and it's not trivial
                if shortest_key_response.strip() not in [":", ",", ".", "!", "?", ";", ":", "'"]:
                    key_responses.append(shortest_key_response)
        if len(key_responses) == 0:
            # did not find any
            return [response]
        return key_responses

    key_responses = get_key_subresponses(response)

    pred_list = key_responses.copy()  # keep the original string response
    # remove duplicates
    pred_list = list(set(pred_list))

    return pred_list
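
Note: the snippet below is an illustrative sketch, not part of the committed file. It exercises the answer-extraction helpers above on made-up model responses (the options and response strings are hypothetical) and assumes the module is importable as vlmeval.dataset.cmmmu.

    # Illustrative sketch (not part of the commit): run the parsing helpers on fake responses.
    from vlmeval.dataset.cmmmu import get_multi_choice_prediction, get_fill_blank_prediction

    index2ans = {'A': '红色', 'B': '蓝色', 'C': '绿色', 'D': '黄色'}   # hypothetical options
    print(get_multi_choice_prediction('经过分析,正确答案是 (B)。', ['A', 'B', 'C', 'D'], index2ans))  # -> 'B'
    print(get_fill_blank_prediction('所以结果等于 3,600 千克', '3600'))  # includes 3600.0 among the normalized candidates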
class CMMMU(ImageBaseDataset):
    TYPE = 'VQA'

    DATASET_URL = {
        'CMMMU_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/CMMMU_VAL.tsv'
    }

    DATASET_MD5 = {
        'CMMMU_VAL': 'b4727e2fce2415bf646379e60c11a726'
    }

    def dump_image(self, line):
        os.makedirs(self.img_root, exist_ok=True)

        tgt_path_z = []
        if isinstance(line['image'], list):
            for i in range(len(line['image'])):
                tgt_path = osp.join(self.img_root, f"{line['index']}--{i + 1}.jpg")
                if not read_ok(tgt_path):
                    decode_base64_to_image_file(line['image'][i], tgt_path)
                tgt_path_z.append(tgt_path)
        else:
            tgt_path = osp.join(self.img_root, f"{line['index']}.jpg")
            if not read_ok(tgt_path):
                decode_base64_to_image_file(line['image'], tgt_path)
            tgt_path_z.append(tgt_path)
        return tgt_path_z

    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        suffix = eval_file.split('.')[-1]
        result_file = eval_file.replace(f'.{suffix}', '_acc.csv')

        if not osp.exists(result_file):
            data = load(eval_file)
            assert 'answer' in data and 'prediction' in data
            data['prediction'] = [str(x) for x in data['prediction']]
            data['answer'] = [str(x) for x in data['answer']]

            correct_count = 0
            correct_category = {
                '技术与工程': [0, 0],
                '科学': [0, 0],
                '健康与医学': [0, 0],
                '商业': [0, 0],
                '艺术与设计': [0, 0],
                '人文社会科学': [0, 0],
            }

            for i in tqdm(data.iterrows()):
                line = i[1]
                correct_category[line['category']][0] += 1

                # Options
                if line['type'] == '选择':
                    index2ans = {
                        'A': line['option1'],
                        'B': line['option2'],
                        'C': line['option3'],
                        'D': line['option4']
                    }
                    fact_option = get_multi_choice_prediction(line['prediction'], ['A', 'B', 'C', 'D'], index2ans)
                    if fact_option == line['answer']:
                        correct_count += 1
                        correct_category[line['category']][1] += 1

                # Binary
                elif line['type'] == '判断':
                    positive_keywords = ['正确', '对', '准确', '肯定', '对的']
                    negative_keywords = ['不对', '错误', '不正确', '不准确', '不合适', '否定', '错的', '错']
                    ambiguous_keywords = ['对错', '是否正确', '否正确', '或者', '是否', '正确性', '对不']

                    def judge_similarity(pred_list, positive_keywords, negative_keywords):
                        positive_count = 0
                        negative_count = 0

                        for pred in pred_list:
                            if any(pos_word in pred for pos_word in positive_keywords):
                                positive_count += 1
                            elif any(neg_word in pred for neg_word in negative_keywords):
                                negative_count += 1

                        if positive_count > negative_count:
                            return "对"
                        elif negative_count > positive_count:
                            return "错"
                        else:
                            return random.choice(['对', '错'])

                    answer = get_TF_prediction(line['prediction'])
                    answer = [word for word in answer if not any(ambiguous in word for ambiguous in ambiguous_keywords)]
                    fact_answer = judge_similarity(answer, positive_keywords, negative_keywords)
                    if fact_answer == line['answer']:
                        correct_count += 1
                        correct_category[line['category']][1] += 1

                # Fill in the Blank
                else:
                    norm_answers = normalize_str(line['answer'], line['answer'])
                    predicted_answer = get_fill_blank_prediction(line['prediction'], line['answer'])

                    for pred in predicted_answer:
                        # already normalized
                        if isinstance(pred, str):
                            # if it's a string, then find if ans in the pred_i
                            for norm_ans in norm_answers:
                                # only see if the string answer is in the string pred
                                # print(norm_ans, pred)
                                if isinstance(norm_ans, str) and norm_ans in pred:
                                    correct_count += 1
                                    correct_category[line['category']][1] += 1
                        else:
                            # it's a number
                            if pred in norm_answers:
                                correct_count += 1
                                correct_category[line['category']][1] += 1

            accuracyz = {}
            accuracyz['总准确率'] = correct_count / len(data)
            for i in correct_category.keys():
                accuracyz[i] = correct_category[i][1] / correct_category[i][0]

            accuracyz = d2df(accuracyz)
            accuracyz.round(10)
            dump(accuracyz, result_file)

        result = pd.read_csv(result_file
)
return
result
def
build_prompt
(
self
,
line
):
if
line
[
'type'
]
==
'选择'
:
tgt_path
=
self
.
dump_image
(
line
)
question
=
line
[
'question'
]
options_prompt
=
'Options:
\n
'
for
i
in
[[
'A'
,
'1'
],
[
'B'
,
'2'
],
[
'C'
,
'3'
],
[
'D'
,
'4'
]]:
options_prompt
+=
i
[
0
]
+
'. '
+
line
[
'option'
+
i
[
1
]]
+
'
\n
'
prompt
=
(
f
'问题:
{
question
}
\n
'
+
options_prompt
+
'请回答上述多项选择题,并选出正确选项。这些题目可能包括单选和多选题型。如果所提供的信息不足以确定一个明确的答案,那么请根据可用的数据和你的判断来选择最可能正确的选项。'
)
msgs
=
[]
if
isinstance
(
tgt_path
,
list
):
msgs
.
extend
([
dict
(
type
=
'image'
,
value
=
p
)
for
p
in
tgt_path
])
else
:
msgs
=
[
dict
(
type
=
'image'
,
value
=
tgt_path
)]
msgs
.
append
(
dict
(
type
=
'text'
,
value
=
prompt
))
return
msgs
elif
line
[
'type'
]
==
'判断'
:
msgs
=
super
().
build_prompt
(
line
)
assert
msgs
[
-
1
][
'type'
]
==
'text'
msgs
[
-
1
][
'value'
]
+=
'
\n
请回答上述判断题,并根据题目描述和所给的信息来判断问题中陈述的对错。如果信息不完整或不足以作出绝对判断,请运用你的逻辑推理和现有信息来做出最可能的判断。'
return
msgs
else
:
msgs
=
super
().
build_prompt
(
line
)
assert
msgs
[
-
1
][
'type'
]
==
'text'
msgs
[
-
1
][
'value'
]
+=
'
\n
请回答上述填空题,并根据题目的要求和所提供的信息来给出最恰当的答案。如果信息不足以确切回答,那么请依据现有的数据和你的推理能力来填写最合理的答案。'
return
msgs
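For reference, a minimal sketch of how the fill-in-the-blank helpers above combine during scoring, mirroring the "Fill the Blank" branch of CMMMU.evaluate. The sample prediction/answer strings are invented for illustration, and the sketch assumes the package is importable as vlmeval.

# Illustrative only: reproduce the fill-blank matching outside evaluate().
from vlmeval.dataset.cmmmu import get_fill_blank_prediction, normalize_str

prediction = '根据图中数据,计算得到的结果是 3.5 厘米。'   # made-up model response
answer = '3.5'                                              # made-up ground truth

norm_answers = normalize_str(answer, answer)                # normalized ground truth (numbers become floats)
candidates = get_fill_blank_prediction(prediction, answer)  # tail substrings plus extracted numbers

hit = any(
    (isinstance(c, str) and any(isinstance(a, str) and a in c for a in norm_answers))
    or (not isinstance(c, str) and c in norm_answers)
    for c in candidates
)
print(hit)  # True when any normalized candidate matches the normalized answer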
VLMEvalKit/vlmeval/dataset/dude.py
import math
from typing import List
from .utils.judge_util import build_judge
from .image_base import ImageBaseDataset
from .mmlongbench import concat_images, MMLongBench_auxeval, anls_compute
from ..smp import *

FAIL_MSG = 'Failed to obtain answer via API.'


def DUDE_acc(result_file):
    data = load(result_file)
    overall_score = 0.0
    score_list = list()
    for i in range(len(data)):
        item = data.iloc[i]
        if isinstance(item['answer'], float) and math.isnan(item['answer']):
            item['answer'] = 'Not answerable'

        item['answer'] = item['answer'].lower()
        item['pred'] = item['pred'].lower()
        score = anls_compute(item['answer'], item['pred'])
        score_list.append(score)
        overall_score += score

    data['score'] = score_list
    dump(data, result_file)

    res = dict()
    res['category'], res['num'], res['avg_score'] = ['anls'], [len(data)], [overall_score / len(data)]
    res = pd.DataFrame(res)
    return res


class DUDE(ImageBaseDataset):

    TYPE = 'VQA'

    DATASET_URL = {
        'DUDE': 'https://opencompass.openxlab.space/utils/VLMEval/DUDE.tsv',
        'DUDE_MINI': 'https://opencompass.openxlab.space/utils/VLMEval/DUDE_MINI.tsv',
    }
    DATASET_MD5 = {
        'DUDE': '130d860d08206e1e407cd77150c10d88',
        'DUDE_MINI': 'e0c0d998114f0cca7516d12039d2b538',
    }

    SUPPORTED_MODELS = {
        'GPT4': (1, 1),
        'GPT4V': (1, 1),
        'GPT4V_HIGH': (1, 1),
        'GPT4o': (1, 1),
        'GPT4o_HIGH': (1, 1),
        'GPT4o_MINI': (1, 1),
        'XComposer2d5': (1, -1),
        'XComposer2_4KHD': (1, -1),
        'MiniCPM-Llama3-V-2_5': (1, 5),
        'InternVL-Chat-V1-5': (5, 2),
    }

    def __init__(self, dataset, **kwargs):
        self.model_list = list(self.SUPPORTED_MODELS.keys())
        model_name = kwargs['model']
        if not listinstr(self.model_list, model_name):
            raise AssertionError("{} doesn't support the evaluation on DUDE.".format(model_name))
        super(DUDE, self).__init__(dataset)

        self.is_api = True if listinstr(['GPT4'], model_name) else False
        self.max_pages = 120
        concat_num, column_num = self.SUPPORTED_MODELS.get(model_name)
        self.concat_num = concat_num
        self.column_num = column_num

    def prepare_tsv(self, url, file_md5=None):
        data_root = LMUDataRoot()
        os.makedirs(data_root, exist_ok=True)
        file_name = url.split('/')[-1]
        data_path = osp.join(data_root, file_name)
        if osp.exists(data_path) and (file_md5 is None or md5(data_path) == file_md5):
            pass
        else:
            warnings.warn('The dataset tsv is not downloaded')
            download_file(url, data_path)
        return load(data_path)

    def dump_image(self, origin_line):
        os.makedirs(self.img_root, exist_ok=True)
        try:
            import fitz
        except Exception as e:
            logging.critical(f'{type(e)}: {e}')
            logging.critical('Please use `pip install pymupdf` to parse PDF files.')

        line = origin_line.copy()
        if not isinstance(line['image_path'], List):
            line['image_path'] = [line['image_path']]
        line['image_path'] = line['image_path'][:self.max_pages]

        skip_pdf_parse = True
        for im_name in line['image_path']:
            path = osp.join(self.img_root, im_name)
            if not read_ok(path):
                skip_pdf_parse = False
                break

        # Just for being compatible with the zooped loop: zip(line['image'], line['image_path'])
        if skip_pdf_parse:
            line['image'] = line['image_path']
        else:
            pdf_data = base64.b64decode(line['image'])
            pdf_file = io.BytesIO(pdf_data)
            encoded_images = []
            with fitz.open(stream=pdf_file, filetype='pdf') as doc:
                doc = doc[:self.max_pages]
                for page in doc:
                    image = page.get_pixmap(dpi=144)
                    image_file = io.BytesIO(image.tobytes(output='png'))
                    image = Image.open(image_file)
                    encoded_image = encode_image_to_base64(image)
                    encoded_images.append(encoded_image)
            line['image'] = encoded_images
            print('process {}'.format(line['doc_id']))

        if 'image' in line:
            if isinstance(line['image'], list):
                tgt_path = []
                assert 'image_path' in line
                for img, im_name in zip(line['image'], line['image_path']):
                    path = osp.join(self.img_root, im_name)
                    if not read_ok(path):
                        decode_base64_to_image_file(img, path)
                    tgt_path.append(path)
            else:
                tgt_path = osp.join(self.img_root, f"{line['index']}.jpg")
                if not read_ok(tgt_path):
                    decode_base64_to_image_file(line['image'], tgt_path)
                tgt_path = [tgt_path]
        else:
            assert 'image_path' in line
            tgt_path = toliststr(line['image_path'])

        if self.concat_num > 0 and not self.is_api:
            concatenated_images = concat_images(tgt_path, max_concat=self.concat_num, column_num=self.column_num)

            old_tgt_path = tgt_path
            assert isinstance(old_tgt_path, list)
            if self.column_num != -1:
                tgt_path = [
                    '_'.join(old_tgt_path[0].split('_')[:-1]) + '_concat{}_{}.jpg'.format(self.concat_num, i)
                    for i in range(len(concatenated_images))
                ]
            else:
                tgt_path = ['_'.join(old_tgt_path[0].split('_')[:-1]) + '_concat_all.jpg']

            for path, concatenated_image in zip(tgt_path, concatenated_images):
                if not read_ok(path):
                    decode_base64_to_image_file(encode_image_to_base64(concatenated_image), path)
                    num_images, image_size = len(old_tgt_path), concatenated_image.size
                    print('concat {} images to a new one with size {}. save at {}'.format(num_images, image_size, path))
        return tgt_path

    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        logger = get_logger('Evaluation')
        model = judge_kwargs['model']

        suffix = eval_file.split('.')[-1]
        storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
        tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')

        if osp.exists(storage):
            logger.warning(f'GPT scoring file {storage} already exists, will reuse it in DUDE_eval. ')
        else:
            data = load(eval_file)
            model = build_judge(max_tokens=128, **judge_kwargs)
            lt = len(data)
            lines = [data.iloc[i] for i in range(lt)]
            tups = [(model, line) for line in lines]
            indices = [line['index'] for line in lines]

            ans = {}
            if osp.exists(tmp_file):
                ans = load(tmp_file)
            tups = [x for x, i in zip(tups, indices) if i not in ans]
            indices = [i for i in indices if i not in ans]

            if len(indices):
                new_results = list()
                for model, line in tqdm(tups):
                    res = MMLongBench_auxeval(model, line)
                    new_results.append(res)

                log_map, res_map, pred_map = {}, {}, {}
                all_inds = [line['index'] for line in lines]
                for k, v in zip(all_inds, new_results):
                    log_map[k] = v['log']
                    res_map[k] = v['res']
                    pred_map[k] = v['pred']
                data['res'] = [res_map[idx] for idx in data['index']]
                data['log'] = [log_map[idx] for idx in data['index']]
                data['pred'] = [pred_map[idx] for idx in data['index']]
                dump(data, storage)

        score = DUDE_acc(storage)
        score_pth = storage.replace('.xlsx', '_score.csv')

        dump(score, score_pth)
        logger.info(f'DUDE successfully finished evaluating {eval_file}, results saved in {score_pth}')
        logger.info('Score: ')
        logger.info(score)
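A small illustrative run of DUDE_acc on a hand-made result table, assuming the vlmeval package is importable. The column names follow the loop above; the file name is arbitrary and the answers/predictions are invented.

import pandas as pd
from vlmeval.smp import dump
from vlmeval.dataset.dude import DUDE_acc

demo = pd.DataFrame({
    'answer': ['Not answerable', '2021'],
    'pred': ['not answerable', '2020'],
})
dump(demo, 'dude_demo_result.xlsx')       # DUDE_acc reloads this file and writes per-item scores back
print(DUDE_acc('dude_demo_result.xlsx'))  # one-row frame with category='anls', num, avg_score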
VLMEvalKit/vlmeval/dataset/dynamath.py
import re
import json
import sympy as sp
import numpy as np
import pandas as pd
from sympy import simplify, Eq, sympify, Pow, pi
from sympy.parsing.latex import parse_latex
import sys
import math
import os
import os.path as osp
import argparse

from .image_base import ImageBaseDataset
from .utils import build_judge
from ..utils import track_progress_rich
from ..smp import load, dump, d2df, toliststr


def preprocess(str1):
    if 0 <= str1.find("{") < str1.rfind("}"):
        str1 = str1[str1.find("{"): str1.rfind("}") + 1]
    str2 = str1.replace("\\", "")
    str2 = str2.replace("\\n", "\n")
    return str2


def transfer(str1):
    if "\u03c0" in str1:
        strs = str1.split('\u03c0')
        str1 = strs[0]
        return float(str1) * np.pi
    else:
        return float(str1)


def parse_answer(answer, answer_type="multiple choice"):
    if answer_type == "float":
        if answer.isdigit():
            return True, float(answer)
        else:
            parts = answer.split(' ')
            answer = parts[0]
            try:
                answer = transfer(answer)
                return True, answer
            except:
                return False, None
    elif answer_type == "multiple choice":
        if len(answer) == 1:
            return True, answer.upper()
        else:
            in_flag = [ch in answer.upper() for ch in 'ABCDE']
            if sum(in_flag) == 1:
                for ch in 'ABCDE':
                    if ch in answer.upper():
                        return True, ch
            return False, None
    else:
        return True, answer


def DynaMath_auxeval(model, line):
    pred = line['prediction']
    pred = preprocess(pred)

    succeed, short_answer = None, None
    try:
        dj = json.loads(pred, strict=False)
        short_answer = dj.get("short answer")
        assert short_answer is not None
        succeed, short_answer = parse_answer(short_answer, answer_type=line['anwser_type'])
        assert succeed
    except:
        # Failed to parse the JSON, use an auxiliary LLM to get the short answer
        if line['answer_type'] == 'multiple choice':
            inst = "Output the corresponing choice option, such as 'A', 'B', 'C', 'D', in a single line."
        elif line['answer_type'] == 'float':
            inst = "Output a three-digit floating-point number in a single line."
        else:
            inst = (
                "Output a short answer in a single line. Any float numbers in the answer "
                "should be formatted as three-digit floating-point numbers."
            )
        prompt = f"Free-form answer: {pred}\nInstruction: {inst}"

        response = pred
        succeed, short_answer = parse_answer(response, line['answer_type'])
        if not succeed:
            response = model.generate(prompt)
            succeed, short_answer = parse_answer(response, line['answer_type'])

    if line['answer_type'] == 'float':
        if succeed:
            diff = float(short_answer) - float(line['answer'])
            if abs(diff) <= 0.001:
                return dict(parse=True, extracted=short_answer, correct=True)
            else:
                return dict(parse=True, extracted=short_answer, correct=False)
        else:
            return dict(parse=False, extracted=None, correct=False)
    elif line['answer_type'] == 'multiple choice':
        if succeed:
            return dict(parse=True, extracted=short_answer, correct=(short_answer == line['answer']))
        else:
            if line['answer'] in pred[:3].upper():
                return dict(parse=False, extracted=None, correct=True)
            else:
                return dict(parse=False, extracted=None, correct=False)
    else:
        if succeed:
            return dict(parse=True, extracted=short_answer, correct=(short_answer.lower() in line['answer'].lower()))
        else:
            return dict(parse=False, extracted=None, correct=(short_answer.lower() in line['answer'].lower()))


class Dynamath(ImageBaseDataset):

    TYPE = 'VQA'
    DATASET_URL = {'DynaMath': 'https://opencompass.openxlab.space/utils/VLMEval/DynaMath.tsv'}
    DATASET_MD5 = {'DynaMath': 'b8425ad9a7114571fc9366e013699494'}

    GUIDE = """
## Answer Instruction Please provide an answer to the question outlined above. Your response should adhere \
to the following JSON format, which includes two keys: 'solution' and 'short answer'. The 'solution' key can contain \
detailed steps needed to solve the question, and the 'short answer' key should provide a concise response. {INST}

Example of expected JSON response format:

"""
    EXAMPLE = {
        "solution": "[Detailed step-by-step explanation]",
        "short answer": "[Concise Answer]"
    }
    TEXT_EXAMPLE = json.dumps(EXAMPLE, indent=4)

    # Given one data record, return the built prompt (a multi-modal message), can override
    def build_prompt(self, line):
        if isinstance(line, int):
            line = self.data.iloc[line]

        if self.meta_only:
            tgt_path = toliststr(line['image_path'])
        else:
            tgt_path = self.dump_image(line)

        prompt = f"## Question\n{line['question']}"
        if line['answer_type'] == 'multiple choice':
            inst = "Provide the corresponing choice option in the 'short answer' key, such as 'A', 'B', 'C', or 'D'."
        elif line['answer_type'] == 'float':
            inst = "Format the answer as a three-digit floating-point number and provide it in the 'short answer' key."
        else:
            inst = "Float numbers in the answer should be formatted as three-digit floating-point numbers."
        prompt = prompt + self.GUIDE.format(INST=inst) + self.TEXT_EXAMPLE

        msgs = []
        if isinstance(tgt_path, list):
            msgs.extend([dict(type='image', value=p) for p in tgt_path])
        else:
            msgs = [dict(type='image', value=tgt_path)]
        msgs.append(dict(type='text', value=prompt))
        return msgs

    def evaluate(self, eval_file, **judge_kwargs):
        judge_name = judge_kwargs.pop('model', 'gpt-4o-mini')

        model = build_judge(model=judge_name, **judge_kwargs)
        suffix = eval_file.split('.')[-1]

        storage = eval_file.replace(f'.{suffix}', f'_{judge_name}.xlsx')  # noqa: F841
        score_file = eval_file.replace(f'.{suffix}', f'_{judge_name}_score.csv')  # noqa: F841
        tmp_file = eval_file.replace(f'.{suffix}', f'_{judge_name}.pkl')  # noqa: F841
        nproc = judge_kwargs.pop('nproc', 6)  # noqa: F841

        res = load(tmp_file) if os.path.exists(tmp_file) else {}
        res = {k: v for k, v in res.items() if v is not None}

        model.system_prompt = """\
You are a helpful assistant that helps me to format free-form answers into a short answer according to the instruction.
"""
        if not osp.exists(storage):
            data = load(eval_file)
            lt = len(data)
            payloads = [dict(model=model, line=data.iloc[i]) for i in range(lt) if data.iloc[i]['index'] not in res]
            keys = [idx for idx in data['index'] if idx not in res]

            if len(keys):
                results = track_progress_rich(
                    DynaMath_auxeval, payloads, nproc=nproc, save=tmp_file, keys=keys)
                for k, r in zip(keys, results):
                    res[k] = r

            data['parse'] = [res[idx]['parse'] for idx in data['index']]
            data['extracted'] = [res[idx]['extracted'] for idx in data['index']]
            data['correct'] = [res[idx]['correct'] for idx in data['index']]
            dump(data, storage)

        data = load(storage)
        # Calculate Average Accuracy
        score_avg = {}
        score_avg['Overall'] = np.mean(data['correct'])

        subs = set(data['subject'])
        for sub in subs:
            data_sub = data[data['subject'] == sub]
            score_avg[f'Subject-{sub}'] = np.mean(data_sub['correct'])

        lvls = set(data['knowledge_level'])
        for lvl in lvls:
            data_lvl = data[data['knowledge_level'] == lvl]
            score_avg[f'Level-{lvl}'] = np.mean(data_lvl['correct'])

        # Calculate the Worst Case Accuracy
        score_worst = {}
        data_worst = data[data['varid'] == 1]
        qid2corr = {idx: True for idx in data_worst['index']}
        lt = len(data)
        for i in range(lt):
            item = data.iloc[i]
            qid2corr[item['qid']] *= item['correct']
        data_worst['correct'] = [qid2corr[idx] for idx in data_worst['qid']]
        score_worst['Overall'] = np.mean(data_worst['correct'])

        subs = set(data_worst['subject'])
        for sub in subs:
            data_sub = data_worst[data_worst['subject'] == sub]
            score_worst[f'Subject-{sub}'] = np.mean(data_sub['correct'])

        lvls = set(data_worst['knowledge_level'])
        for lvl in lvls:
            data_lvl = data_worst[data_worst['knowledge_level'] == lvl]
            score_worst[f'Level-{lvl}'] = np.mean(data_lvl['correct'])

        d1 = {'Setting': 'Average'}
        d1.update(score_avg)
        d2 = {'Setting': 'Worst Case'}
        d2.update(score_worst)
        score = pd.concat([d2df(d1), d2df(d2)], ignore_index=True)

        dump(score, score_file)
        return score
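A quick illustration of parse_answer on the three answer types handled above; the inputs are made-up model outputs and the sketch assumes the vlmeval package is importable.

from vlmeval.dataset.dynamath import parse_answer

print(parse_answer('(B)', 'multiple choice'))            # (True, 'B') - exactly one option letter found
print(parse_answer('3.142 cm', 'float'))                 # (True, 3.142) - first token converted to float
print(parse_answer('0.5\u03c0 radians', 'float'))        # (True, ~1.571) - pi suffix handled by transfer()
print(parse_answer('x = 2y + 1', 'free form'))           # (True, 'x = 2y + 1') - other types pass through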
VLMEvalKit/vlmeval/dataset/image_base.py
import pandas as pd
from abc import abstractmethod
from ..smp import *


def img_root_map(dataset):
    if 'MM_NIAH' in dataset:
        return 'MMNIAH'
    if 'CRPE' in dataset:
        return 'CRPE'
    if 'OCRVQA' in dataset:
        return 'OCRVQA'
    if 'COCO_VAL' == dataset:
        return 'COCO'
    if 'MMMU' in dataset:
        return 'MMMU'
    if "QSpatial" in dataset:
        return "QSpatial"

    mmbench_root_map = {
        'MMBench_DEV_EN': 'MMBench', 'MMBench_TEST_EN': 'MMBench',
        'MMBench_DEV_CN': 'MMBench', 'MMBench_TEST_CN': 'MMBench',
        'MMBench': 'MMBench', 'MMBench_CN': 'MMBench',
        'MMBench_DEV_EN_V11': 'MMBench_V11', 'MMBench_TEST_EN_V11': 'MMBench_V11',
        'MMBench_DEV_CN_V11': 'MMBench_V11', 'MMBench_TEST_CN_V11': 'MMBench_V11',
        'MMBench_V11': 'MMBench', 'MMBench_CN_V11': 'MMBench',
    }
    if dataset in mmbench_root_map:
        return mmbench_root_map[dataset]
    return dataset


class ImageBaseDataset:

    MODALITY = 'IMAGE'
    DATASET_URL = {}
    DATASET_MD5 = {}

    def __init__(self, dataset='MMBench', skip_noimg=True):
        ROOT = LMUDataRoot()
        # You can override this variable to save image files to a different directory
        self.dataset_name = dataset
        self.img_root = osp.join(ROOT, 'images', img_root_map(dataset))

        data = self.load_data(dataset)
        self.skip_noimg = skip_noimg
        if skip_noimg and 'image' in data:
            data = data[~pd.isna(data['image'])]

        data['index'] = [str(x) for x in data['index']]

        self.meta_only = True

        # The image field can store the base64 encoded image or another question index (for saving space)
        if 'image' in data:
            data['image'] = [str(x) for x in data['image']]
            image_map = {x: y for x, y in zip(data['index'], data['image'])}
            for k in image_map:
                if len(image_map[k]) <= 64:
                    idx = image_map[k]
                    assert idx in image_map and len(image_map[idx]) > 64
                    image_map[k] = image_map[idx]

            images = [toliststr(image_map[k]) for k in data['index']]
            data['image'] = [x[0] if len(x) == 1 else x for x in images]
            self.meta_only = False

        if 'image_path' in data:
            paths = [toliststr(x) for x in data['image_path']]
            data['image_path'] = [x[0] if len(x) == 1 else x for x in paths]

        if np.all([istype(x, int) for x in data['index']]):
            data['index'] = [int(x) for x in data['index']]

        self.data = data
        self.post_build(dataset)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return dict(self.data.iloc[idx])

    def prepare_tsv(self, url, file_md5=None):
        data_root = LMUDataRoot()
        os.makedirs(data_root, exist_ok=True)
        update_flag = False
        file_name = url.split('/')[-1]
        data_path = osp.join(data_root, file_name)
        if osp.exists(data_path) and (file_md5 is None or md5(data_path) == file_md5):
            pass
        else:
            warnings.warn('The dataset tsv is not downloaded')
            download_file(url, data_path)
            update_flag = True

        if file_size(data_path, 'GB') > 1:
            local_path = data_path.replace('.tsv', '_local.tsv')
            if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL', None) or update_flag:
                from ..tools import LOCALIZE
                LOCALIZE(data_path, local_path)
            data_path = local_path
        return load(data_path)

    def dump_image(self, line):
        os.makedirs(self.img_root, exist_ok=True)

        if 'image' in line:
            if isinstance(line['image'], list):
                tgt_path = []
                assert 'image_path' in line
                for img, im_name in zip(line['image'], line['image_path']):
                    path = osp.join(self.img_root, im_name)
                    if not read_ok(path):
                        decode_base64_to_image_file(img, path)
                    tgt_path.append(path)
            else:
                tgt_path = osp.join(self.img_root, f"{line['index']}.jpg")
                if not read_ok(tgt_path):
                    decode_base64_to_image_file(line['image'], tgt_path)
                tgt_path = [tgt_path]
        else:
            assert 'image_path' in line
            tgt_path = toliststr(line['image_path'])

        return tgt_path

    def display(self, line):
        if isinstance(line, int):
            line = self.data.iloc[line]
        assert isinstance(line, pd.Series) or isinstance(line, dict)
        mmqa_display(line)

    # Return a list of dataset names that are supported by this class, can override
    @classmethod
    def supported_datasets(cls):
        return list(cls.DATASET_URL)

    # Given the dataset name, return the dataset as a pandas dataframe, can override
    def load_data(self, dataset):
        url = self.DATASET_URL[dataset]
        file_md5 = self.DATASET_MD5[dataset] if dataset in self.DATASET_MD5 else None
        return self.prepare_tsv(url, file_md5)

    # Post built hook, will be called after the dataset is built, can override
    def post_build(self, dataset):
        pass

    # Given one data record, return the built prompt (a multi-modal message), can override
    def build_prompt(self, line):
        if isinstance(line, int):
            line = self.data.iloc[line]

        if self.meta_only:
            tgt_path = toliststr(line['image_path'])
        else:
            tgt_path = self.dump_image(line)

        question = line['question']

        msgs = []
        if isinstance(tgt_path, list):
            msgs.extend([dict(type='image', value=p) for p in tgt_path])
        else:
            msgs = [dict(type='image', value=tgt_path)]
        msgs.append(dict(type='text', value=question))
        return msgs

    # Given the prediction file, return the evaluation results in the format of a dictionary or pandas dataframe
    @abstractmethod
    def evaluate(self, eval_file, **judge_kwargs):
        pass
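A minimal sketch of how a new benchmark could plug into the base class above: subclass, point DATASET_URL at a TSV, and implement evaluate. The URL and dataset name below are placeholders, not a real dataset.

from vlmeval.smp import load
from vlmeval.dataset.image_base import ImageBaseDataset

class MyToyDataset(ImageBaseDataset):
    TYPE = 'VQA'
    DATASET_URL = {'MyToy': 'https://example.com/MyToy.tsv'}  # placeholder URL
    DATASET_MD5 = {}                                          # empty dict skips md5 checking

    def evaluate(self, eval_file, **judge_kwargs):
        # exact-match accuracy over the 'prediction'/'answer' columns of the prediction file
        data = load(eval_file)
        acc = float((data['prediction'].astype(str) == data['answer'].astype(str)).mean())
        return {'acc': acc}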
VLMEvalKit/vlmeval/dataset/image_caption.py
from .image_base import ImageBaseDataset
from ..smp import *


class COCO_Caption_Scorer():
    def __init__(self, ref, gt):
        from pycocoevalcap.bleu.bleu import Bleu
        from pycocoevalcap.rouge.rouge import Rouge
        from pycocoevalcap.cider.cider import Cider

        self.ref = ref
        self.gt = gt
        print('setting up scorers...')
        self.scorers = [
            (Bleu(4), ['Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4']),
            (Rouge(), 'ROUGE_L'),
            (Cider(), 'CIDEr'),
        ]

    def compute_scores(self):
        total_scores = {}
        for scorer, method in self.scorers:
            print('computing %s score...' % (scorer.method()))
            score, scores = scorer.compute_score(self.gt, self.ref)
            if isinstance(method, list):
                for sc, scs, m in zip(score, scores, method):
                    print('%s: %0.3f' % (m, sc * 100))
                total_scores['Bleu'] = [x * 100 for x in score]
            else:
                print('%s: %0.3f' % (method, score * 100))
                total_scores[method] = score * 100

        print('*****DONE*****')
        for key, value in total_scores.items():
            print('{}:{}'.format(key, value))
        return total_scores


class ImageCaptionDataset(ImageBaseDataset):

    TYPE = 'Caption'

    DATASET_URL = {
        'COCO_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/COCO_VAL.tsv',
    }

    DATASET_MD5 = {
        'COCO_VAL': '72a5079dead060269ac222c5aa5128af',
    }

    def load_data(self, dataset):
        data = super().load_data(dataset)
        if 'question' not in data:
            data['question'] = [(
                'Please describe this image in general. Directly provide the description, '
                'do not include prefix like "This image depicts". '
            )] * len(data)
        return data

    # It returns a dictionary of scores
    @classmethod
    def evaluate(self, eval_file, **kwargs):
        data = load(eval_file)
        lt = len(data)
        lines = [data.iloc[i] for i in range(lt)]
        ref, gt = {}, {}
        for i, line in enumerate(lines):
            ref[str(i)] = [str(line['prediction'])]
            gt[str(i)] = eval(line['answer'])

        scorer = COCO_Caption_Scorer(ref, gt)
        coco_caption_score_dict = scorer.compute_scores()
        score_pth = eval_file.replace('.xlsx', '_score.json')
        dump(coco_caption_score_dict, score_pth)
        return coco_caption_score_dict
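A tiny standalone use of the scorer above, following the same ref/gt dictionary layout that ImageCaptionDataset.evaluate builds (keys are sample ids, values are lists of caption strings). It requires pycocoevalcap to be installed; the captions are invented for illustration.

from vlmeval.dataset.image_caption import COCO_Caption_Scorer

ref = {'0': ['a dog runs on the beach']}                        # model predictions
gt = {'0': ['a dog running along the beach', 'a dog on sand']}  # ground-truth captions
scores = COCO_Caption_Scorer(ref, gt).compute_scores()
print(scores['Bleu'], scores['ROUGE_L'], scores['CIDEr'])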
VLMEvalKit/vlmeval/dataset/image_mcq.py
import
warnings
from
.image_base
import
ImageBaseDataset
from
.utils
import
build_judge
,
DEBUG_MESSAGE
from
..smp
import
*
import
pandas
as
pd
MMMB_URLS
=
{
'MMMB_ar'
:
'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmmb/mmmb_ar.tsv'
,
'MMMB_cn'
:
'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmmb/mmmb_cn.tsv'
,
'MMMB_en'
:
'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmmb/mmmb_en.tsv'
,
'MMMB_pt'
:
'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmmb/mmmb_pt.tsv'
,
'MMMB_ru'
:
'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmmb/mmmb_ru.tsv'
,
'MMMB_tr'
:
'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmmb/mmmb_tr.tsv'
,
}
MTL_MMBench_URLS
=
{
'MMBench_dev_ar'
:
'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmbench/mmbench_dev_ar.tsv'
,
'MMBench_dev_cn'
:
'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmbench/mmbench_dev_cn.tsv'
,
'MMBench_dev_en'
:
'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmbench/mmbench_dev_en.tsv'
,
'MMBench_dev_pt'
:
'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmbench/mmbench_dev_pt.tsv'
,
'MMBench_dev_tr'
:
'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmbench/mmbench_dev_tr.tsv'
,
'MMBench_dev_ru'
:
'https://huggingface.co/datasets/AIDC-AI/Parrot-dataset/resolve/main/mmbench/mmbench_dev_ru.tsv'
,
}
MMMB_MD5
=
{
'MMMB_ar'
:
'f3a18b6385f1d9701840aa42de27aead'
,
'MMMB_cn'
:
'13ed82fa89730037292fcaa27f08f430'
,
'MMMB_en'
:
'1cd781a71ec5a2983c090b84105d6a01'
,
'MMMB_pt'
:
'548ea2b3bb2da991790386f0015d30d1'
,
'MMMB_ru'
:
'ce1cc8a0533425ab0d86b326ebfc2984'
,
'MMMB_tr'
:
'0733739d43090327975294292bc5cd67'
}
MTL_MMBench_MD5
=
{
'MMBench_dev_ar'
:
'4271b4a0d0200e1a86380a878e0d64a4'
,
'MMBench_dev_cn'
:
'2ed5135326fed02c8e51ea50dda8222f'
,
'MMBench_dev_en'
:
'd9ab776fc018b3d45785e9a5c23431c2'
,
'MMBench_dev_pt'
:
'4ddfbcd27ef12444b908c03831cd0295'
,
'MMBench_dev_tr'
:
'4fab39d501389d3d6cc90264bb708f11'
,
'MMBench_dev_ru'
:
'5ba1171ff2e68f80637bf78349e402a5'
}
class
ImageMCQDataset
(
ImageBaseDataset
):
TYPE
=
'MCQ'
DATASET_URL
=
{
# MMBench v1.0
'MMBench_DEV_EN'
:
'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_DEV_EN.tsv'
,
'MMBench_TEST_EN'
:
'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_TEST_EN.tsv'
,
'MMBench_DEV_CN'
:
'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_DEV_CN.tsv'
,
'MMBench_TEST_CN'
:
'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_TEST_CN.tsv'
,
'MMBench'
:
'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench.tsv'
,
# Internal
'MMBench_CN'
:
'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_CN.tsv'
,
# Internal
# MMBench v1.1
'MMBench_DEV_EN_V11'
:
'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_DEV_EN_V11.tsv'
,
'MMBench_TEST_EN_V11'
:
'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_TEST_EN_V11.tsv'
,
'MMBench_DEV_CN_V11'
:
'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_DEV_CN_V11.tsv'
,
'MMBench_TEST_CN_V11'
:
'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_TEST_CN_V11.tsv'
,
'MMBench_V11'
:
'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_V11.tsv'
,
# Internal
'MMBench_CN_V11'
:
'https://opencompass.openxlab.space/utils/benchmarks/MMBench/MMBench_CN_V11.tsv'
,
# Internal
# SEEDBench Series
'SEEDBench_IMG'
:
'https://opencompass.openxlab.space/utils/benchmarks/SEEDBench/SEEDBench_IMG.tsv'
,
'SEEDBench2'
:
'https://huggingface.co/datasets/VLMEval/SEEDBench2/resolve/main/SEEDBench2.tsv'
,
'SEEDBench2_Plus'
:
'https://opencompass.openxlab.space/utils/benchmarks/SEEDBench/SEEDBench2_Plus.tsv'
,
# ScienceQA Series
'ScienceQA_VAL'
:
'https://opencompass.openxlab.space/utils/benchmarks/ScienceQA/ScienceQA_VAL.tsv'
,
'ScienceQA_TEST'
:
'https://opencompass.openxlab.space/utils/benchmarks/ScienceQA/ScienceQA_TEST.tsv'
,
# MMT-Bench
'MMT-Bench_ALL_MI'
:
'https://opencompass.openxlab.space/utils/benchmarks/MMT-Bench/MMT-Bench_ALL_MI.tsv'
,
'MMT-Bench_ALL'
:
'https://opencompass.openxlab.space/utils/benchmarks/MMT-Bench/MMT-Bench_ALL.tsv'
,
'MMT-Bench_VAL_MI'
:
'https://opencompass.openxlab.space/utils/benchmarks/MMT-Bench/MMT-Bench_VAL_MI.tsv'
,
'MMT-Bench_VAL'
:
'https://opencompass.openxlab.space/utils/benchmarks/MMT-Bench/MMT-Bench_VAL.tsv'
,
# AesBench
'AesBench_VAL'
:
'https://huggingface.co/datasets/VLMEval/AesBench/resolve/main/AesBench_VAL.tsv'
,
'AesBench_TEST'
:
'https://huggingface.co/datasets/VLMEval/AesBench/resolve/main/AesBench_TEST.tsv'
,
# Q-Bench1
'Q-Bench1_VAL'
:
'https://huggingface.co/datasets/zhangzicheng/qbench_tsv/resolve/main/Q-Bench1_VAL.tsv'
,
'Q-Bench1_TEST'
:
'https://huggingface.co/datasets/zhangzicheng/qbench_tsv/resolve/main/Q-Bench1_TEST.tsv'
,
# A-Bench
'A-Bench_VAL'
:
'https://huggingface.co/datasets/zhangzicheng/abench_tsv/resolve/main/A-bench_VAL.tsv'
,
'A-Bench_TEST'
:
'https://huggingface.co/datasets/zhangzicheng/abench_tsv/resolve/main/A-bench_TEST.tsv'
,
# R-Bench
'R-Bench-Dis'
:
'https://huggingface.co/datasets/lcysyzxdxc/R-Bench/blob/main/R-bench-dis.tsv'
,
'R-Bench-Ref'
:
'https://huggingface.co/datasets/lcysyzxdxc/R-Bench/blob/main/R-bench-ref.tsv'
,
# Other Benchmarks
'CCBench'
:
'https://opencompass.openxlab.space/utils/VLMEval/CCBench.tsv'
,
'AI2D_TEST'
:
'https://opencompass.openxlab.space/utils/VLMEval/AI2D_TEST.tsv'
,
'AI2D_TEST_NO_MASK'
:
'https://opencompass.openxlab.space/utils/VLMEval/AI2D_TEST_NO_MASK.tsv'
,
'MMStar'
:
'https://opencompass.openxlab.space/utils/VLMEval/MMStar.tsv'
,
'RealWorldQA'
:
'https://opencompass.openxlab.space/utils/VLMEval/RealWorldQA.tsv'
,
'MLLMGuard_DS'
:
'https://opencompass.openxlab.space/utils/VLMEval/MLLMGuard_DS.tsv'
,
'BLINK'
:
'https://opencompass.openxlab.space/utils/VLMEval/BLINK.tsv'
,
'TaskMeAnything_v1_imageqa_random'
:
(
'https://huggingface.co/datasets/weikaih/TaskMeAnything-v1-imageqa-random/'
'resolve/main/TaskMeAnything-v1-imageqa-random.tsv'
),
'A-OKVQA'
:
'https://huggingface.co/datasets/Allen8/A-OKVQA/resolve/main/a-okvqa.tsv'
,
'WorldMedQA-V'
:
'https://opencompass.openxlab.space/utils/VLMEval/WorldMedQA-V.tsv'
,
'VisOnlyQA-VLMEvalKit'
:
(
'https://huggingface.co/datasets/ryokamoi/VisOnlyQA_Eval_Real/'
'resolve/main/visonlyqa_vlmevalkit.tsv'
),
}
DATASET_MD5
=
{
# MMBench v1.0
'MMBench_DEV_EN'
:
'b6caf1133a01c6bb705cf753bb527ed8'
,
'MMBench_TEST_EN'
:
'6939fadb0ce626fefc0bdc9c64efc528'
,
'MMBench_DEV_CN'
:
'08b8fc3324a5ed74155350f57be69fbd'
,
'MMBench_TEST_CN'
:
'7e1239baf0ee4c8b513e19705a0f317e'
,
'MMBench'
:
'4115aea3383f3dd0083be6a633e0f820'
,
# Internal Only
'MMBench_CN'
:
'2e053ffc90ea598b1feae13c36dc13ee'
,
# Internal Only
# MMBench v1.1
'MMBench_DEV_EN_V11'
:
'30c05be8f2f347a50be25aa067248184'
,
'MMBench_TEST_EN_V11'
:
'26f0f15381a21720255091d3e0316ce6'
,
'MMBench_DEV_CN_V11'
:
'593f9b5f6bea453d870a798b34ae4f37'
,
'MMBench_TEST_CN_V11'
:
'74bbe4556dac745613c7cbe5ad787050'
,
'MMBench_V11'
:
'b9276414f57af1308dcc4d0cd9b42e7c'
,
# Internal Only
'MMBench_CN_V11'
:
'95f6980dd1b4de38e3cbffe0305a3f25'
,
# Internal Only
# SEEDBench
'SEEDBench_IMG'
:
'68017231464752261a2526d6ca3a10c0'
,
'SEEDBench2'
:
'4ec15cf864c4f16274112284f531813e'
,
'SEEDBench2_Plus'
:
'e32d3216dc4f452b0fe497a52015d1fd'
,
# ScienceQA
'ScienceQA_VAL'
:
'96320d05e142e585e7204e72affd29f3'
,
'ScienceQA_TEST'
:
'e42e9e00f9c59a80d8a5db35bc32b71f'
,
# MMT-Bench
'MMT-Bench_ALL_MI'
:
'5272157097e19cdd7cb41e412ab3b7c7'
,
'MMT-Bench_ALL'
:
'b273a2f4c596fe4f2605de0494cd632f'
,
'MMT-Bench_VAL_MI'
:
'c7d7b998eb5cd9aa36c7d4f721472462'
,
'MMT-Bench_VAL'
:
'8dd4b730f53dbf9c3aed90ca31c928e0'
,
# AesBench
'AesBench_VAL'
:
'3edb0c319e9187aa0b97fe7a11700a8c'
,
'AesBench_TEST'
:
'58b1f7ba2cc32e1d68896d6ee716bbf8'
,
# Q-Bench1
'Q-Bench1_VAL'
:
'837bdb6cd2da571713543462815187b7'
,
'Q-Bench1_TEST'
:
'15e759bfd58c9d5f30b23a317d347153'
,
# A-Bench
'A-Bench_VAL'
:
'218563ec50d34bb336c814143a5bb9c1'
,
'A-Bench_TEST'
:
'567013fb033a20cf23f51d8e865bd16c'
,
# R-Bench
'R-Bench-Dis'
:
'd6e961dbfc43350688af2560226830b4'
,
'R-Bench-Ref'
:
'270c1cb555acb523f3fdb178ed57021d'
,
# Other Benchmarks
'CCBench'
:
'f5dde47f24dc5a6fb6e595b409b466ac'
,
'AI2D_TEST'
:
'0f593e0d1c7df9a3d69bf1f947e71975'
,
'AI2D_TEST_NO_MASK'
:
'fd8f463634d4fe9fbd23b876e8eea5be'
,
'MMStar'
:
'e1ecd2140806c1b1bbf54b43372efb9e'
,
'RealWorldQA'
:
'92321028d2bc29040284b6674721e48f'
,
'MLLMGuard_DS'
:
'975fc0dd7119386e198c37d71e274b3f'
,
'BLINK'
:
'3b6649b6a662184ea046908e5506260e'
,
'TaskMeAnything_v1_imageqa_random'
:
'023fef69e2ca21827afb77c5ec3bc889'
,
'WorldMedQA-V'
:
'441e63875e30c87f5750528b57b41285'
,
"VisOnlyQA-VLMEvalKit"
:
'cf460a31d2acb8d3a7cecd0e69298bfa'
,
}
DATASET_URL
.
update
(
MMMB_URLS
)
DATASET_URL
.
update
(
MTL_MMBench_URLS
)
DATASET_MD5
.
update
(
MMMB_MD5
)
DATASET_MD5
.
update
(
MTL_MMBench_MD5
)
def
build_prompt
(
self
,
line
):
if
isinstance
(
line
,
int
):
line
=
self
.
data
.
iloc
[
line
]
if
self
.
meta_only
:
tgt_path
=
toliststr
(
line
[
'image_path'
])
else
:
tgt_path
=
self
.
dump_image
(
line
)
question
=
line
[
'question'
]
options
=
{
cand
:
line
[
cand
]
for
cand
in
string
.
ascii_uppercase
if
cand
in
line
and
not
pd
.
isna
(
line
[
cand
])
}
options_prompt
=
'Options:
\n
'
for
key
,
item
in
options
.
items
():
options_prompt
+=
f
'
{
key
}
.
{
item
}
\n
'
hint
=
line
[
'hint'
]
if
(
'hint'
in
line
and
not
pd
.
isna
(
line
[
'hint'
]))
else
None
prompt
=
''
if
hint
is
not
None
:
prompt
+=
f
'Hint:
{
hint
}
\n
'
prompt
+=
f
'Question:
{
question
}
\n
'
if
len
(
options
):
prompt
+=
options_prompt
prompt
+=
'Please select the correct answer from the options above.
\n
'
msgs
=
[]
if
isinstance
(
tgt_path
,
list
):
msgs
.
extend
([
dict
(
type
=
'image'
,
value
=
p
)
for
p
in
tgt_path
])
else
:
msgs
=
[
dict
(
type
=
'image'
,
value
=
tgt_path
)]
msgs
.
append
(
dict
(
type
=
'text'
,
value
=
prompt
))
return
msgs
def
evaluate
(
self
,
eval_file
,
**
judge_kwargs
):
from
.utils.multiple_choice
import
report_acc
,
report_acc_MMT
,
mcq_circular_eval
,
mcq_vanilla_eval
# assert dataset is not None
dataset_map
=
{
'MMBench_TEST_EN'
:
'MMBench'
,
'MMBench_TEST_EN_V11'
:
'MMBench_V11'
,
'MMBench_TEST_CN'
:
'MMBench_CN'
,
'MMBench_TEST_CN_V11'
:
'MMBench_CN_V11'
}
dataset
=
self
.
dataset_name
if
dataset
in
dataset_map
:
dataset
=
dataset_map
[
dataset
]
nproc
=
judge_kwargs
.
pop
(
'nproc'
,
4
)
circular
=
False
if
listinstr
([
'mmbench'
,
'ccbench'
],
dataset
.
lower
()):
data
=
load
(
eval_file
)
data
[
'index'
]
=
[
int
(
x
)
for
x
in
data
[
'index'
]]
dump
(
data
,
eval_file
)
circular
=
True
suffix
=
eval_file
.
split
(
'.'
)[
-
1
]
model
=
judge_kwargs
.
get
(
'model'
,
'exact_matching'
)
assert
model
in
[
'chatgpt-0125'
,
'exact_matching'
,
'gpt-4-0125'
]
name_str_map
=
{
'chatgpt-0125'
:
'openai'
,
'gpt-4-0125'
:
'gpt4'
}
name_str
=
name_str_map
[
model
]
if
model
in
name_str_map
else
model
if
model
==
'exact_matching'
:
model
=
None
elif
gpt_key_set
():
model
=
build_judge
(
**
judge_kwargs
)
if
not
model
.
working
():
warnings
.
warn
(
'OPENAI API is not working properly, will use exact matching for evaluation'
)
warnings
.
warn
(
DEBUG_MESSAGE
)
model
=
None
else
:
warnings
.
warn
(
'OPENAI_API_KEY is not set properly, will use exact matching for evaluation'
)
model
=
None
result_file
=
eval_file
.
replace
(
f
'.
{
suffix
}
'
,
f
'_
{
name_str
}
_result.pkl'
)
data
=
load
(
eval_file
)
data
=
data
.
sort_values
(
by
=
'index'
)
data
[
'prediction'
]
=
[
str
(
x
)
for
x
in
data
[
'prediction'
]]
# If not choice label, then use lower case
for
k
in
data
.
keys
():
data
[
k
.
lower
()
if
k
not
in
list
(
string
.
ascii_uppercase
)
else
k
]
=
data
.
pop
(
k
)
meta
=
self
.
data
meta_q_map
=
{
x
:
y
for
x
,
y
in
zip
(
meta
[
'index'
],
meta
[
'question'
])}
data_map
=
{
x
:
y
for
x
,
y
in
zip
(
data
[
'index'
],
data
[
'question'
])}
for
k
in
data_map
:
assert
k
in
meta_q_map
,
(
f
'eval_file should be the same as or a subset of dataset
{
self
.
dataset_name
}
'
)
if
circular
:
data
=
mcq_circular_eval
(
model
,
data
,
meta
,
nproc
,
result_file
,
self
.
dataset_name
)
else
:
data
=
mcq_vanilla_eval
(
model
,
data
,
meta
,
nproc
,
result_file
,
self
.
dataset_name
)
# load split
dump
(
data
,
eval_file
.
replace
(
f
'.
{
suffix
}
'
,
f
'_
{
name_str
}
_result.
{
suffix
}
'
))
data
=
load
(
eval_file
.
replace
(
f
'.
{
suffix
}
'
,
f
'_
{
name_str
}
_result.
{
suffix
}
'
))
# May have different report acc functions for different datasets
if
'MMT'
in
dataset
:
acc
=
report_acc_MMT
(
data
)
else
:
acc
=
report_acc
(
data
)
score_file
=
eval_file
.
replace
(
f
'.
{
suffix
}
'
,
'_acc.csv'
)
dump
(
acc
,
score_file
)
if
dataset
==
'AesBench_VAL'
:
warnings
.
warn
(
'Note that AesBench VAL is just a toy version of AesBench TEST. For full results,
\
please evaluate on AesBench TEST. The AesBench TEST dataset is more than 20 times
\
larger than the VAL dataset and the leaderboard results are based on AesBench TEST.'
)
if
dataset
==
'VisOnlyQA-VLMEvalKit'
:
warnings
.
warn
(
'Note that the results on VisOnlyQA-VLMEvalKit are different from the results on
\
the original VisOnlyQA. VisOnlyQA-VLMEvalKit does not include the
\
chemistry__shape_multi split and uses a different evaluation prompt. Please
\
explicitly specify the version of the dataset when you report results.'
)
return
acc
class
MMMUDataset
(
ImageMCQDataset
):
DATASET_URL
=
{
'MMMU_DEV_VAL'
:
'https://opencompass.openxlab.space/utils/VLMEval/MMMU_DEV_VAL.tsv'
,
'MMMU_TEST'
:
'https://opencompass.openxlab.space/utils/VLMEval/MMMU_TEST.tsv'
,
}
DATASET_MD5
=
{
'MMMU_DEV_VAL'
:
'521afc0f3bf341e6654327792781644d'
,
'MMMU_TEST'
:
'c19875d11a2d348d07e5eb4bdf33166d'
,
}
@
staticmethod
def
split_MMMU
(
msgs
):
text
,
images
=
None
,
[]
for
s
in
msgs
:
if
s
[
'type'
]
==
'image'
:
images
.
append
(
s
[
'value'
])
elif
s
[
'type'
]
==
'text'
:
assert
text
is
None
text
=
s
[
'value'
]
text_segs
=
text
.
split
(
'<image '
)
if
len
(
text_segs
)
==
1
:
return
msgs
segs
=
[
dict
(
type
=
'text'
,
value
=
text_segs
[
0
])]
for
i
,
seg
in
enumerate
(
text_segs
):
if
i
==
0
:
continue
assert
istype
(
seg
[
0
],
int
)
and
seg
[
1
]
==
'>'
image_idx
=
int
(
seg
[
0
])
-
1
segs
.
append
(
dict
(
type
=
'image'
,
value
=
images
[
image_idx
]))
segs
.
append
(
dict
(
type
=
'text'
,
value
=
seg
[
2
:]))
return
segs
def
build_prompt
(
self
,
line
):
msgs
=
super
().
build_prompt
(
line
)
msgs
=
self
.
split_MMMU
(
msgs
)
return
msgs
class
MUIRDataset
(
ImageMCQDataset
):
DATASET_URL
=
{
'MUIRBench'
:
'http://opencompass.openxxlab.com/utils/VLMEval/MUIRBench.tsv'
}
DATASET_MD5
=
{
'MUIRBench'
:
'2e5e6fd7699761b08a7cb3ab8c0c2ec8'
}
@
staticmethod
def
split_MUIR
(
msgs
):
text
,
images
=
None
,
[]
# Separate images and text from msgs
for
s
in
msgs
:
if
s
[
'type'
]
==
'image'
:
images
.
append
(
s
[
'value'
])
elif
s
[
'type'
]
==
'text'
:
assert
text
is
None
# Ensure only one text entry is expected
text
=
s
[
'value'
]
# Split text by <image> tags
text_segs
=
text
.
split
(
'<image>'
)
# Initialize the segments list
segs
=
[]
# Iterate through the text segments and images
for
i
,
seg
in
enumerate
(
text_segs
):
# Append the image if this is not the first segment and there are still images left
if
i
>
0
and
i
-
1
<
len
(
images
):
segs
.
append
(
dict
(
type
=
'image'
,
value
=
images
[
i
-
1
]))
# Append the text segment (if it's non-empty)
if
len
(
seg
)
>
0
:
segs
.
append
(
dict
(
type
=
'text'
,
value
=
seg
))
return
segs
def
build_prompt
(
self
,
line
):
if
isinstance
(
line
,
int
):
line
=
self
.
data
.
iloc
[
line
]
if
self
.
meta_only
:
tgt_path
=
toliststr
(
line
[
'image_path'
])
else
:
tgt_path
=
self
.
dump_image
(
line
)
question
=
line
[
'question'
]
options
=
{
cand
:
line
[
cand
]
for
cand
in
string
.
ascii_uppercase
if
cand
in
line
and
not
pd
.
isna
(
line
[
cand
])
}
# options_prompt = ''
options_prompt
=
'
\n
'
.
join
([
f
'
{
key
}
.
{
item
}
'
for
key
,
item
in
options
.
items
()])
# for key, item in options.items():
# options_prompt += f'{key}. {item}\n'
prompt
=
''
prompt
+=
f
'
{
question
}
\n
'
if
len
(
options
):
prompt
+=
options_prompt
prompt
+=
"
\n
Answer with the option's letter from the given choices directly."
msgs
=
[]
if
isinstance
(
tgt_path
,
list
):
msgs
.
extend
([
dict
(
type
=
'image'
,
value
=
p
)
for
p
in
tgt_path
])
else
:
msgs
=
[
dict
(
type
=
'image'
,
value
=
tgt_path
)]
msgs
.
append
(
dict
(
type
=
'text'
,
value
=
prompt
))
msgs
=
self
.
split_MUIR
(
msgs
)
return
msgs
class
GMAIMMBenchDataset
(
ImageMCQDataset
):
DATASET_URL
=
{
'GMAI-MMBench_VAL'
:
'https://huggingface.co/datasets/VLMEval/GMAI-MMBench/resolve/main/GMAI-MMBench_VAL.tsv'
,
'GMAI_mm_bench_TEST_part_1'
:
'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_1.tsv'
,
# noqa: E501
'GMAI_mm_bench_TEST_part_2'
:
'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_2.tsv'
,
# noqa: E501
'GMAI_mm_bench_TEST_part_3'
:
'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_3.tsv'
,
# noqa: E501
'GMAI_mm_bench_TEST_part_4'
:
'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_4.tsv'
,
# noqa: E501
'GMAI_mm_bench_TEST_part_5'
:
'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_5.tsv'
,
# noqa: E501
'GMAI_mm_bench_TEST_part_6'
:
'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_6.tsv'
,
# noqa: E501
'GMAI_mm_bench_TEST_part_7'
:
'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_7.tsv'
,
# noqa: E501
'GMAI_mm_bench_TEST_part_8'
:
'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_8.tsv'
,
# noqa: E501
'GMAI_mm_bench_TEST_part_9'
:
'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_9.tsv'
,
# noqa: E501
'GMAI_mm_bench_TEST_part_10'
:
'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_10.tsv'
,
# noqa: E501
'GMAI_mm_bench_TEST_part_11'
:
'https://huggingface.co/datasets/OpenGVLab/GMAI-MMBench/resolve/main/GMAI_mm_bench_TEST_part_11.tsv'
,
# noqa: E501
}
DATASET_MD5
=
{
'GMAI-MMBench_VAL'
:
'254bd581627866f1c499d3d6b4422324'
,
'GMAI_mm_bench_TEST_part_1'
:
'900d735231230a63f4ed45665c078ef4'
,
'GMAI_mm_bench_TEST_part_2'
:
'1b27ab621386945d7e4a765ad2d22b0e'
,
'GMAI_mm_bench_TEST_part_3'
:
'44bdc2b6267dd505d529b8cad06f0fb2'
,
'GMAI_mm_bench_TEST_part_4'
:
'5a04a04fcac9f1466709f242fdb80acb'
,
'GMAI_mm_bench_TEST_part_5'
:
'c70baf8909eda9af0ddeab275c721336'
,
'GMAI_mm_bench_TEST_part_6'
:
'825abc39596b644dead9350d0cfa3b96'
,
'GMAI_mm_bench_TEST_part_7'
:
'defb8aed2fb77365a76b6b9abd6a2701'
,
'GMAI_mm_bench_TEST_part_8'
:
'ff490d60b85f2bb0abb67a435b298c65'
,
'GMAI_mm_bench_TEST_part_9'
:
'ff67c86f40da93b09139ac1d1ba5dc6b'
,
'GMAI_mm_bench_TEST_part_10'
:
'3dae94627b9ac0fe00180d4780fbf6dc'
,
'GMAI_mm_bench_TEST_part_11'
:
'd08dc813f0eb6bbab63cae2a9d113c4b'
,
}
@
classmethod
def
supported_datasets
(
cls
):
return
[
'GMAI-MMBench_VAL'
,
'GMAI-MMBench_TEST'
]
def
load_data
(
self
,
dataset
):
if
dataset
==
'GMAI-MMBench_VAL'
:
data_path
=
osp
.
join
(
LMUDataRoot
(),
f
'
{
dataset
}
.tsv'
)
if
file_size
(
data_path
,
'GB'
)
>
1
:
local_path
=
data_path
.
replace
(
'.tsv'
,
'_local.tsv'
)
if
not
osp
.
exists
(
local_path
)
or
os
.
environ
.
get
(
'FORCE_LOCAL'
):
from
..tools
import
LOCALIZE
LOCALIZE
(
data_path
,
local_path
)
data_path
=
local_path
return
load
(
data_path
)
elif
dataset
==
'GMAI-MMBench_TEST'
:
dfs
=
[]
for
part_num
in
range
(
1
,
12
):
part_name
=
f
'GMAI_mm_bench_TEST_part_
{
part_num
}
'
url
=
self
.
DATASET_URL
[
part_name
]
file_md5
=
self
.
DATASET_MD5
.
get
(
part_name
)
tsv_path
=
osp
.
join
(
LMUDataRoot
(),
f
'
{
part_name
}
.tsv'
)
if
not
osp
.
exists
(
tsv_path
)
or
(
file_md5
and
md5
(
tsv_path
)
!=
file_md5
):
download_file
(
url
,
filename
=
tsv_path
)
local_path
=
tsv_path
.
replace
(
'.tsv'
,
'_local.tsv'
)
if
not
osp
.
exists
(
local_path
)
or
os
.
environ
.
get
(
'FORCE_LOCAL'
):
from
..tools
import
LOCALIZE
LOCALIZE
(
tsv_path
,
local_path
)
tsv_path
=
local_path
# 加载数据
df
=
load
(
tsv_path
)
dfs
.
append
(
df
)
# 合并所有数据
data
=
pd
.
concat
(
dfs
,
ignore_index
=
True
)
return
data
else
:
raise
ValueError
(
f
"未知的数据集:
{
dataset
}
"
)
def
report_acc_by_groups
(
self
,
df
,
group_column
):
res
=
defaultdict
(
list
)
# Check for the 'split' column
if
'split'
in
df
:
splits
=
list
(
set
(
df
[
'split'
]))
res
[
'split'
]
=
splits
else
:
df
[
'split'
]
=
[
'none'
]
*
len
(
df
)
res
[
'split'
]
=
[
'none'
]
res
[
'Overall'
]
=
[
np
.
mean
(
df
[
df
[
'split'
]
==
sp
][
'hit'
])
for
sp
in
res
[
'split'
]]
if
group_column
not
in
df
:
raise
ValueError
(
f
"Column '
{
group_column
}
' not found in dataframe."
)
# noqa: E713
abilities
=
list
(
set
(
df
[
group_column
]))
abilities
=
[
'None'
if
isinstance
(
ab
,
float
)
and
pd
.
isna
(
ab
)
else
ab
for
ab
in
abilities
]
abilities
.
sort
()
for
ab
in
abilities
:
ab_name
=
ab
sub_df
=
df
[
df
[
group_column
]
==
ab
]
res
[
ab_name
]
=
[
np
.
mean
(
sub_df
[
sub_df
[
'split'
]
==
sp
][
'hit'
])
for
sp
in
res
[
'split'
]]
return
pd
.
DataFrame
(
res
)
def
evaluate
(
self
,
eval_file
,
**
judge_kwargs
):
from
.utils.multiple_choice
import
report_acc
,
mcq_vanilla_eval
nproc
=
judge_kwargs
.
pop
(
'nproc'
,
4
)
suffix
=
eval_file
.
split
(
'.'
)[
-
1
]
model
=
judge_kwargs
.
get
(
'model'
,
'exact_matching'
)
assert
model
in
[
'chatgpt-0125'
,
'exact_matching'
,
'gpt-4-0125'
]
name_str_map
=
{
'chatgpt-0125'
:
'openai'
,
'gpt-4-0125'
:
'gpt4'
}
name_str
=
name_str_map
[
model
]
if
model
in
name_str_map
else
model
if
model
==
'exact_matching'
:
model
=
None
elif
gpt_key_set
():
model
=
build_judge
(
**
judge_kwargs
)
if
not
model
.
working
():
warnings
.
warn
(
'OPENAI API is not working properly, will use exact matching for evaluation'
)
warnings
.
warn
(
DEBUG_MESSAGE
)
model
=
None
else
:
warnings
.
warn
(
'OPENAI_API_KEY is not set properly, will use exact matching for evaluation'
)
model
=
None
result_file
=
eval_file
.
replace
(
f
'.
{
suffix
}
'
,
f
'_
{
name_str
}
_result.pkl'
)
data
=
load
(
eval_file
)
data
=
data
.
sort_values
(
by
=
'index'
)
data
[
'prediction'
]
=
[
str
(
x
)
for
x
in
data
[
'prediction'
]]
# If not choice label, then use lower case
for
k
in
data
.
keys
():
data
[
k
.
lower
()
if
k
not
in
list
(
string
.
ascii_uppercase
)
else
k
]
=
data
.
pop
(
k
)
meta
=
self
.
data
meta_q_map
=
{
x
:
y
for
x
,
y
in
zip
(
meta
[
'index'
],
meta
[
'question'
])}
data_map
=
{
x
:
y
for
x
,
y
in
zip
(
data
[
'index'
],
data
[
'question'
])}
for
k
in
data_map
:
assert
k
in
meta_q_map
,
(
f
'eval_file should be the same as or a subset of dataset
{
self
.
dataset_name
}
'
)
data
=
mcq_vanilla_eval
(
model
,
data
,
meta
,
nproc
,
result_file
,
self
.
dataset_name
)
# load split
dump
(
data
,
eval_file
.
replace
(
f
'.
{
suffix
}
'
,
f
'_
{
name_str
}
_result.
{
suffix
}
'
))
data
=
load
(
eval_file
.
replace
(
f
'.
{
suffix
}
'
,
f
'_
{
name_str
}
_result.
{
suffix
}
'
))
acc
=
report_acc
(
data
)
for
group_col
in
[
'clinical vqa task'
,
'department'
,
'perceptual granularity'
]:
acc_grouped
=
self
.
report_acc_by_groups
(
data
,
group_col
)
score_file_grouped
=
eval_file
.
replace
(
f
'.
{
suffix
}
'
,
f
'_
{
group_col
}
_acc.csv'
)
dump
(
acc_grouped
,
score_file_grouped
)
return
acc
class
MMERealWorld
(
ImageMCQDataset
):
TYPE
=
'MMERealWorld'
DATASET_MD5
=
{
'MME-RealWorld'
:
'271c33ec814c39533c467ec6fb8a6f36'
,
'MME-RealWorld-Lite'
:
'4c17057d7d3b6c4a0d4397c3dae0881c'
,
'MME-RealWorld-CN'
:
'daaa763d52a760a38606d5dedb3fe444'
,
}
SYS
=
{
'MME-RealWorld'
:
(
'Select the best answer to the above multiple-choice question based on the image. '
'Respond with only the letter (A, B, C, D, or E) of the correct option.
\n
'
'The best answer is:'
),
'MME-RealWorld-Lite'
:
(
'Select the best answer to the above multiple-choice question based on the image. '
'Respond with only the letter (A, B, C, D, or E) of the correct option.
\n
'
'The best answer is:'
),
'MME-RealWorld-CN'
:
(
'根据图像选择上述多项选择题的最佳答案。只需回答正确选项的字母(A, B, C, D 或 E)。
\n
'
'最佳答案为:'
),
}
@
classmethod
def
supported_datasets
(
cls
):
return
[
'MME-RealWorld'
,
'MME-RealWorld-CN'
,
'MME-RealWorld-Lite'
,]
def
load_data
(
self
,
dataset
=
"MME-RealWorld"
,
repo_id
=
"yifanzhang114/MME-RealWorld-Base64"
):
def
check_integrity
(
pth
):
data_file
=
osp
.
join
(
pth
,
f
"
{
dataset
}
.tsv"
)
if
not
os
.
path
.
exists
(
data_file
):
return
False
if
md5
(
data_file
)
!=
self
.
DATASET_MD5
[
dataset
]:
return
False
return
True
def
generate_tsv
(
pth
):
tsv_file
=
os
.
path
.
join
(
pth
,
f
"
{
dataset
}
.tsv"
)
if
os
.
path
.
exists
(
tsv_file
):
print
(
f
"
{
tsv_file
}
already exists."
)
return
json_dir
=
os
.
path
.
join
(
pth
,
dataset
)
json_files
=
[
f
for
f
in
os
.
listdir
(
json_dir
)
if
f
.
endswith
(
".json"
)]
data_list
=
[]
for
json_file
in
json_files
:
with
open
(
os
.
path
.
join
(
json_dir
,
json_file
),
"r"
)
as
f
:
data
=
json
.
load
(
f
)
for
item
in
tqdm
(
data
):
choice_prompt
=
(
"The choices are listed below:
\n
"
if
dataset
in
[
"MME-RealWorld"
,
"MME-RealWorld-Lite"
]
else
"选项如下所示:
\n
"
)
data_list
.
append
(
{
"index"
:
item
[
"index"
],
"image"
:
item
[
"image"
],
"question"
:
item
[
"question"
],
"multi-choice options"
:
choice_prompt
+
"
\n
"
.
join
(
item
[
"multi-choice options"
]),
"A"
:
item
[
"multi-choice options"
][
0
][
4
:],
"B"
:
item
[
"multi-choice options"
][
1
][
4
:],
"C"
:
item
[
"multi-choice options"
][
2
][
4
:],
"D"
:
item
[
"multi-choice options"
][
3
][
4
:],
"E"
:
item
[
"multi-choice options"
][
4
][
4
:],
"answer"
:
item
[
"answer"
],
"category"
:
item
[
"category"
],
"l2-category"
:
item
[
"l2-category"
],
}
)
df
=
pd
.
DataFrame
(
data_list
)
df
.
to_csv
(
tsv_file
,
sep
=
"
\t
"
,
index
=
False
)
print
(
f
"TSV file saved to
{
tsv_file
}
"
)
# Check if dataset is cached and has integrity
if
dataset
==
"MME-RealWorld-Lite"
:
url
=
'https://huggingface.co/datasets/yifanzhang114/MME-RealWorld-Base64/resolve/main/mme_realworld_lite.tsv'
# noqa: E501
file_md5
=
(
self
.
DATASET_MD5
[
dataset
]
if
dataset
in
self
.
DATASET_MD5
else
None
)
datas
=
self
.
prepare_tsv
(
url
,
file_md5
)
choice_prompt
=
"The choices are listed below:
\n
"
for
index
,
item
in
datas
.
iterrows
():
options
=
eval
(
item
[
"multi-choice options"
])
datas
.
loc
[
index
,
"multi-choice options"
]
=
choice_prompt
+
"
\n
"
.
join
(
options
)
datas
.
loc
[
index
,
"A"
]
=
options
[
0
][
4
:]
datas
.
loc
[
index
,
"B"
]
=
options
[
1
][
4
:]
datas
.
loc
[
index
,
"C"
]
=
options
[
2
][
4
:]
datas
.
loc
[
index
,
"D"
]
=
options
[
3
][
4
:]
datas
.
loc
[
index
,
"E"
]
=
options
[
4
][
4
:]
return
datas
update_flag
=
False
cache_path
=
get_cache_path
(
repo_id
)
if
cache_path
is
not
None
and
check_integrity
(
cache_path
):
dataset_path
=
cache_path
print
(
f
"Using cached dataset from
{
cache_path
}
"
)
else
:
from
huggingface_hub
import
snapshot_download
# Download or find the dataset path
dataset_path
=
snapshot_download
(
repo_id
=
repo_id
,
repo_type
=
"dataset"
)
generate_tsv
(
dataset_path
)
update_flag
=
True
data_path
=
os
.
path
.
join
(
dataset_path
,
f
"
{
dataset
}
.tsv"
)
if
file_size
(
data_path
,
"GB"
)
>
1
:
local_path
=
data_path
.
replace
(
".tsv"
,
"_local.tsv"
)
if
(
not
osp
.
exists
(
local_path
)
or
os
.
environ
.
get
(
"FORCE_LOCAL"
,
None
)
or
update_flag
):
from
vlmeval.tools
import
LOCALIZE
LOCALIZE
(
data_path
,
local_path
)
data_path
=
local_path
return
load
(
data_path
)
def
post_build
(
self
,
dataset
):
self
.
TYPE
=
'MMERealWorld'
# Given one data record, return the built prompt (a multi-modal message), can override
def
build_prompt
(
self
,
line
):
if
isinstance
(
line
,
int
):
line
=
self
.
data
.
iloc
[
line
]
if
self
.
meta_only
:
tgt_path
=
toliststr
(
line
[
'image_path'
])
else
:
tgt_path
=
self
.
dump_image
(
line
)
question
=
line
[
'question'
]
choice_prompt
=
line
[
'multi-choice options'
]
+
'
\n
'
question
+=
' '
+
choice_prompt
+
self
.
SYS
[
self
.
dataset_name
]
msgs
=
[]
if
isinstance
(
tgt_path
,
list
):
msgs
.
extend
([
dict
(
type
=
'image'
,
value
=
p
)
for
p
in
tgt_path
])
else
:
msgs
=
[
dict
(
type
=
'image'
,
value
=
tgt_path
)]
msgs
.
append
(
dict
(
type
=
'text'
,
value
=
question
))
return
msgs
    # It returns a dictionary
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        from .utils.multiple_choice import extract_characters_regex, get_dimension_rating
        assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
        FAIL_MSG = 'Failed to obtain answer via API.'
        tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
        tgt_file = eval_file.replace('.xlsx', '_rating.json')
        score_file = eval_file.replace('.xlsx', '_score.xlsx')

        if not osp.exists(score_file):
            res = {} if not osp.exists(tmp_file) else load(tmp_file)
            res = {k: v for k, v in res.items() if FAIL_MSG not in v}

            data = load(eval_file)
            cnt_rejected = 0
            data_un = data[~pd.isna(data['prediction'])]

            for idx in data['index']:
                ans = data.loc[data['index'] == idx, 'answer'].values[0]
                pred = data.loc[data['index'] == idx, 'prediction'].values[0]
                extract_pred = extract_characters_regex(pred)
                if extract_pred == '':
                    cnt_rejected += 1
                    data.loc[data['index'] == idx, 'score'] = 0
                else:
                    data.loc[data['index'] == idx, 'score'] = int(extract_pred == ans)

            print(
                f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(data_un)} questions, '
                f'failed to obtain the score for another {cnt_rejected} questions. '
                f'Those questions will be counted as 0 score in ALL rating.'
            )

            dump(data, score_file)

        rating = get_dimension_rating(score_file)
        dump(rating, tgt_file)
        return rating
class HRBenchDataset(ImageMCQDataset):

    DATASET_URL = {
        'HRBench4K': 'https://huggingface.co/datasets/DreamMr/HR-Bench/resolve/main/hr_bench_4k.tsv',
        'HRBench8K': 'https://huggingface.co/datasets/DreamMr/HR-Bench/resolve/main/hr_bench_8k.tsv',
    }
    DATASET_MD5 = {
        'HRBench4K': 'f6b041b03d49543494b8a56d2e35be65',
        'HRBench8K': '274c9c7f89329b804a4723178a00219c',
    }

    def evaluate(self, eval_file, **judge_kwargs):
        assert os.path.exists(eval_file), '{} does not exist!'.format(eval_file)
        from .utils.multiple_choice import mcq_vanilla_eval
        from .utils.hrbench import report_acc_hrbench
        nproc = judge_kwargs.pop('nproc', 4)

        suffix = eval_file.split('.')[-1]
        model = judge_kwargs.get('model', 'exact_matching')
        assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']
        name_str_map = {'chatgpt-0125': 'openai', 'gpt-4-0125': 'gpt4'}
        name_str = name_str_map[model] if model in name_str_map else model

        if model == 'exact_matching':
            model = None
        elif gpt_key_set():
            model = build_judge(**judge_kwargs)
            if not model.working():
                warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
                warnings.warn(DEBUG_MESSAGE)
                model = None
        else:
            warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
            model = None

        result_file = eval_file.replace(f'.{suffix}', f'_{name_str}_result.pkl')

        data = load(eval_file)
        data = data.sort_values(by='index')
        data['prediction'] = [str(x) for x in data['prediction']]
        # If not choice label, then use lower case
        for k in data.keys():
            data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k)

        meta = self.data
        meta_q_map = {x: y for x, y in zip(meta['index'], meta['question'])}
        data_map = {x: y for x, y in zip(data['index'], data['question'])}
        for k in data_map:
            assert k in meta_q_map, (
                f'eval_file should be the same as or a subset of dataset {self.dataset_name}'
            )

        score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
        if osp.exists(score_file):
            acc = load(score_file)
            return acc
        data = mcq_vanilla_eval(model, data, meta, nproc, result_file, self.dataset_name)
        dump(data, eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))
        data = load(eval_file.replace(f'.{suffix}', f'_{name_str}_result.{suffix}'))

        acc = report_acc_hrbench(data)
        score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
        dump(acc, score_file)
        return acc
class CustomMCQDataset(ImageMCQDataset):

    def load_data(self, dataset):
        data_path = osp.join(LMUDataRoot(), f'{dataset}.tsv')

        if file_size(data_path, 'GB') > 1:
            local_path = data_path.replace('.tsv', '_local.tsv')
            if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL', None):
                from ..tools import LOCALIZE
                LOCALIZE(data_path, local_path)
            data_path = local_path
        return load(data_path)
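    # Minimal usage sketch (added comment, not part of the original file): a custom MCQ
    # benchmark only needs a <dataset>.tsv placed under LMUDataRoot(); the dataset name
    # below is a hypothetical example.
    #
    #   ds = CustomMCQDataset('MyMCQBench')   # loads $LMUData/MyMCQBench.tsv
    #   prompt = ds.build_prompt(0)           # multi-modal message for the first record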
class NaturalBenchDataset(ImageMCQDataset):

    DATASET_URL = {
        'NaturalBenchDataset': (
            'https://huggingface.co/datasets/BaiqiL/'
            'NaturalBench/resolve/main/NaturalBenchDataset.tsv'
        ),
    }
    DATASET_MD5 = {
        'NaturalBenchDataset': 'dbe25b044bc35696426381e9ba4fe930',
    }

    def build_prompt(self, line):
        SUFFIX_FOR_VQA = {
            "yes_no": "Please answer Yes or No.",
            "multiple_choice": "Please output the letter corresponding to the correct option."
        }
        if isinstance(line, int):
            line = self.data.iloc[line]

        if self.meta_only:
            tgt_path = toliststr(line['image_path'])
        else:
            tgt_path = self.dump_image(line)

        question = line['question']
        prompt = f'{question} {SUFFIX_FOR_VQA[line["type"]]}'
        msgs = []
        if isinstance(tgt_path, list):
            msgs.extend([dict(type='image', value=p) for p in tgt_path])
        else:
            msgs = [dict(type='image', value=tgt_path)]
        msgs.append(dict(type='text', value=prompt))

        return msgs

    def evaluate(self, eval_file, **judge_kwargs):
        from .utils.naturalbench import extract_answer, get_scores

        data = load(eval_file)
        data = data.sort_values(by='index')
        predictions = [str(x) for x in data['prediction']]
        answers = [str(x) for x in data['answer']]
        indexs = [str(x) for x in data['index']]
        meta = self.data
        types = [str(x) for x in meta['type']]

        results = {}
        assert len(predictions) == len(answers) == len(indexs) == len(types) == (1900 * 4)
        number_answered_samples = len(predictions) // 4
        for i in range(number_answered_samples):
            results[i] = {
                "q0_i0": extract_answer(predictions[i * 4], types[i * 4]),
                "q0_i1": extract_answer(predictions[i * 4 + 1], types[i * 4 + 1]),
                "q1_i0": extract_answer(predictions[i * 4 + 2], types[i * 4 + 2]),
                "q1_i1": extract_answer(predictions[i * 4 + 3], types[i * 4 + 3])
            }

        scores = get_scores(results)
        print(scores)
        score_file = 'NaturalBench_acc.csv'
        df = pd.DataFrame(list(scores.items()), columns=['Metric', 'Score'])
        dump(df, score_file)

        return scores
VLMEvalKit/vlmeval/dataset/image_mt.py 0 → 100644
from .image_base import ImageBaseDataset
from .utils.judge_util import build_judge
from ..smp import *
from ..utils import track_progress_rich


class ImageMTDataset(ImageBaseDataset):

    TYPE = 'MT'

    def build_prompt(self, line):
        if isinstance(line, int):
            line = self.data.iloc[line]
        if self.meta_only:
            tgt_path = toliststr(line['image_path'])
        else:
            tgt_path = self.dump_image(line)

        questions = toliststr(line['question'])
        if 'answer' in line:
            answers = toliststr(line['answer'])
        else:
            answers = [''] * len(questions)
        assert len(questions) == len(answers)

        dlgs, pics_number = [], 0
        for i in range(len(questions)):
            q, a = questions[i], answers[i]
            if '<ImageHere>' in q:
                content = []
                tag_number = q.count('<ImageHere>')
                images = tgt_path[pics_number: pics_number + tag_number]
                pics_number += tag_number
                q_split = q.split('<ImageHere>')
                for i in range(tag_number):
                    qsp, im = q_split[i], images[i]
                    if qsp != '':
                        content.append(dict(type='text', value=qsp))
                    content.append(dict(type='image', value=im))
                if q_split[-1] != '':
                    content.append(dict(type='text', value=q_split[-1]))
            else:
                content = [dict(type='text', value=q)]
            dlgs.append(dict(role='user', content=content))
            assert '<ImageHere>' not in a, 'We currently do not support images in the answer. '
            content = [dict(type='text', value=a)]
            dlgs.append(dict(role='assistant', content=content))
        return dlgs
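    # Illustrative sketch (added comment, not part of the original file): for a two-turn record
    # whose first question contains one '<ImageHere>' tag, build_prompt returns alternating
    # user/assistant entries roughly like (all values below are hypothetical):
    #
    #   [
    #       {'role': 'user', 'content': [{'type': 'text', 'value': 'Describe '},
    #                                    {'type': 'image', 'value': 'img_0.jpg'}]},
    #       {'role': 'assistant', 'content': [{'type': 'text', 'value': 'A cat on a sofa.'}]},
    #       {'role': 'user', 'content': [{'type': 'text', 'value': 'What color is it?'}]},
    #       {'role': 'assistant', 'content': [{'type': 'text', 'value': ''}]},
    #   ]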
class MMDUDataset(ImageMTDataset):

    DATASET_URL = {'MMDU': 'https://opencompass.openxlab.space/utils/VLMEval/MMDU.tsv'}
    DATASET_MD5 = {'MMDU': '848b635a88a078f49aebcc6e39792061'}
    DIMS = [
        'Creativity', 'Richness', 'Visual Perception', 'Logical Coherence',
        'Answer Accuracy', 'Image Relationship Understanding', 'Overall Score'
    ]

    def calculat_metric(self, ans):
        all = defaultdict(lambda: 0)
        tot = defaultdict(lambda: 0)
        valid = defaultdict(lambda: 0)

        for k in ans:
            res = ans[k]['res']
            assert isinstance(res, pd.DataFrame)
            lt = len(res)
            for i in range(lt):
                line = res.iloc[i]
                for k in self.DIMS:
                    tot[k] += 1
                    if k in line and line[k] is not None:
                        try:
                            score = int(line[k])
                            score = np.clip(score, 0, 10)
                            all[k] += score
                            valid[k] += 1
                        except Exception as e:
                            print(f'Failed to parse the score: {str(e)}')

        sp1 = {'set': 'all'}
        sp1.update({k: all[k] / tot[k] * 10 for k in self.DIMS})
        sp2 = {'set': 'valid'}
        sp2.update({k: all[k] / valid[k] * 10 for k in self.DIMS})

        return pd.DataFrame([sp1, sp2])

    def evaluate(self, eval_file, **judge_kwargs):
        suffix = eval_file.split('.')[-1]
        model = judge_kwargs['model']
        tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
        score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.csv')
        nproc = judge_kwargs.pop('nproc', 4)

        data = load(eval_file)
        model = judge_kwargs.pop('model', 'gpt-4o')
        judge_model = build_judge(model=model, **judge_kwargs)

        lt = len(data)
        lines = [data.iloc[i] for i in range(lt)]
        tups = [(judge_model, line) for line in lines]
        indices = [line['index'] for line in lines]

        ans = {}
        if osp.exists(tmp_file):
            ans = load(tmp_file)

        tups = [x for x, i in zip(tups, indices) if i not in ans]
        indices = [i for i in indices if i not in ans]

        from .utils.mmdu import mmdu_score

        if len(indices):
            new_results = track_progress_rich(
                mmdu_score,
                tups,
                nproc=nproc,
                chunksize=nproc,
                keys=indices,
                save=tmp_file,)
            ans = load(tmp_file)
            for k, v in zip(indices, new_results):
                assert k in ans

        metric = self.calculat_metric(ans)
        dump(metric, score_file)
        return metric
VLMEvalKit/vlmeval/dataset/image_vqa.py 0 → 100644
import os
import re
import tempfile
from functools import partial

import pandas as pd

from .image_base import ImageBaseDataset
from .utils import build_judge, DEBUG_MESSAGE
from ..smp import *
from ..utils import track_progress_rich


class ImageVQADataset(ImageBaseDataset):
    TYPE = 'VQA'

    DATASET_URL = {
        'OCRVQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/OCRVQA_TEST.tsv',
        'OCRVQA_TESTCORE': 'https://opencompass.openxlab.space/utils/VLMEval/OCRVQA_TESTCORE.tsv',
        'TextVQA_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/TextVQA_VAL.tsv',
        'DocVQA_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/DocVQA_VAL.tsv',
        'DocVQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/DocVQA_TEST.tsv',
        'InfoVQA_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/InfoVQA_VAL.tsv',
        'InfoVQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/InfoVQA_TEST.tsv',
        'ChartQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/ChartQA_TEST.tsv',
        'GQA_TestDev_Balanced': 'https://opencompass.openxlab.space/utils/VLMEval/GQA_TestDev_Balanced.tsv',
    }

    DATASET_MD5 = {
        'OCRVQA_TEST': 'ca46a6d74b403e9d6c0b670f6fc00db9',
        'OCRVQA_TESTCORE': 'c5239fe77db8bdc1f2ad8e55e0d1fe97',
        'TextVQA_VAL': 'b233b31f551bbf4056f2f955da3a92cd',
        'DocVQA_VAL': 'd5ee77e1926ff10690d469c56b73eabf',
        'DocVQA_TEST': '6a2f28cac26ef2d3447374e8c6f6c8e9',
        'InfoVQA_VAL': '2342e9c225222f0ef4dec545ebb126fe',
        'InfoVQA_TEST': 'df535bf51b88dc9718252c34131a6227',
        'ChartQA_TEST': 'c902e0aa9be5582a7aad6dcf52734b42',
        'GQA_TestDev_Balanced': 'fead7df22befc1ed3ca2b62ea26fa17b',
    }

    def build_prompt(self, line):
        msgs = super().build_prompt(line)
        assert msgs[-1]['type'] == 'text'
        msgs[-1]['value'] += '\nAnswer the question using a single word or phrase.'
        return msgs

    # It returns a DataFrame
    def evaluate(self, eval_file, **judge_kwargs):
        from .utils.vqa_eval import hit_calculate, process_line

        data = load(eval_file)
        dataset = self.dataset_name
        assert 'answer' in data and 'prediction' in data
        data['prediction'] = [str(x) for x in data['prediction']]
        data['answer'] = [str(x) for x in data['answer']]
        lt = len(data)
        pool = mp.Pool(16)
        lines = [data.iloc[i] for i in range(lt)]
        if listinstr(['TextVQA'], dataset):
            res = pool.map(partial(process_line, method='vqa_score'), lines)
        elif listinstr(['ChartQA'], dataset):
            res = pool.map(partial(process_line, method='relaxed_accuracy'), lines)
        elif listinstr(['OCRVQA', 'GQA'], dataset):
            res = pool.map(partial(process_line, method='accuracy'), lines)
        elif listinstr(['DocVQA', 'InfoVQA'], dataset):
            res = pool.map(partial(process_line, method='anls'), lines)
        else:
            # default using vqa_score to calculate score
            res = pool.map(process_line, lines)
        hit = hit_calculate(res, dataset)
        ret = dict()
        if 'split' in data:
            splits = set(data['split'])
            for sp in splits:
                sub = [r for l, r in zip(lines, res) if l['split'] == sp]
                # [np.mean(x['match']) >= full_score_weight for x in sub]
                hit = hit_calculate(sub, dataset)
                ret[sp] = np.mean(hit) * 100
            sub = [r for l, r in zip(lines, res)]
            hit = hit_calculate(sub, dataset)
            ret['Overall'] = np.mean(hit) * 100
        else:
            ret['Overall'] = np.mean(hit) * 100
            if 'category' in data:
                cates = list(set(data['category']))
                cates.sort()
                for c in cates:
                    sub = [r for l, r in zip(lines, res) if l['category'] == c]
                    # [np.mean(x['match']) >= full_score_weight for x in sub]
                    hit = hit_calculate(sub, dataset)
                    ret[c] = np.mean(hit) * 100
        ret = d2df(ret)
        ret.round(2)

        suffix = eval_file.split('.')[-1]
        result_file = eval_file.replace(f'.{suffix}', '_acc.csv')
        dump(ret, result_file)
        return ret
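    # Summary sketch (added comment, not part of the original file): evaluate() above dispatches
    # to the matching metric in .utils.vqa_eval based on the dataset name, roughly:
    #
    #   TextVQA*            -> process_line(method='vqa_score')
    #   ChartQA*            -> process_line(method='relaxed_accuracy')
    #   OCRVQA*, GQA*       -> process_line(method='accuracy')
    #   DocVQA*, InfoVQA*   -> process_line(method='anls')
    #   anything else       -> process_line default (vqa_score)
    #
    # Per-split / per-category means are then scaled to [0, 100] and dumped to *_acc.csv.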
class VizWiz(ImageBaseDataset):
    TYPE = 'VQA'
    DATASET_URL = {'VizWiz': 'https://opencompass.openxlab.space/utils/VLMEval/VizWiz.tsv'}
    DATASET_MD5 = {'VizWiz': 'fa4ac4164467563ed2fac6eac6631bd0'}

    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        from .utils.vqa_eval import hit_calculate, process_line

        suffix = eval_file.split('.')[-1]
        result_file = eval_file.replace(f'.{suffix}', '_acc.csv')

        if not osp.exists(result_file):
            data = load(eval_file)
            assert 'answers' in data and 'prediction' in data
            data['prediction'] = [str(x) for x in data['prediction']]
            data['answer'] = [str(x) for x in data['answers']]

            lt = len(data)
            pool = mp.Pool(16)
            lines = [data.iloc[i] for i in range(lt)]
            res = pool.map(process_line, lines)

            hit = hit_calculate(res, 'VizWiz')
            ret = dict()

            ret['Overall'] = np.mean(hit) * 100
            ret = d2df(ret)
            ret.round(2)

            dump(ret, result_file)

        retz = pd.read_csv(result_file)
        return retz
class OCRBench(ImageBaseDataset):
    TYPE = 'VQA'
    DATASET_URL = {'OCRBench': 'https://opencompass.openxlab.space/utils/VLMEval/OCRBench.tsv'}
    DATASET_MD5 = {'OCRBench': 'e953d98a987cc6e26ef717b61260b778'}

    # It returns a dictionary
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        OCRBench_score = {
            'Regular Text Recognition': 0,
            'Irregular Text Recognition': 0,
            'Artistic Text Recognition': 0,
            'Handwriting Recognition': 0,
            'Digit String Recognition': 0,
            'Non-Semantic Text Recognition': 0,
            'Scene Text-centric VQA': 0,
            'Doc-oriented VQA': 0,
            'Key Information Extraction': 0,
            'Handwritten Mathematical Expression Recognition': 0,
        }

        data = load(eval_file)
        lt = len(data)
        lines = [data.iloc[i] for i in range(lt)]
        for i in tqdm(range(len(lines))):
            line = lines[i]
            predict = str(line['prediction'])
            answers = eval(line['answer'])
            category = line['category']
            if category == 'Handwritten Mathematical Expression Recognition':
                for j in range(len(answers)):
                    answer = answers[j].strip().replace('\n', ' ').replace(' ', '')
                    predict = predict.strip().replace('\n', ' ').replace(' ', '')
                    if answer in predict:
                        OCRBench_score[category] += 1
                        break
            else:
                for j in range(len(answers)):
                    answer = answers[j].lower().strip().replace('\n', ' ')
                    predict = predict.lower().strip().replace('\n', ' ')
                    if answer in predict:
                        OCRBench_score[category] += 1
                        break

        final_score_dict = {}
        final_score_dict['Text Recognition'] = \
            (OCRBench_score['Regular Text Recognition'] + OCRBench_score['Irregular Text Recognition']
             + OCRBench_score['Artistic Text Recognition'] + OCRBench_score['Handwriting Recognition']
             + OCRBench_score['Digit String Recognition'] + OCRBench_score['Non-Semantic Text Recognition'])
        final_score_dict['Scene Text-centric VQA'] = OCRBench_score['Scene Text-centric VQA']
        final_score_dict['Doc-oriented VQA'] = OCRBench_score['Doc-oriented VQA']
        final_score_dict['Key Information Extraction'] = OCRBench_score['Key Information Extraction']
        final_score_dict['Handwritten Mathematical Expression Recognition'] = \
            (OCRBench_score['Handwritten Mathematical Expression Recognition'])
        final_score_dict['Final Score'] = \
            (final_score_dict['Text Recognition'] + final_score_dict['Scene Text-centric VQA']
             + final_score_dict['Doc-oriented VQA'] + final_score_dict['Key Information Extraction']
             + final_score_dict['Handwritten Mathematical Expression Recognition'])
        final_score_dict['Final Score Norm'] = (float(final_score_dict['Final Score']) / 10)
        score_pth = eval_file.replace('.xlsx', '_score.json')
        dump(final_score_dict, score_pth)
        return final_score_dict
class MathVista(ImageBaseDataset):
    TYPE = 'VQA'
    DATASET_URL = {'MathVista_MINI': 'https://opencompass.openxlab.space/utils/VLMEval/MathVista_MINI.tsv'}
    DATASET_MD5 = {'MathVista_MINI': 'f199b98e178e5a2a20e7048f5dcb0464'}

    # It returns a DataFrame
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        from .utils.mathvista import MathVista_auxeval, MathVista_acc

        model = judge_kwargs['model']
        suffix = eval_file.split('.')[-1]
        storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
        tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
        nproc = judge_kwargs.pop('nproc', 4)

        if not osp.exists(storage):
            data = load(eval_file)
            model = build_judge(max_tokens=128, **judge_kwargs)
            assert model.working(), ('MathVista evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
            lt = len(data)
            lines = [data.iloc[i] for i in range(lt)]
            tups = [(model, line) for line in lines]
            indices = [line['index'] for line in lines]

            ans = {}
            if osp.exists(tmp_file):
                ans = load(tmp_file)
            tups = [x for x, i in zip(tups, indices) if i not in ans]
            indices = [i for i in indices if i not in ans]

            if len(indices):
                new_results = track_progress_rich(
                    MathVista_auxeval,
                    tups,
                    nproc=nproc,
                    chunksize=nproc,
                    keys=indices,
                    save=tmp_file,
                )
                ans = load(tmp_file)
                for k, v in zip(indices, new_results):
                    assert k in ans
                    assert ans[k]['log'] == v['log'] and ans[k]['res'] == v['res']

            data['res'] = [ans[idx]['res'] for idx in data['index']]
            data['log'] = [ans[idx]['log'] for idx in data['index']]
            dump(data, storage)

        score = MathVista_acc(storage)
        score_pth = storage.replace('.xlsx', '_score.csv')
        dump(score, score_pth)
        return score
class MathVerse(ImageBaseDataset):
    TYPE = 'VQA'
    DATASET_URL = {
        'MathVerse_MINI': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINIV.tsv',  # noqa
        'MathVerse_MINI_Vision_Only': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINIVOnly.tsv',  # noqa
        'MathVerse_MINI_Vision_Dominant': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINIVDom.tsv',  # noqa
        'MathVerse_MINI_Vision_Intensive': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINIVInt.tsv',  # noqa
        'MathVerse_MINI_Text_Lite': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINITLite.tsv',  # noqa
        'MathVerse_MINI_Text_Dominant': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINITDom.tsv',  # noqa
    }
    DATASET_MD5 = {
        'MathVerse_MINI': '5017caca32b7fa110c350a1bea861b65',
        'MathVerse_MINI_Vision_Only': '68a11d4680014ac881fa37adeadea3a4',
        'MathVerse_MINI_Vision_Dominant': 'b8fb63852d261ab2aaefba29cc2414d3',
        'MathVerse_MINI_Vision_Intensive': '01cbd35be202bb0c4873a4186a63bc19',
        'MathVerse_MINI_Text_Lite': '19e4b13bdd30b89a03b2e358bcfefa04',
        'MathVerse_MINI_Text_Dominant': '4f5cd2fa6630ea00bb11d6fde1f6fe6a',
    }

    # It returns a DataFrame
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        from .utils.mathverse import MathVerse_auxeval_extract, MathVerse_auxeval_score, MathVerse_acc

        model = judge_kwargs['model']
        suffix = eval_file.split('.')[-1]
        storage_extract = eval_file.replace(f'.{suffix}', f'_{model}_extract.xlsx')
        tmp_file_extract = eval_file.replace(f'.{suffix}', f'_{model}_extract.pkl')
        storage_score = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx')
        tmp_file_score = eval_file.replace(f'.{suffix}', f'_{model}_score.pkl')
        nproc = judge_kwargs.pop('nproc', 4)
        # stage1: extract the answer
        if not osp.exists(storage_extract):
            data = load(eval_file)
            model = build_judge(max_tokens=128, **judge_kwargs)
            assert model.working(), ('MathVerse evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
            lt = len(data)
            lines = [data.iloc[i] for i in range(lt)]
            tups = [(model, line) for line in lines]
            indices = [line['index'] for line in lines]

            ans = {}
            if osp.exists(tmp_file_extract):
                ans = load(tmp_file_extract)
            tups = [x for x, i in zip(tups, indices) if i not in ans]
            indices = [i for i in indices if i not in ans]

            if len(indices):
                new_results = track_progress_rich(
                    MathVerse_auxeval_extract,
                    tups,
                    nproc=nproc,
                    chunksize=nproc,
                    keys=indices,
                    save=tmp_file_extract,
                )
                ans = load(tmp_file_extract)
                for k, v in zip(indices, new_results):
                    assert k in ans
                    assert ans[k]['log_extract'] == v['log_extract'] and ans[k]['extract'] == v['extract']

            data['extract'] = [ans[idx]['extract'] for idx in data['index']]
            data['log_extract'] = [ans[idx]['log_extract'] for idx in data['index']]
            dump(data, storage_extract)

        # stage2: score the answer
        if not osp.exists(storage_score):
            data = load(storage_extract)
            model = build_judge(max_tokens=128, **judge_kwargs)
            assert model.working(), ('MathVerse evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
            lt = len(data)
            lines = [data.iloc[i] for i in range(lt)]
            tups = [(model, line) for line in lines]
            indices = [line['index'] for line in lines]

            ans = {}
            if osp.exists(tmp_file_score):
                ans = load(tmp_file_score)
            tups = [x for x, i in zip(tups, indices) if i not in ans]
            indices = [i for i in indices if i not in ans]

            if len(indices):
                new_results = track_progress_rich(
                    MathVerse_auxeval_score,
                    tups,
                    nproc=nproc,
                    chunksize=nproc,
                    keys=indices,
                    save=tmp_file_score,
                )
                ans = load(tmp_file_score)
                for k, v in zip(indices, new_results):
                    assert k in ans
                    assert ans[k]['log_score'] == v['log_score'] and ans[k]['score'] == v['score']

            data['score'] = [ans[idx]['score'] for idx in data['index']]
            data['log_score'] = [ans[idx]['log_score'] for idx in data['index']]
            dump(data, storage_score)

        score = MathVerse_acc(storage_score)
        score_pth = storage_score.replace('.xlsx', '.csv')
        dump(score, score_pth)
        return score
class MathVision(ImageBaseDataset):
    TYPE = 'VQA'
    DATASET_URL = {
        'MathVision': 'https://opencompass.openxlab.space/utils/VLMEval/MathVision.tsv',
        'MathVision_MINI': 'https://opencompass.openxlab.space/utils/VLMEval/MathVision_MINI.tsv'
    }
    DATASET_MD5 = {
        'MathVision': '93f6de14f7916e598aa1b7165589831e',
        'MathVision_MINI': '060fe4fa5d868987ce179307bd5f8a33'
    }

    # It returns a DataFrame
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        from .utils.mathv import MATH_V_auxeval, MATH_V_acc

        if 'model' in judge_kwargs:
            model = judge_kwargs['model']
        else:
            model = os.path.basename(os.environ.get('LOCAL_LLM'))
        suffix = eval_file.split('.')[-1]
        storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
        tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
        nproc = judge_kwargs.pop('nproc', 4)

        if not osp.exists(storage):
            data = load(eval_file)
            model = build_judge(max_tokens=128, **judge_kwargs)
            assert model.working(), ('MATH-Vision evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
            lt = len(data)
            lines = [data.iloc[i] for i in range(lt)]
            tups = [(model, line) for line in lines]
            indices = [line['index'] for line in lines]

            ans = {}
            if osp.exists(tmp_file):
                ans = load(tmp_file)
            tups = [x for x, i in zip(tups, indices) if i not in ans]
            indices = [i for i in indices if i not in ans]

            if len(indices):
                new_results = track_progress_rich(
                    MATH_V_auxeval,
                    tups,
                    nproc=nproc,
                    chunksize=nproc,
                    keys=indices,
                    save=tmp_file,
                )
                ans = load(tmp_file)
                for k, v in zip(indices, new_results):
                    assert k in ans
                    assert ans[k]['log'] == v['log'] and ans[k]['res'] == v['res']

            data['res'] = [ans[idx]['res'] for idx in data['index']]
            data['log'] = [ans[idx]['log'] for idx in data['index']]
            dump(data, storage)

        score = MATH_V_acc(storage)
        score_pth = storage.replace('.xlsx', '_score.csv')
        dump(score, score_pth)
        return score
class OlympiadBench(ImageBaseDataset):
    TYPE = 'VQA_ex_prompt'
    DATASET_URL = {
        'OlympiadBench': 'https://opencompass.openxlab.space/utils/VLMEval/OlympiadBench.tsv',
        'OlympiadBench_EN': 'https://opencompass.openxlab.space/utils/VLMEval/OlympiadBench_EN.tsv',
        'OlympiadBench_CN': 'https://opencompass.openxlab.space/utils/VLMEval/OlympiadBench_CN.tsv'
    }
    DATASET_MD5 = {
        'OlympiadBench': '9735ae0f0299eae1e7d07f5a7feab914',
        'OlympiadBench_EN': '5c68e100d394351fc7049f29d4d4efed',
        'OlympiadBench_CN': 'ea01b16788955702c79650c701e5b623'
    }

    def dump_image(self, line):
        os.makedirs(self.img_root, exist_ok=True)

        tgt_path_z = []
        if isinstance(line['image'], list):
            for i in range(len(line['image'])):
                tgt_path = osp.join(self.img_root, f"{line['index']}--{i + 1}.jpg")
                if not read_ok(tgt_path):
                    decode_base64_to_image_file(line['image'][i], tgt_path)
                tgt_path_z.append(tgt_path)
        else:
            tgt_path = osp.join(self.img_root, f"{line['index']}.jpg")
            if not read_ok(tgt_path):
                decode_base64_to_image_file(line['image'], tgt_path)
            tgt_path_z.append(tgt_path)
        return tgt_path_z

    def build_prompt(self, line):
        from .utils.olympiadbench import get_answer_type_text, make_input

        self.is_chinese = 'zh' in line['source']
        self.is_math = 'maths' in line['source']
        self.is_theorem_proving = 'TP' in line['source']

        if self.is_chinese:
            subject_content = '数学' if self.is_math else '物理'
            if self.is_theorem_proving:
                prompt = (
                    f"以下是中国{subject_content}竞赛中的证明题。请根据题目的要求,运用逻辑推理及常用定理证明题目中的命题。"
                    "证明过程中使用的变量和公式请使用LaTeX格式表示。"
                )
            else:
                answer_type_text = get_answer_type_text(line['answer_type'], is_chinese=True,
                                                        multiple_answer=line['is_multiple_answer'])
                if line['is_multiple_answer']:
                    multiple_answer_text = '\\boxed{用英文逗号连接的多个答案}'
                else:
                    multiple_answer_text = '\\boxed{答案}'
                unit_text = ''
                if line['unit']:
                    multiple_answer_text += '(单位)'
                    unit_text = ',注意答案的单位不要放在\\boxed{}中'
                prompt = (
                    f'以下是中国{subject_content}竞赛中的解答题{answer_type_text}。请根据题目的要求和所提供的信息计算得出答案。'
                    f'解答过程和结果中使用的变量和公式请使用LaTeX格式表示。请在最后以“所以最终答案是{multiple_answer_text}。”'
                    f'显式给出结果{unit_text}。'
                )
        else:
            subject_content = 'Math' if self.is_math else 'Physics'
            if self.is_theorem_proving:
                prompt = (
                    f'The following is a theorem proving problem from an International {subject_content} competition. '
                    'Please use logical reasoning and common theorems to prove the proposition in the problem '
                    'according to the given requirements. '
                    'Please use LaTeX format to represent the variables and formulas used in the proof.'
                )
            else:
                if line['is_multiple_answer']:
                    multiple_answer_text = '\\boxed{multiple answers connected with commas}'
                else:
                    multiple_answer_text = '\\boxed{answer}'
                unit_text = ''
                if line['unit']:
                    multiple_answer_text += '(unit)'
                    unit_text = ', note that the unit of the answer should not be included in \\boxed{}'
                answer_type_text = get_answer_type_text(line['answer_type'], is_chinese=False,
                                                        multiple_answer=line['is_multiple_answer'])
                prompt = (
                    f'The following is an open-ended problem from an International {subject_content} competition. '
                    f'{answer_type_text}Please calculate the answer according to the given requirements and '
                    'the information provided. Please use LaTeX format to represent the variables and formulas '
                    'used in the solution process and results. Please end your solution with "So the final answer '
                    f'is {multiple_answer_text}." and give the result explicitly{unit_text}.'
                )

        if self.is_math:
            input = make_input(prompt, line['question'])
        else:
            if 'context' in line.keys() and str(line['context']) != 'nan':  # cannot be null
                input = make_input(prompt, line['context'] + '\n' + line['question'])
            else:
                input = make_input(prompt, line['question'])

        ret = [dict(type='text', value=input)]
        tgt_path = self.dump_image(line)
        ret.extend([dict(type='image', value=s) for s in tgt_path])
        return ret

    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        from .utils.olympiadbench import MathJudger, extract_answer
        judger = MathJudger()

        suffix = eval_file.split('.')[-1]
        name_str1 = 'judge'
        name_str2 = 'score'
        result_file = eval_file.replace(f'.{suffix}', f'_{name_str1}_result.xlsx')
        score_file = eval_file.replace(f'.{suffix}', f'_{name_str2}_result.csv')

        if not osp.exists(result_file):
            data = load(eval_file)
            scorez = []

            for i in tqdm(data.iterrows()):
                line = i[1]
                model_answer = line['prediction']
                is_chinese = 'zh' in line['source']
                model_answer = extract_answer(is_chinese, model_answer, is_deepseek=False)
                answer_type = line['answer_type']

                final_answer = line['final_answer'][2:-2]

                if str(answer_type) != 'nan' and 'Tuple' in answer_type:
                    judge_result = judger.judge(model_answer, final_answer)
                else:
                    if str(line['error']) != 'nan':
                        if ',' in line['error']:
                            precisions = line['error'].split(',')
                            precisions = [float(p) if p else 1e-8 for p in precisions]
                            judge_result = judger.judge(model_answer, final_answer, precisions)
                        else:
                            precision = float(line['error'])
                            judge_result = judger.judge(model_answer, final_answer, precision)
                    else:
                        judge_result = judger.judge(model_answer, final_answer)
                scorez.append(judge_result)

            data['score'] = scorez
            dump(data, result_file)

        judge_file = load(result_file)

        if not osp.exists(score_file):
            name_list = ['OE_MM_maths_en_COMP', 'OE_MM_maths_zh_CEE', 'OE_MM_maths_zh_COMP', 'OE_MM_physics_en_COMP',
                         'OE_MM_physics_zh_CEE', 'OE_TO_maths_en_COMP', 'OE_TO_maths_zh_CEE', 'OE_TO_maths_zh_COMP',
                         'OE_TO_physics_en_COMP', 'OE_TO_physics_zh_CEE']

            sample_list = [[] for _ in range(len(name_list))]
            for i in judge_file.iterrows():
                line = i[1]
                for j in range(len(name_list)):
                    if line['source'] == name_list[j]:
                        sample_list[j].append(line['score'])

            acc_dict = {}
            correct_list = []

            # fine-grained
            for i in range(len(name_list)):
                correct_num = 0
                for j in sample_list[i]:
                    if j:
                        correct_num += 1
                correct_list.append(correct_num)
                acc = 100 * correct_num / len(sample_list[i])
                acc_dict[name_list[i]] = [acc]

            # 4 grained
            labela = ['zh', 'en']
            labelb = ['maths', 'physics']

            grain_list = [[x, y] for x in labela for y in labelb]
            for j in grain_list:
                dict_name = j[0] + "_" + j[1]
                correct_num = 0
                full_num = 0
                for i in range(len(name_list)):
                    if all(k in name_list[i] for k in j):
                        correct_num += correct_list[i]
                        full_num += len(sample_list[i])
                acc = 100 * correct_num / full_num
                acc_dict[dict_name] = [acc]

            # 2 grained
            grain_list = ['maths', 'physics']
            for j in grain_list:
                dict_name = j
                correct_num = 0
                full_num = 0
                for i in range(len(name_list)):
                    if j in name_list[i]:
                        correct_num += correct_list[i]
                        full_num += len(sample_list[i])
                acc = 100 * correct_num / full_num
                acc_dict[dict_name] = [acc]

            # AVG
            correct_num = sum(correct_list)
            acc = 100 * correct_num / len(judge_file)
            acc_dict['AVG'] = [acc]

            acc_pd = pd.DataFrame(acc_dict)
            acc_pd.to_csv(score_file, index=False, encoding='gbk')

        accdz = pd.read_csv(score_file)
        return accdz
class LLaVABench(ImageBaseDataset):
    TYPE = 'VQA'
    DATASET_URL = {'LLaVABench': 'https://opencompass.openxlab.space/utils/VLMEval/LLaVABench.tsv'}
    DATASET_MD5 = {'LLaVABench': 'd382a093f749a697820d3dadd61c8428'}

    # It returns a DataFrame
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        from .utils.llavabench import (
            build_prompt,
            LLaVABench_atomeval,
            LLaVABench_score,
        )

        suffix = '.' + eval_file.split('.')[-1]
        record_file = eval_file.replace(suffix, '_openai_result' + suffix)
        score_file = eval_file.replace(suffix, '_score.csv')
        nproc = judge_kwargs.pop('nproc', 4)
        system_prompt = 'You are a helpful and precise assistant for checking the quality of the answer.'

        if not osp.exists(record_file):
            data = load(eval_file)
            lines = [data.iloc[i] for i in range(len(data))]
            model = build_judge(temperature=0.2, system_prompt=system_prompt, **judge_kwargs)
            assert model.working(), ('LLaVABench evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)

            prompts = [build_prompt(line) for line in lines]
            tups = [(model, prompt) for prompt in prompts]
            scores = track_progress_rich(LLaVABench_atomeval, tups, nproc=nproc, chunksize=nproc)
            data['gpt4_score'] = [x[0] for x in scores]
            data['score'] = [x[1] for x in scores]
            dump(data, record_file)

        data = load(record_file)
        ret = LLaVABench_score(data).round(1)
        dump(ret, score_file)
        return ret
class MMVet(ImageBaseDataset):
    TYPE = 'VQA'
    DATASET_URL = {'MMVet': 'https://opencompass.openxlab.space/utils/VLMEval/MMVet.tsv'}
    DATASET_MD5 = {'MMVet': '748aa6d4aa9d4de798306a63718455e3'}

    # It returns a DataFrame
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        from .utils.mmvet import MMVet_auxeval, MMVet_acc

        suffix = eval_file.split('.')[-1]
        model = judge_kwargs['model']
        storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
        tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
        nproc = judge_kwargs.pop('nproc', 4)
        if not osp.exists(storage):
            data = load(eval_file)
            model = build_judge(max_tokens=3, **judge_kwargs)
            assert model.working(), ('MMVet evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)

            lt = len(data)
            lines = [data.iloc[i] for i in range(lt)]
            tups = [(model, line) for line in lines]
            indices = [line['index'] for line in lines]

            ans = load(tmp_file) if osp.exists(tmp_file) else {}
            tups = [x for x, i in zip(tups, indices) if i not in ans]
            indices = [i for i in indices if i not in ans]

            if len(indices):
                new_results = track_progress_rich(
                    MMVet_auxeval,
                    tups,
                    nproc=nproc,
                    chunksize=nproc,
                    keys=indices,
                    save=tmp_file,
                )
                ans = load(tmp_file)
                for k, v in zip(indices, new_results):
                    assert k in ans
                    assert ans[k]['log'] == v['log'] and ans[k]['score'] == v['score']
            data['score'] = [ans[idx]['score'] for idx in data['index']]
            data['log'] = [ans[idx]['log'] for idx in data['index']]
            dump(data, storage)

        score, score_fine = MMVet_acc(storage)
        score_pth = storage.replace('.xlsx', '_score.csv')
        score_fine_pth = storage.replace('.xlsx', '_score_fine.csv')
        dump(score, score_pth)
        dump(score_fine, score_fine_pth)
        return score
class MTVQADataset(ImageBaseDataset):
    TYPE = 'VQA'
    DATASET_URL = {'MTVQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/MTVQA_TEST.tsv'}
    DATASET_MD5 = {'MTVQA_TEST': 'd87c17dbab934b7cd89c0a3c1c5657f4'}

    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        data = load(eval_file)
        assert 'answer' in data and 'prediction' in data and 'category' in data
        data['prediction'] = [str(x) for x in data['prediction']]
        data['answer'] = [str(x) for x in data['answer']]
        if 'split' in data:
            assert np.all([x.lower() == 'test' for x in data['split']]), 'We only support MTVQA_TEST for now. '
        lt = len(data)
        category_scores = defaultdict(list)
        for i in range(lt):
            line = data.iloc[i]
            ans = line['answer'].strip().lower().replace('.', '')
            pred = line['prediction'].strip().lower().replace('.', '')
            cate = line['category']
            score = 1.0 if ans in pred else 0.0
            category_scores[cate].append(score)
            category_scores['Average'].append(score)
        # Calculate the average score for each category, the score is normalized to [0, 100]
        category_averages = {category: np.mean(scores) * 100 for category, scores in category_scores.items()}

        suffix = eval_file.split('.')[-1]
        result_file = eval_file.replace(f'.{suffix}', '_acc.json')
        dump(category_averages, result_file)

        return category_averages

    # MT-VQA adopts a custom prompt
    def build_prompt(self, line):
        msgs = super().build_prompt(line)
        assert sum([x['type'] == 'text' for x in msgs]) == 1
        for item in msgs:
            if item['type'] == 'text':
                item['value'] += '\nAnswer the question using a word or phrase in the language of the question.'
        return msgs
class TableVQABench(ImageBaseDataset):
    TYPE = 'VQA'
    DATASET_URL = {
        'TableVQABench': 'https://pai-aigc-photog.oss-cn-hangzhou.aliyuncs.com/mentor-vil/datasets/tablevqa-bench.tsv'
    }
    DATASET_MD5 = {'TableVQABench': '2550adc61bdc82d8e62f3b003de7c62d'}

    from .utils.tablevqabench import FINTABNETQA_PROMPT, VTABFACT_PROMPT, VWTQ_PROMPT

    # It returns a DataFrame
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        import pandas as pd
        from .utils.tablevqabench import evaluate_fintabnet, evaluate_tabfact, evaluate_wtq

        data = load(eval_file)
        assert 'answer' in data and 'prediction' in data

        data['prediction'] = data['prediction'].str.replace('^Answer: ', '', regex=True)
        data_group = dict(tuple(data.groupby('split')))
        eval_result = {'split': [], 'average_scores': []}
        for split in ['fintabnetqa', 'vtabfact', 'vwtq', 'vwtq_syn']:
            data_split = data_group[split].to_dict(orient='records')
            if split == 'fintabnetqa':
                split_eval_meta = evaluate_fintabnet(data_split, ['accuracy'])
            elif split == 'vtabfact':
                split_eval_meta = evaluate_tabfact(data_split, ['accuracy'])
            elif split == 'vwtq' or split == 'vwtq_syn':
                split_eval_meta = evaluate_wtq(data_split, ['accuracy'])
            eval_result['split'].append(split)
            eval_result['average_scores'].append(split_eval_meta['average_scores'])

        suffix = eval_file.split('.')[-1]
        result_file = eval_file.replace(f'.{suffix}', '_acc.csv')
        eval_result = pd.DataFrame(eval_result)
        dump(eval_result, result_file)

        return eval_result

    # TableVQABench adopts a custom prompt
    def build_prompt(self, line):
        msgs = super().build_prompt(line)
        assert sum([x['type'] == 'text' for x in msgs]) == 1
        for item in msgs:
            if item['type'] == 'text':
                if line['split'] == 'fintabnetqa':
                    item['value'] = self.FINTABNETQA_PROMPT.format_map({'question': item['value']})
                elif line['split'] == 'vtabfact':
                    item['value'] = self.VTABFACT_PROMPT.format_map({'question': item['value']})
                elif line['split'] == 'vwtq_syn' or line['split'] == 'vwtq':
                    item['value'] = self.VWTQ_PROMPT.format_map({'question': item['value']})
        return msgs
class CustomVQADataset(ImageBaseDataset):
    TYPE = 'VQA'

    def load_data(self, dataset):
        data_path = osp.join(LMUDataRoot(), f'{dataset}.tsv')

        if file_size(data_path, 'GB') > 1:
            local_path = data_path.replace('.tsv', '_local.tsv')
            if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL', None):
                from ..tools import LOCALIZE
                LOCALIZE(data_path, local_path)
            data_path = local_path
        return load(data_path)

    def evaluate(self, eval_file, **judge_kwargs):
        raise NotImplementedError
class CRPE(ImageBaseDataset):
    TYPE = 'VQA'
    DATASET_URL = {
        'CRPE_EXIST': 'https://huggingface.co/datasets/petter12321/crpe_vlmevalkit/resolve/main/CRPE_EXIST.tsv',
        'CRPE_RELATION': 'https://huggingface.co/datasets/petter12321/crpe_vlmevalkit/resolve/main/CRPE_RELATION.tsv'
    }
    DATASET_MD5 = {
        'CRPE_EXIST': '315584e23ac1ff7f8719ed3b7ad90f08',
        'CRPE_RELATION': 'bad7094cde0b572288f4b119c2d0c656'
    }

    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        from .utils.crpe import is_correct
        # find-image, count-text, find-text,
        # infer-choose, count-image, visual-reasoning
        score = {
            'exist': 0,
            'subject': 0,
            'predicate': 0,
            'object': 0,
            'total': 0,
        }
        num = {
            'exist': 0,
            'subject': 0,
            'predicate': 0,
            'object': 0,
            'total': 0,
        }
        final_score_dict = {
            'exist': 0,
            'subject': 0,
            'predicate': 0,
            'object': 0,
            'total': 0,
        }
        data = load(eval_file)
        lt = len(data)
        lines = [data.iloc[i] for i in range(lt)]
        for i in tqdm(range(len(lines))):
            line = lines[i]
            predict = str(line['prediction'])
            answers = str(line['answer'])
            # print("predict =", predict)
            # print("answers =", answers)
            category = line['category']
            if is_correct(answers, predict):
                score[category] += 1
                score['total'] += 1
            num[category] += 1
            num['total'] += 1

        for category in ['exist', 'subject', 'predicate', 'object', 'total']:
            if num[category] != 0:
                final_score_dict[category] = score[category] / num[category]
            else:
                final_score_dict[category] = None

        score_pth = eval_file.replace('.xlsx', '_score.json')
        dump(final_score_dict, score_pth)
        return final_score_dict

    def build_prompt(self, line):
        ROOT = LMUDataRoot()
        msgs = super().build_prompt(line)
        for msg in msgs:
            if msg['type'] == 'image':
                msg['value'] = osp.join(osp.join(ROOT, 'images', self.dataset_name), msg['value'])
        return msgs
class QSpatial(ImageBaseDataset):
    TYPE = 'VQA'
    DATASET_URL = {
        'QSpatial_plus': '',
        'QSpatial_scannet': ''
    }

    # NOTE: To evaluate Q-Spatial-ScanNet, you need to get the permission from ScanNet website
    # Once you get the permission, you can use the helper code here to download and extract necessary images:
    # https://github.com/andrewliao11/Q-Spatial-Bench-code?tab=readme-ov-file#for-qspatial_scannet
    qspatial_root = "TO_BE_REPLACED_WITH_THE_PATH_TO_QSPATIAL_DATASET"
    url = "https://raw.githubusercontent.com/andrewliao11/Q-Spatial-Bench-code/refs/heads/main/prompt_templates/"

    def post_build(self, dataset):
        # Download the prompt templates from github
        links = [
            self.url + "system_prompt.txt",
            self.url + "spatial_prompt_single.txt",
            self.url + "spatial_prompt_steps.txt",
            self.url + "standard_prompt.txt",
            self.url + "zero_shot_prompt.txt"
        ]
        with tempfile.TemporaryDirectory() as temp_dir:
            for link in links:
                tgt_path = os.path.join(temp_dir, link.split("/")[-1])
                os.system(f"wget {link} -O {tgt_path}")

            self.system_prompt = open(os.path.join(temp_dir, "system_prompt.txt")).read()
            self._prompt_templates = dict(
                spatial_prompt_single=open(os.path.join(temp_dir, "spatial_prompt_single.txt")).read(),
                spatial_prompt_steps=open(os.path.join(temp_dir, "spatial_prompt_steps.txt")).read(),
                standard_prompt=open(os.path.join(temp_dir, "standard_prompt.txt")).read(),
                zero_shot_prompt=open(os.path.join(temp_dir, "zero_shot_prompt.txt")).read(),
            )

    # Given one data record, return the built prompt (a multi-modal message), can override
    def build_prompt(self, line):
        from jinja2.sandbox import SandboxedEnvironment
        text_prompt_template = self._prompt_templates["spatial_prompt_single"]
        env = SandboxedEnvironment()
        text_prompt = env.from_string(text_prompt_template).render(question=line["question"])
        tgt_path = self.dump_image(line)

        msgs = []
        if isinstance(tgt_path, list):
            msgs.extend([dict(type='image', value=p) for p in tgt_path])
        else:
            msgs = [dict(type='image', value=tgt_path)]

        msgs.append(dict(type='text', value=f"{self.system_prompt}\n{text_prompt}"))
        return msgs

    # Given the dataset name, return the dataset as a pandas dataframe, can override
    def load_data(self, dataset):
        import io
        import pandas as pd
        from datasets import load_dataset

        hf_dataset = load_dataset("andrewliao11/Q-Spatial-Bench", split=dataset)
        df = hf_dataset.to_pandas()

        df.reset_index(drop=True, inplace=True)
        df['index'] = df.index
        df['answer'] = list(zip(df['answer_value'], df['answer_unit']))
        df = df[['index'] + [col for col in df.columns if col != 'index']]

        if dataset == "QSpatial_scannet":
            df = df.drop(columns=["image"])
            df["image"] = [Image.open(os.path.join(self.qspatial_root, image_path)) for image_path in df["image_path"]]
        else:
            df["image"] = [Image.open(io.BytesIO(image_dict["bytes"])) for image_dict in df["image"]]

        df["image"] = [encode_image_to_base64(image) for image in df["image"]]
        return df

    @classmethod
    def get_multiplier(self, unit):
        unit = unit.lower()
        if unit in ["meters", "meter", "m", "metre", "metres"]:
            multiplier = 100
        elif unit in ["centimeters", "centimeter", "cm"]:
            multiplier = 1
        elif unit in ["feet", "foot", "ft"]:
            multiplier = 30.48
        elif unit in ["inch", "inches", "in"]:
            multiplier = 2.54
        elif unit in ["mm"]:
            multiplier = 0.1
        else:
            print(f"Unknown unit: {unit}")
            multiplier = 0.

        return multiplier
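    # Worked example (added comment, not part of the original file): get_multiplier converts a
    # unit into "centimeters per unit", so an answer of (2, 'feet') maps to 2 * 30.48 = 60.96 cm
    # and (1.5, 'm') maps to 1.5 * 100 = 150 cm. Unknown units fall back to 0, which drives the
    # downstream delta scores to zero for that sample.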
    @classmethod
    def parse_string(self, input_str):
        # Regular expression to match the pattern (number or range, text)
        match = re.match(r'\(([\d.-]+), (.+)\)', input_str)
        if match:
            number_part = match.group(1)
            text = match.group(2)

            if '-' in number_part:
                start, end = map(float, number_part.split('-'))
                number = (start + end) / 2
            else:
                number = float(number_part)

            return number * self.get_multiplier(text)
        else:
            print(f"Unable to parse the input string {input_str}")
            return 0

    @classmethod
    def parse_prediction(self, vlm_response):
        # Value
        pattern = r'scalar{([^}]*)}'
        str_inside_scalar_boxes = re.findall(pattern, vlm_response)[-1]
        scalar_list = re.findall(r'\d+\.?\d*', str_inside_scalar_boxes)
        parsed_scalar = np.array(scalar_list).astype(float).mean()

        # Unit
        pattern = r'distance_unit{([^}]*)}'
        str_inside_unit_boxes = re.findall(pattern, vlm_response)
        parsed_unit = str_inside_unit_boxes[-1]

        pred_value_in_cms = parsed_scalar * self.get_multiplier(parsed_unit)
        return pred_value_in_cms
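    # Worked example (added comment, not part of the original file): for a hypothetical response
    #   "The desk is about scalar{1.2-1.4} distance_unit{meters} wide."
    # parse_prediction takes the last scalar{...} box, averages the numbers it finds
    # ((1.2 + 1.4) / 2 = 1.3), reads the last distance_unit{...} box ('meters', multiplier 100),
    # and returns 1.3 * 100 = 130.0 cm.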
    # It returns a dictionary
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        data = load(eval_file)
        if "model" in judge_kwargs:
            from .utils.qspatial import QSpatial_auxeval

            # extract using model
            model = judge_kwargs['model']
            suffix = eval_file.split('.')[-1]
            storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
            tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
            nproc = judge_kwargs.pop('nproc', 4)

            if not osp.exists(storage):
                model = build_judge(max_tokens=128, **judge_kwargs)
                assert model.working(), ('Evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
                lt = len(data)
                lines = [data.iloc[i] for i in range(lt)]
                tups = [(model, line) for line in lines]
                indices = [line['index'] for line in lines]

                ans = {}
                if osp.exists(tmp_file):
                    ans = load(tmp_file)
                tups = [x for x, i in zip(tups, indices) if i not in ans]
                indices = [i for i in indices if i not in ans]

                if len(indices):
                    new_results = track_progress_rich(
                        QSpatial_auxeval,
                        tups,
                        nproc=nproc,
                        chunksize=nproc,
                        keys=indices,
                        save=tmp_file,
                    )
                    ans = load(tmp_file)
                    for k, v in zip(indices, new_results):
                        assert k in ans
                        assert ans[k]['log'] == v['log'] and ans[k]['res'] == v['res']

                data['res'] = [ans[idx]['res'] for idx in data['index']]
                data['log'] = [ans[idx]['log'] for idx in data['index']]
                dump(data, storage)

            data = load(storage)

            pred_value_in_cms = []
            for res in data["res"]:
                try:
                    pred_value_in_cms.append(self.parse_string(res))
                except ValueError:
                    pred_value_in_cms.append(0.)

            pred_value_in_cms = np.array(pred_value_in_cms) + 1e-8
        else:
            # regex parsing
            pred_value_in_cms = []
            n_errors_in_parsing = 0
            for pred in data["prediction"]:
                try:
                    parsed_value = self.parse_prediction(pred)
                except IndexError:
                    n_errors_in_parsing += 1
                    parsed_value = 1e-8

                pred_value_in_cms.append(parsed_value)

            print(f"Encounter {n_errors_in_parsing} errors in parsing")
            pred_value_in_cms = np.array(pred_value_in_cms) + 1e-8

        # Ground truth
        ground_truth_value_in_cms = []
        for answer in data["answer"]:
            value, unit = eval(answer)
            ground_truth_value_in_cms.append(value * self.get_multiplier(unit))
        ground_truth_value_in_cms = np.array(ground_truth_value_in_cms) + 1e-8

        # Calculate the score
        pred_gt = pred_value_in_cms / ground_truth_value_in_cms
        gt_pred = ground_truth_value_in_cms / pred_value_in_cms
        delta_2 = np.stack([pred_gt, gt_pred]).max(0) < 2.
        delta_1_point_5 = np.stack([pred_gt, gt_pred]).max(0) < 1.5

        data["eval_score_delta_2"] = delta_2
        data["eval_score_delta_1_point_5"] = delta_1_point_5

        final_score_dict = {
            "delta_2": delta_2.mean(),
            "delta_1_point_5": delta_1_point_5.mean()
        }
        for question_type in set(data["question_type"]):
            filtered_data = data[data["question_type"] == question_type]
            delta_2_per_question_type = filtered_data["eval_score_delta_2"].mean()
            delta_1_point_5_per_question_type = filtered_data["eval_score_delta_1_point_5"].mean()
            final_score_dict.update({f"{question_type}_delta_2": delta_2_per_question_type})
            final_score_dict.update({f"{question_type}_delta_1_point_5": delta_1_point_5_per_question_type})

        score_pth = eval_file.replace('.xlsx', '_score.json')
        dump(final_score_dict, score_pth)
        return final_score_dict
class MMNIAH(ImageBaseDataset):
    TYPE = 'VQA'
    DATASET_URL = {
        'MM_NIAH_VAL': 'https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/MM_NIAH_VAL.tsv',
        'MM_NIAH_TEST': [
            'https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/part-aa',
            'https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/part-ab',
            'https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/part-ac',
            'https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/part-ad',
            'https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/part-ae']}
    DATASET_MD5 = {
        'MM_NIAH_VAL': '27e5a8c3cef7746cb38f89cd86c474c5',
        'MM_NIAH_TEST': 'f490eb2a43096307465fe9e7ef13497c'}

    def prepare_tsv(self, url, file_md5=None):
        import os
        data_root = LMUDataRoot()
        os.makedirs(data_root, exist_ok=True)
        update_flag = False
        file_name = 'MM_NIAH_VAL.tsv' if 'MM_NIAH_VAL' in url else 'MM_NIAH_TEST.tsv'
        data_path = osp.join(data_root, file_name)
        if osp.exists(data_path) and (file_md5 is None or md5(data_path) == file_md5):
            pass
        elif file_name == 'MM_NIAH_TEST.tsv':
            warnings.warn('The dataset tsv is not downloaded')
            for i in range(len(url)):
                if osp.exists(osp.join(data_root, 'part-a' + chr(ord('a') + i))):
                    print('part-a' + chr(ord('a') + i) + ' already exists')
                    continue
                download_file(url[i], data_path)
            file_prefix = 'part-'
            output_file = data_path
            split_files = sorted([f for f in os.listdir(data_root) if f.startswith(file_prefix)])
            with open(output_file, 'wb') as outfile:
                # Read each split file in turn and append its bytes to the output file
                for filename in split_files:
                    with open(osp.join(data_root, filename), 'rb') as infile:
                        outfile.write(infile.read())
            update_flag = True
        else:
            warnings.warn('The dataset tsv is not downloaded')
            download_file(url, data_path)
            update_flag = True

        if file_size(data_path, 'GB') > 1:
            local_path = data_path.replace('.tsv', '_local.tsv')
            if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL', None) or update_flag:
                from ..tools import LOCALIZE
                LOCALIZE(data_path, local_path)
            data_path = local_path
        return load(data_path)
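    # Note (added comment, not part of the original file): the MM_NIAH_TEST branch above
    # reassembles the large TSV from its "part-aa" ... "part-ae" chunks by byte-wise
    # concatenation, roughly equivalent to running `cat part-a? > MM_NIAH_TEST.tsv` inside
    # LMUDataRoot(); the MD5 in DATASET_MD5 is checked against the assembled file.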
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        from .utils.mmniah import is_correct
        # find-image, count-text, find-text,
        # infer-choose, count-image, visual-reasoning
        MMNIAH_score = {
            'count-text': 0,
            'find-image': 0,
            'find-text': 0,
            'infer-choose': 0,
            'count-image': 0,
            'visual-reasoning': 0,
            'total': 0,
        }
        MMNIAH_num = {
            'count-text': 0,
            'find-image': 0,
            'find-text': 0,
            'infer-choose': 0,
            'count-image': 0,
            'visual-reasoning': 0,
            'total': 0,
        }
        final_score_dict = {
            'count-text': 0,
            'find-image': 0,
            'find-text': 0,
            'infer-choose': 0,
            'count-image': 0,
            'visual-reasoning': 0,
            'total': 0,
        }
        data = load(eval_file)
        lt = len(data)
        lines = [data.iloc[i] for i in range(lt)]
        for i in tqdm(range(len(lines))):
            line = lines[i]
            predict = line['prediction']
            answers = line['answer']
            category = line['category']
            if category in ['visual-reasoning', 'find-image']:
                answers = int(answers)
            if is_correct(answers, predict):
                MMNIAH_score[category] += 1
                MMNIAH_score['total'] += 1
            MMNIAH_num[category] += 1
            MMNIAH_num['total'] += 1

        for category in ['find-image', 'count-text', 'find-text',
                         'infer-choose', 'count-image', 'visual-reasoning', 'total']:
            if MMNIAH_num[category] != 0:
                final_score_dict[category] = MMNIAH_score[category] / MMNIAH_num[category]
            else:
                final_score_dict[category] = None

        score_pth = eval_file.replace('.xlsx', '_score.json')
        dump(final_score_dict, score_pth)
        return final_score_dict
    def build_prompt(self, line):
        msgs = super().build_prompt(line)
        if isinstance(line, int):
            line = self.data.iloc[line]
        totalchoice = line['multi-choice options']
        totalchoice = eval(totalchoice)
        # find-image, count-text, find-text,
        # infer-choose, count-image, visual-reasoning
        context = msgs[-1]['value']
        context = eval(context)
        question = context[0] + '\n' + context[1]
        # tgt_path is the list of all image paths
        tgt_path = []
        for i in range(len(msgs) - 1):
            tgt_path.append(msgs[i]['value'])
        choices = totalchoice[0]
        choices_image = totalchoice[1]
        if choices:
            for c_idx, c in enumerate(choices):
                question = f"{question}\n{chr(c_idx + ord('A'))}. {c}"
            question += "\nAnswer with the option's letter from the given choices directly."
        elif choices_image:
            for c_idx in range(len(choices_image)):
                question = f"{question}\n{chr(c_idx + ord('A'))}. <image>"
            question += "\nAnswer with the option's letter from the given choices directly."
        else:
            question += '\nAnswer the question using a single word or phrase.'
        question = '<start>' + question + '<end>'
        question = question.split('<image>')
        if choices_image:
            for i in range(len(question) - 5):
                question[i] = question[i] + '\n<image>'
            for i in range(len(question) - 5, len(question) - 1):
                question[i] = question[i] + '<image>'
        else:
            for i in range(len(question) - 1):
                question[i] = question[i] + '\n<image>'
        assert len(tgt_path) + 1 == len(question)
        context = []
        for i in range(len(tgt_path)):
            context.append(question[i])
            context.append(tgt_path[i])
        context.append(question[-1])
        context[0] = context[0][7:]
        context[-1] = context[-1][:-5]
        msgs = []
        for i in range(len(context)):
            if i % 2 == 0:
                msgs.append(dict(type='text', value=context[i]))
            else:
                ROOT = LMUDataRoot()
                msgs.append(dict(
                    type='image',
                    value=osp.join(osp.join(ROOT, 'images', self.dataset_name), context[i])))
        # Drop empty text entries; filtering into a new list avoids removing
        # items from the list that is being iterated.
        msgs = [element for element in msgs if element['value'] != '']
        return msgs
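For orientation, the method above returns an interleaved list of text and image messages. The minimal sketch below only illustrates the output shape; the path and question text are invented for illustration and are not taken from the dataset.

# Hypothetical example of the structure returned by build_prompt (values are made up):
example_msgs = [
    dict(type='text', value='Passage text that precedes the first image ...'),
    dict(type='image', value='/root/LMUData/images/<dataset_name>/example_0.jpg'),
    dict(type='text', value="Question ...\nAnswer with the option's letter from the given choices directly."),
]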
VLMEvalKit/vlmeval/dataset/image_yorn.py
from ..smp import *
from ..utils import *
from .image_base import ImageBaseDataset
from .utils import build_judge, DEBUG_MESSAGE


class ImageYORNDataset(ImageBaseDataset):

    TYPE = 'Y/N'

    DATASET_URL = {
        'MME': 'https://opencompass.openxlab.space/utils/VLMEval/MME.tsv',
        'HallusionBench': 'https://opencompass.openxlab.space/utils/VLMEval/HallusionBench.tsv',
        'POPE': 'https://opencompass.openxlab.space/utils/VLMEval/POPE.tsv',
        'AMBER': 'https://huggingface.co/datasets/yifanzhang114/AMBER_base64/resolve/main/AMBER.tsv',
    }

    DATASET_MD5 = {
        'MME': 'b36b43c3f09801f5d368627fb92187c3',
        'HallusionBench': '0c23ac0dc9ef46832d7a24504f2a0c7c',
        'POPE': 'c12f5acb142f2ef1f85a26ba2fbe41d5',
        'AMBER': '970d94c0410916166e0a76ba75da7934',
    }

    # It returns a dataframe
    def evaluate(self, eval_file, **judge_kwargs):
        from .utils.yorn import YOrN_Extraction, YOrN_auxeval
        from .utils.yorn import default_rating, MME_rating, Hallusion_rating, POPE_rating, AMBER_rating

        dataset = self.dataset_name
        data = load(eval_file)
        data['prediction'] = [str(x) for x in data['prediction']]
        storage = eval_file.replace('.xlsx', '_auxmatch.xlsx')
        tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
        nproc = judge_kwargs.pop('nproc', 4)

        if not osp.exists(storage):
            ans_map = {k: YOrN_Extraction(v) for k, v in zip(data['index'], data['prediction'])}
            if osp.exists(tmp_file):
                tmp = load(tmp_file)
                for k in tmp:
                    if ans_map[k] == 'Unknown' and tmp[k] != 'Unknown':
                        ans_map[k] = tmp[k]

            data['extracted'] = [ans_map[x] for x in data['index']]
            unknown = data[data['extracted'] == 'Unknown']

            model = judge_kwargs.get('model', 'exact_matching')
            if model == 'exact_matching':
                model = None
            elif gpt_key_set():
                model = build_judge(**judge_kwargs)
                if not model.working():
                    warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
                    warnings.warn(DEBUG_MESSAGE)
                    model = None
            else:
                model = None
                warnings.warn('OPENAI_API_KEY is not working properly, will use exact matching for evaluation')

            if model is not None:
                lt = len(unknown)
                lines = [unknown.iloc[i] for i in range(lt)]
                tups = [(model, line) for line in lines]
                indices = list(unknown['index'])
                if len(tups):
                    res = track_progress_rich(
                        YOrN_auxeval, tups, nproc=nproc, chunksize=nproc, keys=indices, save=tmp_file)
                    for k, v in zip(indices, res):
                        ans_map[k] = v

            data['extracted'] = [ans_map[x] for x in data['index']]
            dump(data, storage)

        data = load(storage)
        if listinstr(['AMBER'], dataset):
            data['score'] = (data['answer'].str.lower() == data['extracted'].str.lower())
        else:
            data['score'] = (data['answer'] == data['extracted'])
        dump(data, storage)

        if dataset is not None and listinstr(['MME'], dataset):
            score = MME_rating(storage)
        elif dataset is not None and listinstr(['Hallusion'], dataset):
            score = Hallusion_rating(storage)
        elif dataset is not None and listinstr(['POPE'], dataset):
            score = POPE_rating(storage)
        elif dataset is not None and listinstr(['AMBER'], dataset):
            score = AMBER_rating(storage)
        else:
            score = default_rating(storage)

        score_tgt = eval_file.replace('.xlsx', '_score.csv')
        dump(score, score_tgt)
        return score
VLMEvalKit/vlmeval/dataset/longvideobench.py
from huggingface_hub import snapshot_download
from ..smp import *
from .video_base import VideoBaseDataset
from .utils import build_judge, DEBUG_MESSAGE
from glob import glob

FAIL_MSG = 'Failed to obtain answer via API.'


def timestamp_to_seconds(timestamp):
    # Split the timestamp into hours, minutes, and seconds
    h, m, s = timestamp.split(":")
    # Convert hours, minutes, and total seconds (including fractions) to float and compute total seconds
    total_seconds = int(h) * 3600 + int(m) * 60 + float(s)
    return total_seconds


def uniformly_subsample(lst, K):
    n = len(lst)
    if K >= n:
        return lst
    step = n / K
    return [lst[int(i * step)] for i in range(K)]
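As a quick illustration of the helper above, `uniformly_subsample` picks K roughly evenly spaced items from a list; the toy values below are chosen only for demonstration.

# Toy check: pick 4 of 10 frame indices at a uniform stride of 10 / 4 = 2.5.
frames = list(range(10))
print(uniformly_subsample(frames, 4))   # -> [0, 2, 5, 7]
print(uniformly_subsample(frames, 20))  # K >= len(lst): the list is returned unchanged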
def insert_subtitles_into_frames(
    frames,
    frame_timestamps,
    subtitles,
    starting_timestamp_for_subtitles,
    duration,
):
    interleaved_list = []
    cur_i = 0

    for subtitle in subtitles:
        if "timestamp" in subtitle:
            start, end = subtitle["timestamp"]

            if not isinstance(end, float):
                end = duration

            start -= starting_timestamp_for_subtitles
            end -= starting_timestamp_for_subtitles

            subtitle_timestamp = (start + end) / 2
            subtitle_text = subtitle["text"]
        else:
            start, end = subtitle["start"], subtitle["end"]
            start = timestamp_to_seconds(start)
            end = timestamp_to_seconds(end)
            start -= starting_timestamp_for_subtitles
            end -= starting_timestamp_for_subtitles

            subtitle_timestamp = (start + end) / 2
            subtitle_text = subtitle["line"]

        for i, (frame, frame_timestamp) in enumerate(
            zip(frames[cur_i:], frame_timestamps[cur_i:])
        ):
            if frame_timestamp <= subtitle_timestamp:
                # print("frame:", frame_timestamp)
                interleaved_list.append({"type": "image", "value": frame})
                cur_i += 1
            else:
                break

        if end - start < 1:
            end = subtitle_timestamp + 0.5
            start = subtitle_timestamp - 0.5

        covering_frames = False
        for frame, frame_timestamp in zip(frames, frame_timestamps):
            if frame_timestamp < end and frame_timestamp > start:
                covering_frames = True
                break

        if covering_frames:
            interleaved_list.append({"type": "text", "value": subtitle_text + "\n"})
        else:
            pass

    for i, (frame, frame_timestamp) in enumerate(
        zip(frames[cur_i:], frame_timestamps[cur_i:])
    ):
        interleaved_list.append({"type": "image", "value": frame})
    return interleaved_list


class LongVideoBench(VideoBaseDataset):

    MD5 = '82905eae3a5ae7383c5a8ee9655e1ab9'
    SYS = ''

    TYPE = 'Video-MCQ'

    def __init__(self, dataset='LongVideoBench', use_subtitle=False, nframe=0, fps=-1):
        super().__init__(dataset=dataset, nframe=nframe, fps=fps)
        self.use_subtitle = use_subtitle
        self.dataset_name = dataset

    @classmethod
    def supported_datasets(cls):
        return ['LongVideoBench']

    def prepare_dataset(self, dataset_name='LongVideoBench', repo_id='longvideobench/LongVideoBench'):

        def check_integrity(pth):
            data_file = osp.join(pth, f'{dataset_name}.tsv')

            if not osp.exists(data_file):
                return False

            if md5(data_file) != self.MD5:
                print("md5 mismatch", md5(data_file), self.MD5)
                return False
            data = load(data_file)

            for video_pth in data['video_path']:
                if not osp.exists(osp.join(pth, video_pth)):
                    print(video_pth, "is not found")
                    return False
            return True

        if modelscope_flag_set():
            repo_id = "AI-ModelScope/LongVideoBench"

        cache_path = get_cache_path(repo_id)
        if cache_path is not None and check_integrity(cache_path):
            dataset_path = cache_path
        else:
            def generate_tsv(pth):
                data_file = osp.join(pth, f'{dataset_name}.tsv')
                if osp.exists(data_file) and md5(data_file) == self.MD5:
                    return

                data_file = pd.read_json(osp.join(pth, 'lvb_val.json'))
                data_file = data_file.assign(index=range(len(data_file)))
                data_file['video'] = data_file['video_id']
                data_file['video_path'] = data_file['video_path'].apply(lambda x: f'./videos/{x}')

                data_file.to_csv(osp.join(pth, f'{dataset_name}.tsv'), sep='\t', index=False)

            if modelscope_flag_set():
                from modelscope import dataset_snapshot_download
                dataset_snapshot_download(dataset_id=repo_id)
            else:
                snapshot_download(repo_id=repo_id, repo_type='dataset')
            print("All videos are downloaded for LongVideoBench")

            if not glob(osp.join(cache_path, "videos")):
                tar_files = glob(osp.join(cache_path, "**/*.tar*"), recursive=True)

                def untar_video_data(tar_file, cache_dir):
                    import tarfile
                    with tarfile.open(tar_file, "r") as tar_ref:
                        tar_ref.extractall(cache_dir)
                        print(f"Extracted all files from {tar_file} to {cache_dir}")

                def concat_tar_parts(tar_parts, output_tar):
                    with open(output_tar, "wb") as out_tar:
                        from tqdm import tqdm
                        for part in tqdm(sorted(tar_parts)):
                            with open(part, "rb") as part_file:
                                out_tar.write(part_file.read())
                    print(f"Concatenated parts {tar_parts} into {output_tar}")

                tar_parts_dict = {}

                # Group tar parts together
                for tar_file in tar_files:
                    base_name = tar_file.split(".tar")[0]
                    if base_name not in tar_parts_dict:
                        tar_parts_dict[base_name] = []
                    tar_parts_dict[base_name].append(tar_file)

                # Concatenate and untar split parts
                for base_name, parts in tar_parts_dict.items():
                    print(f"Extracting following tar files: {parts}")
                    output_tar = base_name + ".tar"
                    if not osp.exists(output_tar):
                        print('Start concatenating tar files')

                        concat_tar_parts(parts, output_tar)
                        print('Finish concatenating tar files')

                    if not osp.exists(osp.join(cache_path, osp.basename(base_name))):
                        untar_video_data(output_tar, cache_path)

                print('All videos are extracted for LongVideoBench')

            dataset_path = cache_path
            generate_tsv(dataset_path)

        data_file = osp.join(dataset_path, f'{dataset_name}.tsv')
        return dict(data_file=data_file, root=dataset_path)

    def save_video_frames(self, video_path, video_llm=False):
        vid_path = osp.join(self.data_root, video_path)
        vid = decord.VideoReader(vid_path)
        video_info = {
            'fps': vid.get_avg_fps(),
            'n_frames': len(vid),
        }
        if self.nframe > 0 and self.fps < 0:
            step_size = len(vid) / (self.nframe + 1)
            indices = [int(i * step_size) for i in range(1, self.nframe + 1)]

            frame_paths = self.frame_paths(video_path[:-4])
        elif self.fps > 0:
            # not constrained by num_frames, get frames by fps
            total_duration = video_info['n_frames'] / video_info['fps']
            required_frames = int(total_duration * self.fps)
            step_size = video_info['fps'] / self.fps
            indices = [int(i * step_size) for i in range(required_frames)]
            frame_paths = self.frame_paths_fps(video_path[:-4], len(indices))

        flag = np.all([osp.exists(p) for p in frame_paths])

        if not flag:
            images = [vid[i].asnumpy() for i in indices]
            images = [Image.fromarray(arr) for arr in images]
            for im, pth in zip(images, frame_paths):
                if not osp.exists(pth) and not video_llm:
                    im.save(pth)

        return frame_paths, indices, video_info

    # def save_video_into_images(self, line, num_frames=8):
    #     frame_paths, indices, video_info = self.save_video_frames(line['video_path'], num_frames)
    #     return frame_paths

    def build_prompt(self, line, video_llm):
        if isinstance(line, int):
            assert line < len(self)
            line = self.data.iloc[line]

        frames, indices, video_info = self.save_video_frames(line['video_path'], video_llm)
        fps = video_info["fps"]

        message = [dict(type='text', value=self.SYS)]
        if video_llm:
            message.append(dict(type='video', value=osp.join(self.data_root, line['video_path'])))
        else:
            if not self.use_subtitle:
                with open(osp.join(self.data_root, "subtitles", line["subtitle_path"])) as f:
                    subtitles = json.load(f)

                frame_message = insert_subtitles_into_frames(
                    frames,
                    [ind_ / fps for ind_ in indices],
                    subtitles,
                    line["starting_timestamp_for_subtitles"],
                    line["duration"]
                )
                message += frame_message
            else:
                for im in frames:
                    message.append(dict(type='image', value=im))

        line['question'] += '\n' + '\n'.join(
            ["{}. {}".format(chr(ord("A") + i), cand) for i, cand in enumerate(eval(line['candidates']))]
        )
        prompt = line["question"] + "\nAnswer with the option's letter from the given choices directly."
        message.append(dict(type='text', value=prompt))
        return message

    # It returns a dictionary
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        from .utils.longvideobench import get_dimension_rating, extract_characters_regex, extract_option

        assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'

        tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
        tgt_file = eval_file.replace('.xlsx', '_rating.json')
        score_file = eval_file.replace('.xlsx', '_score.xlsx')

        if not osp.exists(score_file):
            model = judge_kwargs.get('model', 'exact_matching')
            assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']

            if model == 'exact_matching':
                model = None
            elif gpt_key_set():
                model = build_judge(**judge_kwargs)
                if not model.working():
                    warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
                    warnings.warn(DEBUG_MESSAGE)
                    model = None
            else:
                warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
                model = None
            res = {} if not osp.exists(tmp_file) else load(tmp_file)
            res = {k: v for k, v in res.items() if FAIL_MSG not in v}

            data = load(eval_file)
            data_un = data[~pd.isna(data['prediction'])]

            for idx in data['index']:
                ans = data.loc[data['index'] == idx, 'correct_choice'].values[0]
                ans = chr(ord("A") + ans)
                pred = str(data.loc[data['index'] == idx, 'prediction'].values[0])

                if extract_characters_regex(pred) == '':
                    extract_pred = extract_option(
                        model,
                        data.loc[data['index'] == idx].to_dict(orient='records')[0],
                        'LongVideoBench'
                    )
                    data.loc[idx, 'score'] = int(extract_pred == ans)
                else:
                    data.loc[idx, 'score'] = int(extract_characters_regex(pred) == ans)

            rejected = [x for x in data['score'] if x == -1]

            print(
                f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(data_un)} questions, '
                f'failed to obtain the score for another {len(rejected)} questions. '
                f'Those questions will be counted as -1 score in ALL rating, and will not be counted in VALID rating.'
            )

            dump(data, score_file)

        rating = get_dimension_rating(score_file)
        dump(rating, tgt_file)
        return rating
VLMEvalKit/vlmeval/dataset/miabench.py
import json
import os

import pandas as pd

from .image_base import ImageBaseDataset
from ..smp import *
from .utils import build_judge, DEBUG_MESSAGE
from ..utils import track_progress_rich


def generate_prompt(d):
    question = d['question']
    weights = eval(d['component_weight'])
    components = eval(d['components'])
    num_of_component = int(d['num_of_component'])
    response = d['prediction']

    if num_of_component == 1:
        components = f"The first component is: '{components[0]}'. "
        score = f"The first component is worth: {weights[0]} scores. "
    elif num_of_component == 2:
        components = f"The first component is: '{components[0]}', and the second component is '{components[1]}'. "
        score = f"The first and second component is each worth {weights[0]} and {weights[1]} scores. "
    elif num_of_component == 3:
        components = (
            f"The first component is: '{components[0]}', and the second component is '{components[1]}', "
            f"and the third component is '{components[2]}'. "
        )
        score = (
            "The first, second, and third component is each worth "
            f"{weights[0]}, {weights[1]}, and {weights[2]} scores."
        )
    elif num_of_component == 4:
        components = (
            f"The first component is: '{components[0]}', and the second component is '{components[1]}', "
            f"and the third component is '{components[2]}', and the fourth component is '{components[3]}'. "
        )
        score = (
            "The first, second, third, and fourth component is each worth "
            f"{weights[0]}, {weights[1]}, {weights[2]}, and {weights[3]} scores."
        )
    elif num_of_component == 5:
        components = (
            f"The first component is: '{components[0]}', and the second component is '{components[1]}', "
            f"and the third component is '{components[2]}', and the fourth component is '{components[3]}', "
            f"and the fifth component is '{components[4]}'. "
        )
        score = (
            "The first, second, third, fourth, and fifth component is each worth "
            f"{weights[0]}, {weights[1]}, {weights[2]}, {weights[3]}, and {weights[4]} scores."
        )

    return (
        "Here is an instruction for a multimodal LLM: '"
        f"{question}"
        "'. You need to grade if the response from the model follows each component of the instruction. "
        f"{components}"
        "The response is: '"
        f"{response}"
        "'. You need to score the response and be strict. The total score ranges from 0 to 10, "
        "depending on if the response follows the instruction. "
        f"{score}"
        "List scores of each component, and the total score in one sentence in this format: "
        "score of component 1: x/2, score of component 2: y/8, total score: z/10. Then explain your reasons."
    )


def process_rawscore(component_type, raw_score):
    first_sentence = raw_score.split('.')[0].split(',')
    score_dict = {}
    for i in range(len(first_sentence) - 1):
        score_ = first_sentence[i].split(':')[1][1:].split('/')
        score = int(score_[0]) / int(score_[1])
        score_dict[component_type[i]] = score
    total_score_ = first_sentence[i + 1].split(':')[1][1:].split('/')
    total_score = int(total_score_[0]) / int(total_score_[1])
    score_dict['total_score'] = total_score
    return score_dict


def get_score_dict(data, score_raw):
    cat_score_dict = {}
    for i in range(len(data)):
        try:
            cmp = data['component_type'][i][2:-2]
            cmp_list = cmp.split('\', \'')
            score_dict = process_rawscore(cmp_list, score_raw[i])
            for key, val in score_dict.items():
                if key not in cat_score_dict.keys():
                    cat_score_dict[key] = [val]
                else:
                    cat_score_dict[key].append(val)
        except:
            pass
    cat_score_dict_average = {}
    for key, val in cat_score_dict.items():
        cat_score_dict_average[key] = sum(val) / len(val)
    return cat_score_dict_average
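A small sketch of the format `process_rawscore` parses: the judge is asked (in `generate_prompt`) to reply as `score of component 1: x/2, ..., total score: z/10`, and the fractions are read off the first sentence. The component names and numbers below are hypothetical, chosen only to illustrate the parsing.

# Hypothetical judge reply in the requested format (component names are made up):
raw = 'score of component 1: 2/2, score of component 2: 6/8, total score: 8/10. The response follows both components.'
print(process_rawscore(['grounding', 'formatting'], raw))
# -> {'grounding': 1.0, 'formatting': 0.75, 'total_score': 0.8}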
class MIABench(ImageBaseDataset):
    TYPE = 'VQA'

    DATASET_URL = {
        'MIA-Bench': 'https://opencompass.openxlab.space/utils/VLMEval/Mia-Bench.tsv',
    }
    DATASET_MD5 = {
        'MIA-Bench': '0b9de595f4dd40af18a69b94d89aba82',
    }

    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        judge_name = judge_kwargs.pop('model', 'gpt-4o')

        model = build_judge(model=judge_name, **judge_kwargs)
        suffix = eval_file.split('.')[-1]

        storage = eval_file.replace(f'.{suffix}', f'_{judge_name}.xlsx')  # noqa: F841
        tmp_file = eval_file.replace(f'.{suffix}', f'_{judge_name}.pkl')  # noqa: F841
        nproc = judge_kwargs.pop('nproc', 4)  # noqa: F841

        if not osp.exists(storage):
            data = load(eval_file)
            num_samples = len(data)
            lines = [data.loc[i] for i in range(num_samples)]
            prompts = [generate_prompt(line) for line in lines]
            org_data = MIABench('MIA-Bench').data
            img_map = {x: y for x, y in zip(org_data['index'], org_data['image'])}
            image_b64 = [img_map[idx] for idx in data['index']]
            indices = list(data['index'])
            mm_messages = [
                dict(message=[
                    dict(type='text', value=prompt),
                    dict(type='image', value=f'data:image/jpeg;base64,{b64}')
                ])
                for prompt, b64 in zip(prompts, image_b64)
            ]

            res = {}
            if osp.exists(tmp_file):
                res = load(tmp_file)

            jobs = {k: v for k, v in zip(indices, mm_messages) if k not in res}
            job_keys = list(jobs.keys())
            job_vals = [jobs[k] for k in job_keys]

            resps = track_progress_rich(
                model.generate,
                job_vals,
                nproc=nproc,
                chunksize=nproc,
                keys=job_keys,
                save=tmp_file,
            )
            for k, resp in zip(job_keys, resps):
                res[k] = resp
            data['score_raw'] = [res[idx] for idx in indices]
            dump(data, storage)

        goresult = load(storage)
        results = get_score_dict(goresult, goresult['score_raw'])
        result_pth = storage.replace('.xlsx', '_score.csv')
        results_pd = pd.DataFrame.from_dict(list(results.items()))
        dump(results_pd, result_pth)

        return results
VLMEvalKit/vlmeval/dataset/mlvu.py
import huggingface_hub
from huggingface_hub import snapshot_download
from ..smp import *
from .video_concat_dataset import ConcatVideoDataset
from .video_base import VideoBaseDataset
from .utils import build_judge, DEBUG_MESSAGE
from ..utils import track_progress_rich
import torchvision.transforms as T
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode
from decord import VideoReader, cpu
import pandas as pd
import imageio
import cv2
import zipfile
import os
import glob
from .utils.mlvu import *

FAIL_MSG = 'Failed to obtain answer via API.'


class MLVU(ConcatVideoDataset):
    def __init__(self, dataset='MLVU', nframe=0, fps=-1):
        self.DATASET_SETS[dataset] = ['MLVU_MCQ', 'MLVU_OpenEnded']
        self.type_data_dict = {
            'M-Avg': ['plotQA', 'needle', 'ego', 'count', 'anomaly_reco', 'topic_reasoning'],
            'G-Avg': ['sub_scene', 'summary']
        }
        super().__init__(dataset=dataset, nframe=nframe, fps=fps)

    @classmethod
    def supported_datasets(cls):
        return ['MLVU']

    def evaluate(self, eval_file, **judge_kwargs):
        result = super().evaluate(eval_file=eval_file, **judge_kwargs)
        suffix = eval_file.split('.')[-1]
        score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
        for key in self.type_data_dict:
            result.loc[key] = 0.0
            for name, item in result.iterrows():
                if name in self.type_data_dict[key]:
                    result.loc[key, 'success'] += item['success']
                    result.loc[key, 'overall'] += item['overall']
            if key == 'G-Avg':
                result.loc[key, 'acc'] = round(
                    result.loc[key, 'success'] / result.loc[key, 'overall'], 2)
            else:
                result.loc[key, 'acc'] = round(
                    result.loc[key, 'success'] / result.loc[key, 'overall'] * 100, 1)
        result = result.reset_index().rename(columns={'index': 'task'})
        dump(result, score_file)
        return result


class MLVU_MCQ(VideoBaseDataset):

    MD5 = 'bb5c37e7cf8d43fc9a25c23d2b4633f5'
    BASE_SYS = 'Carefully watch this video and pay attention to every detail. '
    SYS = BASE_SYS + 'Based on your observations, select the best option that accurately addresses the question.'
    TYPE = 'Video-MCQ'

    def __init__(self, dataset='MLVU_MCQ', nframe=0, fps=-1):
        self.type_data_list = {
            'plotQA': ('1_plotQA.json', './MLVU/video/1_plotQA', 'MCQ'),
            'needle': ('2_needle.json', './MLVU/video/2_needle', 'MCQ'),
            'ego': ('3_ego.json', './MLVU/video/3_ego', 'MCQ'),
            'count': ('4_count.json', './MLVU/video/4_count', 'MCQ'),
            'order': ('5_order.json', './MLVU/video/5_order', 'MCQ'),
            'anomaly_reco': ('6_anomaly_reco.json', './MLVU/video/6_anomaly_reco', 'MCQ'),
            'topic_reasoning': ('7_topic_reasoning.json', './MLVU/video/7_topic_reasoning', 'MCQ'),
        }
        super().__init__(dataset=dataset, nframe=nframe, fps=fps)

    @classmethod
    def supported_datasets(cls):
        return ['MLVU_MCQ']

    def prepare_dataset(self, dataset_name='MLVU_MCQ', repo_id='MLVU/MVLU'):
        def check_integrity(pth):
            data_file = osp.join(pth, f'{dataset_name}.tsv')

            if not os.path.exists(data_file):
                return False

            if md5(data_file) != self.MD5:
                return False

            data = load(data_file)
            for idx, item in data.iterrows():
                if not osp.exists(osp.join(pth, item['prefix'], item['video'])):
                    return False
            return True

        if modelscope_flag_set():
            repo_id = "AI-ModelScope/MLVU"

        cache_path = get_cache_path(repo_id)
        if cache_path is not None and check_integrity(cache_path):
            dataset_path = cache_path
        else:
            def generate_tsv(pth):
                data_file = osp.join(pth, f'{dataset_name}.tsv')
                if os.path.exists(data_file) and md5(data_file) == self.MD5:
                    return
                json_data_dir = os.path.join(dataset_path, 'MLVU', 'json')
                self.data_list = []
                for k, v in self.type_data_list.items():
                    with open(os.path.join(json_data_dir, v[0]), 'r') as f:
                        json_data = json.load(f)
                        for data in json_data:
                            self.data_list.append({
                                'task_type': k,
                                'prefix': v[1],
                                'duration': data['duration'],
                                'video': data['video'],
                                'question': data['question'],
                                'answer': data['answer'],
                                'candidates': data['candidates'],
                            })
                data_df = pd.DataFrame(self.data_list)
                data_df = data_df.assign(index=range(len(data_df)))
                data_df.to_csv(data_file, sep='\t', index=False)

            if modelscope_flag_set():
                from modelscope import dataset_snapshot_download
                dataset_path = dataset_snapshot_download(dataset_id=repo_id)
            else:
                hf_token = os.environ.get('HUGGINGFACE_TOKEN')
                huggingface_hub.login(hf_token)
                dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset')
            generate_tsv(dataset_path)

        data_file = osp.join(dataset_path, f'{dataset_name}.tsv')
        return dict(root=dataset_path, data_file=data_file)

    def qa_template(self, data):
        question = f"Question: {data['question']}\n"
        question += 'Options:\n'
        answer = data['answer']
        answer_idx = -1
        for idx, c in enumerate(eval(data['candidates'])):
            question += f"({chr(ord('A') + idx)}) {c}\n"
            if c == answer:
                answer_idx = idx
        question = question.rstrip()
        answer = f"({chr(ord('A') + answer_idx)}) {answer}"
        return question, answer

    def save_video_frames(self, line):
        suffix = line['video'].split('.')[-1]
        video = line['video'].replace(f'.{suffix}', '')
        vid_path = osp.join(self.data_root, line['prefix'], line['video'])
        vid = decord.VideoReader(vid_path)
        video_info = {
            'fps': vid.get_avg_fps(),
            'n_frames': len(vid),
        }
        if self.nframe > 0 and self.fps < 0:
            step_size = len(vid) / (self.nframe + 1)
            indices = [int(i * step_size) for i in range(1, self.nframe + 1)]
            frame_paths = self.frame_paths(video)
        elif self.fps > 0:
            # not constrained by num_frames, get frames by fps
            total_duration = video_info['n_frames'] / video_info['fps']
            required_frames = int(total_duration * self.fps)
            step_size = video_info['fps'] / self.fps
            indices = [int(i * step_size) for i in range(required_frames)]
            frame_paths = self.frame_paths_fps(video, len(indices))

        flag = np.all([osp.exists(p) for p in frame_paths])

        if not flag:
            images = [vid[i].asnumpy() for i in indices]
            images = [Image.fromarray(arr) for arr in images]
            for im, pth in zip(images, frame_paths):
                if not osp.exists(pth):
                    im.save(pth)

        return frame_paths

    def save_video_into_images(self, line):
        frame_paths = self.save_video_frames(line)
        return frame_paths

    def build_prompt(self, line, video_llm):
        if isinstance(line, int):
            assert line < len(self)
            line = self.data.iloc[line]

        question, answer = self.qa_template(line)
        message = [dict(type='text', value=self.SYS, role='system')]
        message.append(dict(type='text', value=question))
        video_path = os.path.join(self.data_root, line['prefix'], line['video'])
        if video_llm:
            message.append(dict(type='video', value=video_path))
        else:
            img_frame_paths = self.save_video_into_images(line)
            for im in img_frame_paths:
                message.append(dict(type='image', value=im))
        message.append(dict(type='text', value='\nOnly give the best option.'))
        return message

    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):

        assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'

        tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
        score_file = eval_file.replace('.xlsx', '_score.xlsx')

        if not osp.exists(score_file):
            model = judge_kwargs.setdefault('model', 'chatgpt-0125')
            assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']

            if model == 'exact_matching':
                model = None
            elif gpt_key_set():
                model = build_judge(**judge_kwargs)
                if not model.working():
                    warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
                    warnings.warn(DEBUG_MESSAGE)
                    model = None
            else:
                warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
                model = None
            res = {} if not osp.exists(tmp_file) else load(tmp_file)
            res = {k: v for k, v in res.items() if FAIL_MSG not in v}

            data = load(eval_file)
            data_un = data[~pd.isna(data['prediction'])]

            for idx in data['index']:
                ans = data.loc[data['index'] == idx, 'answer'].values[0]
                pred = data.loc[data['index'] == idx, 'prediction'].values[0]
                options = eval(data.loc[data['index'] == idx, 'candidates'].values[0])
                answer_idx = -1
                for id, c in enumerate(options):
                    if c == ans:
                        answer_idx = id
                ans = f"({chr(ord('A') + answer_idx)}) {ans}"
                input_item = data.loc[data['index'] == idx].to_dict(orient='records')[0]
                for id, option_content in enumerate(eval(input_item['candidates'])):
                    input_item[chr(ord('A') + id)] = option_content
                    if option_content == input_item['answer']:
                        input_item['answer'] = chr(ord('A') + id)

                if FAIL_MSG in pred:
                    data.loc[idx, 'score'] = -1
                else:
                    data.loc[idx, 'score'] = int(check_ans_with_model(
                        pred, ans, model,
                        input_item,
                        'MLVU_MCQ'
                    ))

            rejected = [x for x in data['score'] if x == -1]

            print(
                f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(data_un)} questions, '
                f'failed to obtain the score for another {len(rejected)} questions. '
                f'Those questions will be counted as -1 score in ALL rating, and will not be counted in VALID rating.'
            )

            dump(data, score_file)

        rating = get_dimension_rating(score_file)
        return rating


class MLVU_OpenEnded(VideoBaseDataset):

    MD5 = 'cee573a3627c6ac434ded704c60511ba'
    BASE_SYS = 'Carefully watch this video and pay attention to every detail. '
    SYS = BASE_SYS + 'Based on your observations, answer the given questions.'
    TYPE = 'Video-VQA'

    def __init__(self, dataset='MLVU_OpenEnded', nframe=0, fps=-1):
        self.type_data_list = {
            'sub_scene': ('8_sub_scene.json', './MLVU/video/8_sub_scene', 'VQA'),
            'summary': ('9_summary.json', './MLVU/video/9_summary', 'VQA')
        }
        super().__init__(dataset=dataset, nframe=nframe, fps=fps)

    @classmethod
    def supported_datasets(cls):
        return ['MLVU_OpenEnded']

    def prepare_dataset(self, dataset_name='MLVU_OpenEnded', repo_id='MLVU/MVLU'):
        def check_integrity(pth):
            data_file = osp.join(pth, f'{dataset_name}.tsv')

            if not os.path.exists(data_file):
                return False

            if md5(data_file) != self.MD5:
                return False

            data = load(data_file)
            for idx, item in data.iterrows():
                if not osp.exists(osp.join(pth, item['prefix'], item['video'])):
                    return False
            return True

        if modelscope_flag_set():
            repo_id = "AI-ModelScope/MLVU"

        cache_path = get_cache_path(repo_id)
        if cache_path is not None and check_integrity(cache_path):
            dataset_path = cache_path
        else:
            def generate_tsv(pth):
                data_file = osp.join(pth, f'{dataset_name}.tsv')
                if os.path.exists(data_file) and md5(data_file) == self.MD5:
                    return
                json_data_dir = os.path.join(dataset_path, 'MLVU', 'json')
                self.data_list = []
                for k, v in self.type_data_list.items():
                    with open(os.path.join(json_data_dir, v[0]), 'r') as f:
                        json_data = json.load(f)
                        for data in json_data:
                            self.data_list.append({
                                'task_type': k,
                                'prefix': v[1],
                                'duration': data['duration'],
                                'video': data['video'],
                                'question': data['question'],
                                'answer': data['answer'],
                                'scoring_points': data['scoring_points'] if 'scoring_points' in data else ''
                            })
                data_df = pd.DataFrame(self.data_list)
                data_df = data_df.assign(index=range(len(data_df)))
                data_df.to_csv(data_file, sep='\t', index=False)

            if modelscope_flag_set():
                from modelscope import dataset_snapshot_download
                dataset_path = dataset_snapshot_download(dataset_id=repo_id)
            else:
                hf_token = os.environ.get('HUGGINGFACE_TOKEN')
                huggingface_hub.login(hf_token)
                dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset')
            generate_tsv(dataset_path)

        data_file = osp.join(dataset_path, f'{dataset_name}.tsv')
        return dict(root=dataset_path, data_file=data_file)

    def qa_template(self, data):
        question = f"{data['question']}"
        answer = data['answer']
        return question, answer

    def save_video_frames(self, line):
        suffix = line['video'].split('.')[-1]
        video = line['video'].replace(f'.{suffix}', '')
        vid_path = osp.join(self.data_root, line['prefix'], line['video'])
        vid = decord.VideoReader(vid_path)
        video_info = {
            'fps': vid.get_avg_fps(),
            'n_frames': len(vid),
        }
        if self.nframe > 0 and self.fps < 0:
            step_size = len(vid) / (self.nframe + 1)
            indices = [int(i * step_size) for i in range(1, self.nframe + 1)]
            frame_paths = self.frame_paths(video)
        elif self.fps > 0:
            # not constrained by num_frames, get frames by fps
            total_duration = video_info['n_frames'] / video_info['fps']
            required_frames = int(total_duration * self.fps)
            step_size = video_info['fps'] / self.fps
            indices = [int(i * step_size) for i in range(required_frames)]
            frame_paths = self.frame_paths_fps(video, len(indices))

        flag = np.all([osp.exists(p) for p in frame_paths])

        if not flag:
            images = [vid[i].asnumpy() for i in indices]
            images = [Image.fromarray(arr) for arr in images]
            for im, pth in zip(images, frame_paths):
                if not osp.exists(pth):
                    im.save(pth)

        return frame_paths

    def save_video_into_images(self, line):
        frame_paths = self.save_video_frames(line)
        return frame_paths

    def build_prompt(self, line, video_llm):
        if isinstance(line, int):
            assert line < len(self)
            line = self.data.iloc[line]

        question, answer = self.qa_template(line)
        message = [dict(type='text', value=self.SYS, role='system')]
        message.append(dict(type='text', value=question))
        video_path = os.path.join(self.data_root, line['prefix'], line['video'])
        if video_llm:
            message.append(dict(type='video', value=video_path))
        else:
            img_frame_paths = self.save_video_into_images(line)
            for im in img_frame_paths:
                message.append(dict(type='image', value=im))
        return message

    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        model = judge_kwargs['model'] if 'model' in judge_kwargs else judge_kwargs.setdefault('model', 'gpt-4-0125')
        if model != 'gpt-4-0125':
            print('MLVU Open Ended default using gpt-4-0125! So judge model is changed to gpt-4-0125')
            judge_kwargs['model'] = 'gpt-4-0125'
        suffix = eval_file.split('.')[-1]
        score_file = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx')
        tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
        nproc = judge_kwargs.pop('nproc', 4)

        if not osp.exists(score_file):
            data = load(eval_file)
            model_dict = {
                'sub_scene': build_judge(system_prompt=system_prompt_sub_scene, **judge_kwargs),
                'summary': build_judge(system_prompt=system_prompt_summary, **judge_kwargs)
            }
            lt = len(data)
            lines = [data.iloc[i] for i in range(lt)]
            tups = [(model_dict[line['task_type']], line) for line in lines]
            indices = [line['index'] for line in lines]

            ans = {}
            if osp.exists(tmp_file):
                ans = load(tmp_file)
            tups = [x for x, i in zip(tups, indices) if i not in ans]
            indices = [i for i in indices if i not in ans]

            if len(indices):
                _ = track_progress_rich(
                    MLVU_OpenEnded_generate,
                    tups,
                    nproc=nproc,
                    chunksize=nproc,
                    keys=indices,
                    save=tmp_file,
                )
                ans = load(tmp_file)
            data = MLVU_OpenEnded_extract(ans, data)
            dump(data, score_file)

        rating = get_dimension_rating(score_file)
        return rating
VLMEvalKit/vlmeval/dataset/mmbench_video.py
from huggingface_hub import snapshot_download
from ..smp import *
from .video_base import VideoBaseDataset
from .utils import build_judge, DEBUG_MESSAGE
from ..utils import track_progress_rich

FAIL_MSG = 'Failed to obtain answer via API.'


def unwrap_hf_pkl(pth, suffix='.mp4'):
    base_dir = os.path.join(pth, 'video_pkl/')
    target_dir = os.path.join(pth, 'video/')
    pickle_files = [os.path.join(base_dir, file) for file in os.listdir(base_dir)]
    pickle_files.sort()

    if not os.path.exists(target_dir):
        os.makedirs(target_dir, exist_ok=True)
        for pickle_file in pickle_files:
            with open(pickle_file, 'rb') as file:
                video_data = pickle.load(file)
            # For each video file in the pickle file, write its contents to a new mp4 file
            for video_name, video_content in video_data.items():
                output_path = os.path.join(target_dir, f'{video_name}{suffix}')
                with open(output_path, 'wb') as output_file:
                    output_file.write(video_content)
        print('The video file has been restored and stored from the pickle file.')
    else:
        print('The video file already exists.')


class MMBenchVideo(VideoBaseDataset):

    MD5 = '98f7df3eb1007fc375ea6fe88a98e2ff'
    SYS = 'You are an AI assistant responsible for answering questions about videos.'
    FRAMES_TMPL_PACK = """
You will be provided with {} separate frames uniformly sampled from a video, \
the frames are provided in chronological order of the video.
Please analyze these images and provide the answer / answers to the \
following question / questions about the video content.
If multiple questions are provided (with indices I1, I2, I3, ...), \
you should organize your answers in the following json format:
{{
    'I1': 'Answer to Question I1',
    'I2': 'Answer to Question I2',
    ...
}}
Otherwise, please directly reply with your response to the only question.
Even if the information in these separate frames is not enough to give an answer,
PLEASE GIVE A RESPONSE TO EACH OF THE QUESTIONS IN THE FORMAT DESCRIBED ABOVE.
"""

    FRAMES_TMPL_NOPACK = """
You will be provided with {} separate frames uniformly sampled from a video, \
the frames are provided in chronological order of the video.
Please analyze these images and provide the answer to the question about the video content.
Please directly reply with your response to the only question.
"""

    TYPE = 'Video-VQA'

    def __init__(self, dataset='MMBench-Video', pack=False, nframe=0, fps=-1):
        super().__init__(dataset=dataset, pack=pack, nframe=nframe, fps=fps)

    @classmethod
    def supported_datasets(cls):
        return ['MMBench-Video']

    def prepare_dataset(self, dataset_name='MMBench-Video', repo_id='opencompass/MMBench-Video'):
        def check_integrity(pth):
            data_file = osp.join(pth, f'{dataset_name}.tsv')
            if md5(data_file) != self.MD5:
                return False
            data = load(data_file)
            for video_pth in data['video_path']:
                if not osp.exists(osp.join(pth, video_pth)):
                    return False
            return True

        cache_path = get_cache_path(repo_id)
        if cache_path is not None and check_integrity(cache_path):
            dataset_path = cache_path
        else:
            if modelscope_flag_set():
                from modelscope import dataset_snapshot_download
                dataset_path = dataset_snapshot_download(dataset_id=repo_id)
            else:
                dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset')
            unwrap_hf_pkl(dataset_path)
        self.video_path = osp.join(dataset_path, 'video/')
        data_file = osp.join(dataset_path, f'{dataset_name}.tsv')

        return dict(data_file=data_file, root=osp.join(dataset_path, 'video'))

    def build_prompt_pack(self, line):
        if isinstance(line, int):
            assert line < len(self)
            video = self.videos[line]
        elif isinstance(line, pd.Series):
            video = line['video']
        elif isinstance(line, str):
            video = line

        frames = self.save_video_frames(video)
        sub = self.data[self.data['video'] == video]
        sys_prompt = self.SYS + self.FRAMES_TMPL_PACK.format(len(frames))
        message = [dict(type='text', value=sys_prompt)]
        for im in frames:
            message.append(dict(type='image', value=im))
        nq = len(sub)
        prompt = 'Questions: \n{}\nAnswers: \n'
        qs = {int(sub.iloc[i]['index']): sub.iloc[i]['question'] for i in range(nq)}
        prompt = prompt.format(json.dumps(qs))
        message.append(dict(type='text', value=prompt))
        return message

    def build_prompt_nopack(self, line, video_llm):
        if isinstance(line, int):
            assert line < len(self)
            line = self.data.iloc[line]
        if video_llm:
            question = line['question']
            prefix, video_idx_path = os.path.split(line['video_path'])
            message = [dict(type='text', value=question)]
            message.append(dict(type='video', value=os.path.join(self.video_path, video_idx_path)))
            return message
        else:
            frames = self.save_video_frames(line['video'])
            sys_prompt = self.FRAMES_TMPL_NOPACK.format(len(frames))
            message = [dict(type='text', value=sys_prompt)]
            for im in frames:
                message.append(dict(type='image', value=im))
            prompt = 'Question: {}\nAnswer: '.format(line['question'])
            message.append(dict(type='text', value=prompt))
        return message

    def build_prompt(self, line, video_llm):
        if self.pack and not video_llm:
            return self.build_prompt_pack(line)
        else:
            return self.build_prompt_nopack(line, video_llm)

    @staticmethod
    def remove_side_quote(s, syms=[',', '"', "'"]):
        if np.all([x in syms for x in s]):
            return ''
        while s[0] in syms:
            s = s[1:]
        while s[-1] in syms:
            s = s[:-1]
        return s

    @staticmethod
    def robust_json_load(s):
        try:
            jsons = list(extract_json_objects(s))
            assert len(jsons) == 1
            return jsons[0]
        except:
            if '{' in s and s.find('{') == s.rfind('{'):
                sub_str = s[s.find('{') + 1:].strip()
                lines = sub_str.split('\n')
                res = {}
                for l in lines:
                    l = l.strip()
                    if ': ' in l:
                        key = l.split(': ')[0].strip()
                        val = l.split(': ')[1].strip()
                        key = MMBenchVideo.remove_side_quote(key)
                        val = MMBenchVideo.remove_side_quote(val)
                        if len(key) and len(val):
                            res[key] = val
                return res
            return None

    def load_pack_answers(self, data_raw):
        vstats = defaultdict(lambda: 0)
        data = defaultdict(lambda: {})

        for k in data_raw:
            ans = data_raw[k].strip()
            if FAIL_MSG in ans:
                vstats['GEN_FAIL'] += 1
                continue
            res = self.robust_json_load(ans)
            if res is not None:
                data[k] = res
                vstats['PARSE_OK'] += 1
            else:
                vstats['PARSE_FAIL'] += 1

        # return data
        meta = cp.deepcopy(self.data)
        lt = len(meta)
        prediction = []
        for i in range(lt):
            line = meta.iloc[i]
            vid = line['video']
            idx = str(line['index'])
            prediction.append(data[vid][idx] if idx in data[vid] else None)
        meta['prediction'] = prediction
        vstats['VALIDQ'] = len([x for x in prediction if x is not None])
        vstats['INVALIDQ'] = len([x for x in prediction if x is None])
        return meta, vstats

    # It returns a dictionary
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        from .utils.mmbench_video import get_dimension_rating, system_prompt, build_prompt

        assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'

        judge = judge_kwargs['model']
        nproc = judge_kwargs.pop('nproc', 4)

        tmp_file = eval_file.replace('.xlsx', f'_{judge}_tmp.pkl')
        tgt_file = eval_file.replace('.xlsx', f'_{judge}_rating.json')
        score_file = eval_file.replace('.xlsx', f'_{judge}_score.xlsx')

        model = build_judge(system_prompt=system_prompt, **judge_kwargs)
        assert model.working(), 'MMBench-Video evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE

        if not osp.exists(score_file):
            res = {} if not osp.exists(tmp_file) else load(tmp_file)
            res = {k: v for k, v in res.items() if model.fail_msg not in v}

            data = load(eval_file)
            data_un = data[~data['index'].isin(res)]
            data_un = data_un[~pd.isna(data_un['prediction'])]
            lt = len(data_un)
            prompts = [build_prompt(data_un.iloc[i]) for i in range(lt)]
            indices = [data_un.iloc[i]['index'] for i in range(lt)]

            if len(prompts):
                _ = track_progress_rich(
                    model.generate,
                    prompts,
                    keys=indices,
                    save=tmp_file,
                    nproc=nproc,
                    chunksize=nproc
                )
            score_map = load(tmp_file)
            data['score'] = [score_map[idx] if idx in score_map else -1 for idx in data['index']]
            rejected = [x for x in score_map.values() if FAIL_MSG in x]
            data['score'] = [int(x) if istype(x, int) else -1 for x in data['score']]
            print(
                f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(score_map)} questions, '
                f'failed to obtain the score for another {len(rejected)} questions. '
                f'Those questions will be counted as 0 score in ALL rating, and will not be counted in VALID rating.'
            )
            dump(data, score_file)

        rating = get_dimension_rating(score_file)
        dump(rating, tgt_file)
        return rating
VLMEvalKit/vlmeval/dataset/mmgenbench.py
import warnings
import pandas as pd
from abc import abstractmethod
from ..smp import *
from .image_base import ImageBaseDataset


class MMGenBench(ImageBaseDataset):

    prompt_list = [
        """
# Role
You are an expert in the field of image understanding, focusing on the \
understanding of images and generating the image caption-prompt.
# Definition Explanation
image caption-prompt: Refers to the caption or description of an image, \
used to provide to a Text-to-Image model to generate a new image.
Text-to-Image model: Can generate a new image based on the provided image \
caption-prompt, such as stable diffusion 3, flux, and other image generation models.
# Task Description
Generate an image caption-prompt based on the input image.
# Key Points and Requirements
1. Accurately understand the input image and precisely generate an image caption-prompt.
2. The generated image caption-prompt, when provided to the Text-to-Image model, requires the \
Text-to-Image model to generate a new image that is as consistent as possible with the input image.
3. The generated image caption-prompt must conform to the preferences of the Text-to-Image model.
4. The generated image caption-prompt should describe the input image in as much \
detail as possible, and it should be between 20 to 60 words.
# Output Format
A string, that is the image caption-prompt. No extra output needed.
"""
    ]
    TYPE = 'GenerateImgPrompt'
    DATASET_URL = {
        'MMGenBench-Test': 'https://huggingface.co/datasets/lerogo/MMGenBench/resolve/main/MMGenBench-Test.tsv',
        'MMGenBench-Domain': 'https://huggingface.co/datasets/lerogo/MMGenBench/resolve/main/MMGenBench-Domain.tsv',
    }
    PROMPT_MAP = {
        'MMGenBench-Test': prompt_list[0],
        'MMGenBench-Domain': prompt_list[0],
    }
    DATASET_MD5 = {
        'MMGenBench-Test': "94f8dac6bbf7c20be403f99adeaa73da",
        'MMGenBench-Domain': "5c10daf6e2c5f08bdfb0701aa6db86bb",
    }

    def __init__(self, dataset='MMGenBench', **kwargs):
        super().__init__(dataset, **kwargs)
        warnings.warn('This dataset is for inference only and does not support direct output of evaluation results.\n')
        warnings.warn('Please refer to "https://github.com/lerogo/MMGenBench" for more evaluation information.\n')

    def load_data(self, dataset):
        data = super().load_data(dataset)
        if 'question' not in data:
            data['question'] = [(
                self.PROMPT_MAP[dataset]
            )] * len(data)
        return data

    # Given the prediction file, return the evaluation results in the format of a dictionary or pandas dataframe
    @abstractmethod
    def evaluate(self, eval_file, **judge_kwargs):
        warnings.warn('This evaluation method is not supported.\n')
        warnings.warn('Please refer to "https://github.com/lerogo/MMGenBench" for more evaluation information.\n')
        return None
VLMEvalKit/vlmeval/dataset/mmlongbench.py
import re
import math
from urllib.request import urlopen
from PIL import Image, ImageDraw, ImageFont
import torchvision.transforms as transforms

from vlmeval.dataset.utils import build_judge, levenshtein_distance
from vlmeval.smp import *
from .image_base import ImageBaseDataset


FAIL_MSG = 'Failed to obtain answer via API.'


def get_gpt4_ICE():
    example_1 = """
---
Question: List the primary questions asked about the services in this report.
Analysis: The primary questions asked about the services in the report for The Limes Residential Home are:\n\n
1. Is the service safe?\n
2. Is the service effective?\n
3. Is the service caring?\n
4. Is the service responsive?\n
5. Is the service well-led?
Extracted answer: [
    'Is the servife safe?',
    'Is the service effective',
    'Is the serve caring?',
    'Is the service responsive?',
    'Is the service well-led?'
]
Answer format: List\n
"""

    example_2 = """
---
Question: How many regulations of the HSCA 2008 are breached in all according to this report?
Analysis: According to the report, the provider breached 10 Health and Social Care Act 2008 (Regulated Activities)
Regulations in total. Here are the specifics:\n\n1. Regulation 13: Safeguarding service users from abuse and
improper treatment\n2. Regulation 12: Safe care and treatment\n3. Regulation 18: Staffing\n4. Regulation 11:
Need for consent\n5. Regulation 10: Dignity and respect\n6. Regulation 9: Person-centred care\n7. Regulation 17:
Good governance\n8. Regulation 18 (CQC Registration Regulations 2009): Notification of other incidents\n9.
Regulation 18: Failure to maintain an accurate and up-to-date care plan\n10. Regulation 11: Failure to implement
the Mental Capacity Act 2005 code of practice effectively\n\nThese breaches involve issues concerning staffing,
safeguarding, medicines management, dignity and respect, consent, care planning, governance, and failure to
notify the CQC of incidents.
Extracted answer: 10
Answer format: Integer\n
"""

    example_3 = """
---
Question: According to the survey that is the percentage of Chinese who are paying more or
about the same attention to politics after Trump's election?
Analysis: The survey provided does not specify the percentage of Chinese individuals specifically who are paying
more or about the same attention to politics after Trump's election. The report focuses primarily on American
demographics and does not include specific details about the Chinese population in relation to this question. If
you need information about a different demographic or a summary of the findings from the American demographic,
I can certainly help with that!
Extracted answer: Not answerable
Answer format: String\n
"""

    example_4 = """
---
Question: How many quotations from male respondent over 50 years old are included in this report?
Analysis: The image you've provided appears to be a screenshot of a document with multiple charts. However, the
text is too small and blurry to read accurately. If you can provide a clearer image or more context, I might be
able to help you with your question.
Extracted answer: Fail to answer
Answer format: String\n
"""

    return [example_1, example_2, example_3, example_4]


def build_mmlongbench_gpt4_prompt(line):
    task_description = """
Given the question and analysis, you are tasked to extract answers with required formats from the free-form analysis.
- Your extracted answers should be one of the following formats: (1) Integer, (2) Float, (3) String and (4) List.
If you find the analysis the question can not be answered from the given documents, type "Not answerable".
Exception: If the analysis only tells you that it can not read/understand the images or documents,
type "Fail to answer".
- Please make your response as concise as possible. Also note that your response should be formatted as below:
```
Extracted answer: [answer]
Answer format: [answer format]
```
Please read the following example, then extract the answer from the model response
and type it at the end of the prompt.\n
"""
    question = line['question']
    prediction = str(line['prediction'])
    prompt = task_description
    examples = get_gpt4_ICE()
    for example in examples:
        prompt += example
    prompt += '---\nQuestion:' + question + '\n'
    prompt += 'Analysis: ' + prediction

    return prompt


def anls_compute(groundtruth, prediction, threshold=0.5):
    dist = levenshtein_distance(groundtruth, prediction)
    length = max(len(groundtruth.upper()), len(prediction.upper()))
    value = 0.0 if length == 0 else float(dist) / float(length)
    anls = 1.0 - value
    if anls <= threshold:
        anls = 0.0
    return anls
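A short worked example of the ANLS score above: one character edit between two six-character strings gives a normalized distance of 1/6, hence a similarity of about 0.83, which is kept because it exceeds the 0.5 threshold; any similarity at or below the threshold is zeroed out. The strings below are chosen purely for illustration.

# Illustration only: 'report' vs 'reporl' differ by one substitution out of six characters.
print(anls_compute('report', 'reporl'))    # 1 - 1/6 ≈ 0.8333
print(anls_compute('report', 'summary'))   # similarity ≈ 0.14, at or below 0.5, so returned as 0.0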
def is_float_equal(reference, prediction, include_percentage: bool = False, is_close: float = False) -> bool:
    def get_precision(gt_ans: float) -> int:
        precision = 3
        if '.' in str(gt_ans):
            precision = len(str(gt_ans).split('.')[-1])
        return precision

    reference = float(str(reference).strip().rstrip('%').strip())
    try:
        prediction = float(str(prediction).strip().rstrip('%').strip())
    except:
        return False

    if include_percentage:
        gt_result = [reference / 100, reference, reference * 100]
    else:
        gt_result = [reference]
    for item in gt_result:
        try:
            if is_close:
                if math.isclose(item, prediction, rel_tol=0.01):
                    return True
            precision = max(min(get_precision(prediction), get_precision(item)), 2)
            if round(prediction, precision) == round(item, precision):
                return True
        except Exception:
            continue
    return False
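A brief sketch of `is_float_equal` with the flags that `eval_score` passes for Float answers (include_percentage=True, is_close=True); the numbers below are illustrative only.

# 25 matches 0.25 because percentage variants (x/100, x, x*100) of the reference are all accepted.
print(is_float_equal('25', '0.25', include_percentage=True, is_close=True))    # True
# is_close=True allows roughly 1% relative tolerance via math.isclose.
print(is_float_equal('3.14', '3.15', include_percentage=True, is_close=True))  # True
print(is_float_equal('3.14', '3.50', include_percentage=True, is_close=True))  # False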
def get_clean_string(s):
    s = str(s).lower().strip()
    # Strip unit suffixes; assign the stripped result (the original chained
    # calls discarded their return value).
    if s.endswith('mile'):
        s = s.rstrip('mile').strip()
    if s.endswith('miles'):
        s = s.rstrip('miles').strip()
    if s.endswith('million'):
        s = s.rstrip('million').strip()
    # remove parenthesis
    s = re.sub(r'\s*\([^)]*\)', '', s).strip()
    # remove quotes
    s = re.sub(r"^['\"]|['\"]$", '', s).strip()
    s = s.strip().lstrip('$').strip()
    s = s.strip().rstrip('%').strip()
    return s
def
is_exact_match
(
s
):
flag
=
False
# Website
if
'https://'
in
s
:
flag
=
True
# code file
if
s
.
endswith
(
'.py'
)
or
s
.
endswith
(
'ipynb'
):
flag
=
True
if
s
.
startswith
(
'page'
):
flag
=
True
# telephone number
if
re
.
fullmatch
(
r
'\b\d+(-\d+|\s\d+)?\b'
,
s
):
flag
=
True
# time
if
'a.m.'
in
s
or
'p.m.'
in
s
:
flag
=
True
# YYYY-MM-DD
if
re
.
fullmatch
(
r
'\b\d{4}[-\s]\d{2}[-\s]\d{2}\b'
,
s
):
flag
=
True
# YYYY-MM
if
re
.
fullmatch
(
r
'\b\d{4}[-\s]\d{2}\b'
,
s
):
flag
=
True
# Email address
if
re
.
fullmatch
(
r
'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
,
s
):
flag
=
True
return
flag
def
isfloat
(
num
):
try
:
float
(
num
)
return
True
except
ValueError
:
return
False
def
get_font
():
try
:
truetype_url
=
"http://opencompass.openxlab.space/utils/Fonts/SimHei.ttf"
ff
=
urlopen
(
truetype_url
)
font
=
ImageFont
.
truetype
(
ff
,
size
=
40
)
except
Exception
as
e
:
logging
.
warning
(
f
'
{
type
(
e
)
}
:
{
e
}
'
)
logging
.
warning
(
"Fail to download the font. Use the default one."
)
font
=
ImageFont
.
load_default
(
size
=
40
)
return
font
def frame2img(img_path_list, font, save_path=None, idx_start=0):
    imgs = [Image.open(img_path) for img_path in img_path_list]

    new_imgs = []
    for img in imgs:
        w, h = img.size
        scale = w / h
        if w > h:
            new_w = 560 * 2
            new_h = int(560 * 2 / scale)
        else:
            new_w = int(560 * 2 * scale)
            new_h = 560 * 2
        img = transforms.functional.resize(img, [new_h, new_w])
        new_imgs.append(img)
    imgs = new_imgs

    new_w = 0
    new_h = 0
    pad = 40
    if w > h:
        for im in imgs:
            w, h = im.size
            new_w = max(new_w, w)
            new_h += h + 10 + pad
        new_img = Image.new('RGB', (new_w, new_h), 'white')
        draw = ImageDraw.Draw(new_img)
        curr_h = 0
        for idx, im in enumerate(imgs):
            w, h = im.size
            new_img.paste(im, (0, pad + curr_h))
            draw.text((0, curr_h), f'<IMAGE {idx + idx_start}>', font=font, fill='black')
            if idx + 1 < len(imgs):
                draw.line([(0, pad + curr_h + h + 5), (new_w, pad + curr_h + h + 5)], fill='black', width=2)
            curr_h += h + 10 + pad
    else:
        for im in imgs:
            w, h = im.size
            new_w += w + 10
            new_h = max(new_h, h)
        new_h += pad
        new_img = Image.new('RGB', (new_w, new_h), 'white')
        draw = ImageDraw.Draw(new_img)
        curr_w = 0
        for idx, im in enumerate(imgs):
            w, h = im.size
            new_img.paste(im, (curr_w, pad))
            draw.text((curr_w, 0), f'<IMAGE {idx + idx_start}>', font=font, fill='black')
            if idx + 1 < len(imgs):
                draw.line([(curr_w + w + 5, 0), (curr_w + w + 5, new_h)], fill='black', width=2)
            curr_w += w + 10

    if save_path is not None:
        new_img.save(save_path)

    return new_img
def concat_images(image_list, max_concat=1, column_num=1):
    concatenated_images = []
    if column_num == -1:
        MAX_COLUMN_NUM = 20
        max_concat = 1
        while len(image_list) / max_concat > MAX_COLUMN_NUM:
            max_concat += 1
        interval = max(math.ceil(len(image_list) / max_concat), 1)
        for i in range(0, len(image_list), interval):
            batch_images = image_list[i:i + interval]
            concatenated_image = frame2img(batch_images, font=get_font(), idx_start=i)
            concatenated_images.append(concatenated_image)
    else:
        interval = max(math.ceil(len(image_list) / max_concat), 1)
        for i in range(0, len(image_list), interval):
            batch_images = [Image.open(filename) for filename in image_list[i:i + interval]]
            if column_num == 1:
                total_height = batch_images[0].height * len(batch_images)
            else:
                total_height = batch_images[0].height * ((len(batch_images) - 1) // column_num + 1)

            concatenated_image = Image.new('RGB', (batch_images[0].width * column_num, total_height), 'white')

            x_offset, y_offset = 0, 0
            for count, image in enumerate(batch_images):
                concatenated_image.paste(image, (x_offset, y_offset))
                x_offset += image.width
                if (count + 1) % column_num == 0:
                    y_offset += image.height
                    x_offset = 0
            concatenated_images.append(concatenated_image)
    return concatenated_images
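

# Illustrative sketch (hypothetical helper, not used elsewhere in this module): with
# max_concat=1 and column_num=1 all pages of a document are stacked onto a single
# vertical canvas. The temporary files and sizes below are made up for the example.
def _demo_concat_images():
    import tempfile
    with tempfile.TemporaryDirectory() as tmp_dir:
        paths = []
        for i in range(3):
            path = osp.join(tmp_dir, f'page_{i}.png')
            Image.new('RGB', (100, 80), 'white').save(path)
            paths.append(path)
        merged = concat_images(paths, max_concat=1, column_num=1)
        assert len(merged) == 1                  # all pages on one canvas
        assert merged[0].size == (100, 80 * 3)   # stacked vertically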
def eval_score(gt, pred, answer_type):
    if answer_type == 'Int':
        try:
            gt, pred = int(gt), int(float(pred))
        except:
            pred = ''
        score = (gt == pred)
    elif answer_type == 'Float':
        try:
            gt = float(get_clean_string(str(gt)))
            pred = float(get_clean_string(str(pred)))
        except:
            pred = ''
        score = is_float_equal(gt, pred, include_percentage=True, is_close=True)
    elif answer_type == 'Str':
        gt = get_clean_string(gt)
        pred = get_clean_string(pred)
        if is_exact_match(gt):
            score = (gt == pred)
        else:
            score = anls_compute(gt, pred)
    else:
        if isinstance(gt, str) and gt.startswith('['):
            gt = eval(gt)
        if not isinstance(gt, list):
            gt = [gt]
        if isinstance(pred, str) and pred.startswith('['):
            pred = eval(pred)
        if not isinstance(pred, list):
            pred = [pred]
        print(len(gt), len(pred))
        if len(gt) != len(pred):
            score = 0.0
        else:
            gt = sorted([get_clean_string(a) for a in gt])
            pred = sorted([get_clean_string(a) for a in pred])
            print(gt, pred)
            if isfloat(gt[0]) or is_exact_match(gt[0]):
                score = ('-'.join(gt) == '-'.join(pred))
            else:
                score = min([anls_compute(gt_v, pred_v) for gt_v, pred_v in zip(gt, pred)])

    return float(score)
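

# Illustrative sketch (hypothetical helper, not used elsewhere in this module): one case per
# answer type. 'List' stands for any answer_format other than Int/Float/Str, which is handled
# by the final branch above; the answers themselves are made up.
def _demo_eval_score():
    assert eval_score('25', '25.0', 'Int') == 1.0
    assert eval_score('0.5', '50%', 'Float') == 1.0
    assert eval_score('Paris', 'paris', 'Str') == 1.0
    assert eval_score("['a', 'b']", "['b', 'a']", 'List') == 1.0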
def MMLongBench_auxeval(model, line):
    prompt = build_mmlongbench_gpt4_prompt(line)
    log = ''
    retry = 5

    for i in range(retry):
        prediction = line['prediction']
        res = model.generate(prompt, temperature=i * 0.5)

        if FAIL_MSG in res:
            log += f'Try {i}: output is {prediction}, failed to parse.\n'
        else:
            log += 'Succeed'
            try:
                pred = res.split('Answer format:')[0].split('Extracted answer:')[1].strip()
            except:
                pred = ''
            return dict(log=log, res=res, pred=pred)

    log += 'All 5 retries failed.\n'
    return dict(log=log, res='', pred='')
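

# Illustrative sketch (hypothetical, not used elsewhere in this module): the judge model is
# expected to reply with an 'Extracted answer: ...' line, optionally followed by
# 'Answer format: ...'; otherwise MMLongBench_auxeval retries with a higher temperature.
# _StubJudge only mimics that contract; real runs pass the judge built by build_judge(), and
# line is assumed to carry the 'question' and 'prediction' fields used by the prompt builder.
def _demo_auxeval_contract():
    class _StubJudge:
        def generate(self, prompt, temperature=0.0):
            return 'Extracted answer: 42\nAnswer format: Integer'

    line = {'question': 'How many pages mention revenue?', 'prediction': 'The answer is 42.'}
    assert MMLongBench_auxeval(_StubJudge(), line)['pred'] == '42'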
def get_f1(data):
    gt_pos_data = data[data.apply(lambda k: k['answer'] != 'Not answerable', axis=1)]
    pred_pos_data = data[data.apply(lambda k: k['pred'] != 'Not answerable', axis=1)]
    recall = sum(gt_pos_data['score'].tolist()) / len(gt_pos_data)
    precision = sum(pred_pos_data['score'].tolist()) / len(pred_pos_data)
    return 2 * recall * precision / (recall + precision)
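

# Illustrative sketch (hypothetical helper, not used elsewhere in this module): get_f1 treats
# 'Not answerable' as the negative class. Recall averages scores over questions whose ground
# truth is answerable, precision over questions the model answered, and F1 = 2PR / (P + R).
# The tiny frame below is fabricated.
def _demo_get_f1():
    df = pd.DataFrame({
        'answer': ['Paris', '42', 'Not answerable'],
        'pred': ['Paris', 'Not answerable', 'London'],
        'score': [1.0, 0.0, 0.0],
    })
    assert abs(get_f1(df) - 0.5) < 1e-6   # recall = 0.5, precision = 0.5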
def MMLongBench_acc(result_file):
    data = load(result_file)
    overall_score = 0.0
    score_list = list()
    for i in range(len(data)):
        item = data.iloc[i]
        try:
            score = eval_score(item['answer'], item['pred'], item['answer_format'])
        except:
            score = 0.0
        score_list.append(score)
        overall_score += score

    data['score'] = score_list
    dump(data, result_file)

    data_chart = data[data.apply(lambda k: 'Chart' in eval(k['evidence_sources']), axis=1)]
    data_table = data[data.apply(lambda k: 'Table' in eval(k['evidence_sources']), axis=1)]
    data_image = data[data.apply(lambda k: 'Figure' in eval(k['evidence_sources']), axis=1)]
    data_text = data[data.apply(lambda k: 'Pure-text (Plain-text)' in eval(k['evidence_sources']), axis=1)]
    data_layout = data[data.apply(lambda k: 'Generalized-text (Layout)' in eval(k['evidence_sources']), axis=1)]

    data_single = data[data.apply(lambda k: len(eval(k['evidence_pages'])) == 1, axis=1)]
    data_multi = data[data.apply(lambda k: len(eval(k['evidence_pages'])) > 1, axis=1)]
    data_unans = data[data.apply(lambda k: len(eval(k['evidence_pages'])) == 0, axis=1)]

    res = dict()
    res['category'] = [
        'overall_f1', 'overall_acc', 'text', 'layout', 'table', 'chart',
        'image', 'single-page', 'multi-page', 'unanswerable'
    ]
    res['num'] = [
        len(data), len(data), len(data_text), len(data_layout), len(data_table),
        len(data_chart), len(data_image), len(data_single), len(data_multi), len(data_unans)
    ]
    res['avg_score'] = [
        get_f1(data),
        overall_score / len(data),
        sum(data_text['score'].tolist()) / len(data_text) if len(data_text) > 0 else 0.0,
        sum(data_layout['score'].tolist()) / len(data_layout) if len(data_layout) > 0 else 0.0,
        sum(data_table['score'].tolist()) / len(data_table) if len(data_table) > 0 else 0.0,
        sum(data_chart['score'].tolist()) / len(data_chart) if len(data_chart) > 0 else 0.0,
        sum(data_image['score'].tolist()) / len(data_image) if len(data_image) > 0 else 0.0,
        sum(data_single['score'].tolist()) / len(data_single) if len(data_single) > 0 else 0.0,
        sum(data_multi['score'].tolist()) / len(data_multi) if len(data_multi) > 0 else 0.0,
        sum(data_unans['score'].tolist()) / len(data_unans) if len(data_unans) > 0 else 0.0,
    ]
    res = pd.DataFrame(res)
    return res
class MMLongBench(ImageBaseDataset):

    TYPE = 'VQA'

    DATASET_URL = {
        'MMLongBench_DOC': 'https://opencompass.openxlab.space/utils/VLMEval/MMLongBench_DOC.tsv',
    }
    DATASET_MD5 = {
        'MMLongBench_DOC': '9b393e1f4c52718380d50586197eac9b',
    }

    SUPPORTED_MODELS = {
        'GPT4': (1, 1),
        'GPT4V': (1, 1),
        'GPT4V_HIGH': (1, 1),
        'GPT4o': (1, 1),
        'GPT4o_HIGH': (1, 1),
        'GPT4o_MINI': (1, 1),
        'MiniCPM-Llama3-V-2_5': (1, 5),
        'InternVL-Chat-V1-5': (5, 2),
        'XComposer2_4KHD': (1, 5),
        'XComposer2d5': (1, -1),
    }

    def __init__(self, dataset, **kwargs):
        self.model_list = list(self.SUPPORTED_MODELS.keys())
        model_name = kwargs['model']
        if not listinstr(self.model_list, model_name):
            raise AssertionError("{} doesn't support the evaluation on MMLongBench_DOC.".format(model_name))
        super(MMLongBench, self).__init__(dataset)

        self.is_api = True if listinstr(['GPT4'], model_name) else False
        self.max_pages = 120
        concat_num, column_num = self.SUPPORTED_MODELS.get(model_name)
        self.concat_num = concat_num
        self.column_num = column_num

    def dump_image(self, origin_line):
        os.makedirs(self.img_root, exist_ok=True)
        try:
            import fitz
        except Exception as e:
            logging.critical(f'{type(e)}: {e}')
            logging.critical('Please use `pip install pymupdf` to parse PDF files.')

        line = origin_line.copy()
        line['image_path'] = line['image_path'][:self.max_pages]

        skip_pdf_parse = True
        for im_name in line['image_path']:
            path = osp.join(self.img_root, im_name)
            if not read_ok(path):
                skip_pdf_parse = False
                break

        # Just for being compatible with the zipped loop: zip(line['image'], line['image_path'])
        if skip_pdf_parse:
            line['image'] = line['image_path']
        else:
            pdf_data = base64.b64decode(line['image'])
            pdf_file = io.BytesIO(pdf_data)
            encoded_images = []
            with fitz.open(stream=pdf_file, filetype='pdf') as doc:
                doc = doc[:self.max_pages]
                for page in doc:
                    image = page.get_pixmap(dpi=144)
                    image_file = io.BytesIO(image.tobytes(output='png'))
                    image = Image.open(image_file)
                    encoded_image = encode_image_to_base64(image)
                    encoded_images.append(encoded_image)
            line['image'] = encoded_images
            print('process {}'.format(line['doc_id']))

        if 'image' in line:
            if isinstance(line['image'], list):
                tgt_path = []
                assert 'image_path' in line
                for img, im_name in zip(line['image'], line['image_path']):
                    path = osp.join(self.img_root, im_name)
                    if not read_ok(path):
                        decode_base64_to_image_file(img, path)
                    tgt_path.append(path)
            else:
                tgt_path = osp.join(self.img_root, f"{line['index']}.jpg")
                if not read_ok(tgt_path):
                    decode_base64_to_image_file(line['image'], tgt_path)
                tgt_path = [tgt_path]
        else:
            assert 'image_path' in line
            tgt_path = toliststr(line['image_path'])

        if self.concat_num > 0 and not self.is_api:
            concatenated_images = concat_images(tgt_path, max_concat=self.concat_num, column_num=self.column_num)

            old_tgt_path = tgt_path
            assert isinstance(old_tgt_path, list)
            if self.column_num != -1:
                tgt_path = [
                    '_'.join(old_tgt_path[0].split('_')[:-1]) + '_concat{}_{}.jpg'.format(self.concat_num, i)
                    for i in range(len(concatenated_images))
                ]
            else:
                tgt_path = [
                    '_'.join(old_tgt_path[0].split('_')[:-1]) + '_concat_all_{}.jpg'.format(i)
                    for i in range(len(concatenated_images))
                ]

            for path, concatenated_image in zip(tgt_path, concatenated_images):
                if not read_ok(path):
                    decode_base64_to_image_file(encode_image_to_base64(concatenated_image), path)
                    num_images, image_size = len(old_tgt_path), concatenated_image.size
                    print('concat {} images to a new one with size {}. save at {}'.format(num_images, image_size, path))
        return tgt_path

    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        logger = get_logger('Evaluation')
        model = judge_kwargs['model']

        suffix = eval_file.split('.')[-1]
        storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
        tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')

        if osp.exists(storage):
            logger.warning(f'GPT scoring file {storage} already exists, will reuse it in MMLongBench_eval. ')
        else:
            data = load(eval_file)
            model = build_judge(max_tokens=128, **judge_kwargs)
            lt = len(data)
            lines = [data.iloc[i] for i in range(lt)]
            tups = [(model, line) for line in lines]
            indices = [line['index'] for line in lines]

            ans = {}
            if osp.exists(tmp_file):
                ans = load(tmp_file)
            tups = [x for x, i in zip(tups, indices) if i not in ans]
            indices = [i for i in indices if i not in ans]

            if len(indices):
                new_results = list()
                for model, line in tqdm(tups):
                    res = MMLongBench_auxeval(model, line)
                    new_results.append(res)

                log_map, res_map, pred_map = {}, {}, {}
                all_inds = [line['index'] for line in lines]
                for k, v in zip(all_inds, new_results):
                    log_map[k] = v['log']
                    res_map[k] = v['res']
                    pred_map[k] = v['pred']
                data['res'] = [res_map[idx] for idx in data['index']]
                data['log'] = [log_map[idx] for idx in data['index']]
                data['pred'] = [pred_map[idx] for idx in data['index']]
                dump(data, storage)

        score = MMLongBench_acc(storage)
        score_pth = storage.replace('.xlsx', '_score.csv')

        dump(score, score_pth)
        logger.info(f'MMLongBench_eval successfully finished evaluating {eval_file}, results saved in {score_pth}')
        logger.info('Score: ')
        logger.info(score)
VLMEvalKit/vlmeval/dataset/mmmath.py
0 → 100644
View file @
bc5ebf0f
import re
import json
import sympy as sp
import numpy as np
from sympy import simplify, Eq, sympify, Pow, pi
from sympy.parsing.latex import parse_latex
import sys
import math
import os
import argparse

from .image_base import ImageBaseDataset
from ..utils import track_progress_rich
from ..smp import load, dump
class AutoScoringJudge:
    def __init__(self):
        # Map of special symbols to their replacements
        self.special_signal_map = {
            "\\left": "",
            "\\right": "",
            "厘米": "",
            # "∶": ":",
            ",": ",",
            "$": "",
            "(": "(",
            ")": ")",
            "\\infty": "oo",
            "\\colon ": ":",
            # "\\approx": "=",
            # "\\simeq": "=",
            # "\\sim": "=",
            # "^\\prime": "'",
            # "^{\\prime}": "'",
            "+": "+",
            "\\, ": "",
            "\\,": "",
            "^\\circ": "",
            "^{\\circ}": "",
            # "%": "",
        }
        self.pi = parse_latex("\\pi")
        # MM-Math default precision
        self.precision = 1e-2

    def trans_greater_sign_to_interval(self, expr: str):
        expr_tmp = expr.split("<")
        return "(" + expr_tmp[0] + ", " + expr_tmp[-1] + ")"

    def split_by_comma(self, expr: str):
        # Splits expressions by commas outside of brackets
        in_bracket_num = 0
        splitted_expr = []
        start_idx = 0
        for i, char in enumerate(expr):
            if char in ["(", "["]:
                in_bracket_num += 1
            elif char in [")", "]"]:
                in_bracket_num -= 1
            elif char == "," and in_bracket_num == 0:
                splitted_expr.append(expr[start_idx:i].strip())
                start_idx = i + 1

        if start_idx < len(expr):
            splitted_expr.append(expr[start_idx:].strip())

        return splitted_expr

    def trans_plus_minus_sign(self, expr_list: list):
        # Translates plus-minus signs into separate expressions
        new_expr_list = []
        for expr in expr_list:
            if "\\pm" in expr:
                new_expr_list.append(expr.replace("\\pm", "+"))
                new_expr_list.append(expr.replace("\\pm", "-"))
            else:
                new_expr_list.append(expr)

        return new_expr_list

    def judge(self, expression1, expression2, precision=1e-2):
        # Judge if two expressions are equal (expression1 is considered as the Ground Truth)
        # Default precision is a list for supporting multiple expressions
        precision = precision if isinstance(precision, list) else [precision]

        try:
            expression1, expression2 = self.preprocess(expression1, expression2)
        except:
            return False
        if expression1 == expression2:
            # print("Exactly equal")
            return True

        # Remove Chinese characters from the string, as answers like "yes" or "no" in Chinese have been considered
        expression1 = expression1 if re.fullmatch(r'[\u4e00-\u9fff]+', expression1) else re.sub(r'[\u4e00-\u9fff]+', '', expression1)  # noqa: E501
        expression2 = expression2 if re.fullmatch(r'[\u4e00-\u9fff]+', expression2) else re.sub(r'[\u4e00-\u9fff]+', '', expression2)  # noqa: E501

        # Check if two < or > in expression
        if self.is_two_greater_sign(expression1):
            expression1 = self.trans_greater_sign_to_interval(expression1)

        if self.is_two_greater_sign(expression2):
            expression2 = self.trans_greater_sign_to_interval(expression2)

        expression1 = self.split_by_comma(expression1)
        expression2 = self.split_by_comma(expression2)

        temp_list1 = self.trans_plus_minus_sign(expression1)
        temp_list2 = self.trans_plus_minus_sign(expression2)

        # Set up a list for allowed errors
        if len(precision) <= 1:
            precision = precision * len(temp_list1)

        if len(temp_list1) != len(temp_list2):
            return False

        # Check if elements in both lists can be paired and are equal
        idx = -1
        while len(temp_list1) != 0:
            idx = (idx + 1) % len(temp_list1)

            item1 = temp_list1[idx]
            self.precision = precision[idx]

            for item2 in temp_list2:
                if self.is_equal(item1, item2):
                    temp_list1.remove(item1)
                    temp_list2.remove(item2)
                    precision.remove(self.precision)
                    break
            else:
                # If no match was found, return False
                return False

        # If all elements are matched, return True
        return True

    def is_interval(self, expr):
        # Checks if an expression is an interval
        return expr.startswith(("(", "[")) and expr.endswith((")", "]"))

    def is_two_greater_sign(self, expr):
        match = re.findall(r'<', expr)
        return len(match) == 2

    def sympy_sub_pi(self, expression_sympy):
        # Replaces the symbol for pi in sympy expressions with its numerical value
        return expression_sympy.subs(self.pi, math.pi)

    def is_equal(self, expression1, expression2):
        # Default first expression is ground truth. Check if expressions are equal in different aspects
        if expression1 == expression2 and expression1 != "" and expression2 != "":
            # print("Equivalent natively")
            return True

        # First check if both are intervals
        if self.is_interval(expression1) and self.is_interval(expression2):
            try:
                if self.interval_equal(expression1, expression2):
                    # print("Interval equivalent")
                    return True
            except:
                return False

        # Then check for numerical equality
        try:
            if self.numerical_equal(expression1, expression2):
                # print("Numerically equivalent")
                return True
        except:
            pass

        # Then check if expressions are mathematically equal
        try:
            if self.expression_equal(expression1, expression2) and not ("=" in expression1 and "=" in expression2):
                # print("Expression equivalent")
                return True
        except:
            pass

        # Lastly, check for equation equality
        try:
            if self.equation_equal(expression1, expression2):
                # print("Equation equivalent")
                return True
        except:
            pass

        return False

    def numerical_equal(self, expression1: str, expression2: str, include_percentage: bool = True):
        # Check if two numerical values are equal within an allowed error range
        # Includes possible percentage cases
        reference = float(expression1)
        prediction = float(expression2)

        if include_percentage:
            gt_result = [reference / 100, reference, reference * 100]
        else:
            gt_result = [reference]

        for item in gt_result:
            if abs(item - prediction) <= self.precision * 1.01:
                return True

        return False

    def expression_equal(self, exp1, exp2):
        # Check if two expressions are mathematically equivalent
        # Extract expression and use sympy for equivalence checking
        def extract_expression(expression):
            if "=" in expression:
                expression = expression.split("=")[1]
            return expression.strip()

        exp1 = extract_expression(exp1)
        exp2 = extract_expression(exp2)

        exp_too_long = len(exp1) > 300 or len(exp2) > 300

        expr1_sym = sympify(parse_latex(exp1))
        expr2_sym = sympify(parse_latex(exp2))

        if expr1_sym == expr2_sym:
            return True
        else:
            expr1_sym = self.sympy_sub_pi(expr1_sym)
            expr2_sym = self.sympy_sub_pi(expr2_sym)

            if (expr1_sym.has(sp.Symbol) and not expr2_sym.has(sp.Symbol)) or \
                    (not expr1_sym.has(sp.Symbol) and expr2_sym.has(sp.Symbol)):
                return False
            elif not expr1_sym.has(sp.Symbol) and not expr2_sym.has(sp.Symbol):
                try:
                    if not (self.can_compute_power(expr1_sym) and self.can_compute_power(expr2_sym)):
                        print(
                            "These two numbers cannot be calculated by the current computer for: "
                            f"\"{str(expr1_sym)}\" and \"{str(expr2_sym)}\""
                        )
                        return False
                    if exp_too_long:
                        print(f'Expression {exp1} or {exp2} is too long to compute. ')
                        return False
                    if abs(expr1_sym.evalf() - expr2_sym.evalf()) <= self.precision * 1.01:
                        return True
                    else:
                        return False
                except:
                    return False
            elif exp_too_long:
                print(f'Expression {exp1} or {exp2} is too long to compute. ')
                return False
            else:
                try:
                    simplified_expr = simplify(expr1_sym - expr2_sym)
                    num_value = simplified_expr.evalf()
                    return abs(num_value) < 1e-3
                except:
                    return False

    def equation_equal(self, expression1, expression2):
        # Check if two equations are mathematically equivalent
        # Simplify equations and use sympy for equivalence checking
        def simplify_equation(latex_eq):
            lhs, rhs = latex_eq.split('=')

            lhs_expr = parse_latex(lhs)
            rhs_expr = parse_latex(rhs)

            equation = Eq(lhs_expr, rhs_expr)

            simplified_eq = simplify(equation.lhs - equation.rhs)

            return simplified_eq

        expr1_sym = simplify_equation(expression1)
        expr2_sym = simplify_equation(expression2)

        division_result_1 = simplify(expr1_sym / expr2_sym)
        division_result_2 = simplify(expr2_sym / expr1_sym)

        if ((division_result_1.is_Integer and division_result_1 != 0) or  # noqa: W504
                (division_result_2.is_Integer and division_result_2 != 0)):
            return True
        else:
            return False

    def interval_equal(self, expression1, expression2):
        # Check if two intervals are mathematically equivalent
        def compare_two_interval(inter1, inter2):
            if inter1[0] != inter2[0] or inter1[-1] != inter2[-1]:
                return False

            inter1 = inter1.strip('[]()')
            inter2 = inter2.strip('[]()')

            items_1 = inter1.split(',')
            items_2 = inter2.split(',')

            for item_1, item_2 in zip(items_1, items_2):
                if not self.expression_equal(item_1, item_2):
                    return False
            return True

        interval1 = expression1
        interval2 = expression2

        if interval1 == interval2:
            return True
        else:
            inter_list1 = interval1.split("\\cup")
            inter_list2 = interval2.split("\\cup")

            if len(inter_list1) != len(inter_list2):
                return False
            else:
                for inter1, inter2 in zip(inter_list1, inter_list2):
                    if not compare_two_interval(inter1, inter2):
                        return False
                return True

    def preprocess(self, expression1, expression2):
        # Preprocess expressions to extract and replace special symbols
        def extract_boxed_content(latex_str):
            boxed_matches = re.finditer(r'\\boxed{', latex_str)
            results = ""

            for match in boxed_matches:
                start_index = match.end()
                end_index = start_index
                stack = 1

                while stack > 0 and end_index < len(latex_str):
                    if latex_str[end_index] == '{':
                        stack += 1
                    elif latex_str[end_index] == '}':
                        stack -= 1
                    end_index += 1

                if stack == 0:
                    content = latex_str[start_index:end_index - 1]
                    results += content + ","
                else:
                    raise ValueError("Mismatched braces in LaTeX string.")

            if results == "":
                last_line_ans = latex_str.strip().split("\n")[-1]
                dollar_pattern = r"\$(.*?)\$"
                answers = re.findall(dollar_pattern, last_line_ans)

                if answers:
                    for ans in answers:
                        results += ans + ","
                else:
                    results = latex_str

            return results

        def sepcial_symbol_replace(expression):
            expression = expression.replace("\\text{cm}^2", '').replace("\\text{cm}", "").replace("\\,cm", '').replace("\\text{ cm}", '').replace("cm", '').replace("\\text{分米}^2", '').replace("cm^{2}", '').replace("60 \\text{ cm}^2", '').replace("\\\\text{m}", "").replace("\\text{米}", "").strip()  # noqa: E501
            expression = re.sub(r"(.+)m$", r"\1", expression)

            if "\\in " in expression:
                expression = expression.split("\\in ")[1]

            for signal in self.special_signal_map:
                expression = expression.replace(signal, self.special_signal_map[signal])

            expression = re.sub(r'(\\sin|\\cos|\\tan)(\d+)', r'\1((\2/180)\\pi)', expression)

            expression = expression.strip("\n,.:;^_=+`!@#%^&*~,。")

            pattern = r'\\(?:mathrm|mathbf)\{~?([^}]*)\}'
            expression = re.sub(pattern, r'\1', expression)

            return expression

        exp1, exp2 = extract_boxed_content(expression1), extract_boxed_content(expression2)
        exp1, exp2 = sepcial_symbol_replace(exp1), sepcial_symbol_replace(exp2)

        return exp1, exp2

    def can_compute_power(self, expr):
        # Checks if a power expression can be computed
        if isinstance(expr, Pow):
            base, exp = expr.as_base_exp()
            if base.is_number and exp.is_number:
                MAX_EXP = 1000  # Adjust based on computing environment
                if abs(exp.evalf()) > MAX_EXP:
                    return False
                else:
                    return True
            else:
                return False
        else:
            return True  # Not a power expression, can compute
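

# Illustrative sketch (hypothetical helper, not used elsewhere in this module): typical use of
# AutoScoringJudge. Both strings are reduced to the content of \boxed{...} (or a trailing
# $...$ answer), normalized, and compared numerically/symbolically within the default 1e-2
# precision. The answers below are made up.
def _demo_auto_scoring_judge():
    judge = AutoScoringJudge()
    gt = '\\boxed{\\frac{1}{2}}'
    assert judge.judge(gt, 'The area is $0.5$')   # numeric match
    assert judge.judge(gt, '\\boxed{0.51}')       # still within the 1e-2 tolerance
    assert not judge.judge(gt, '\\boxed{0.6}')    # outside the tolerance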
class MMMath(ImageBaseDataset):

    TYPE = 'VQA'

    DATASET_URL = {
        'MM-Math': 'https://opencompass.openxlab.space/utils/VLMEval/MM-Math.tsv',
    }

    DATASET_MD5 = {
        'MM-Math': '1f064ed7c4e0e8926a3fa65849419ca5',
    }

    @classmethod
    def evaluate(self, eval_file, **kwargs):
        data = load(eval_file)
        judger = AutoScoringJudge()
        func = judger.judge
        tups = [dict(expression1=x, expression2=y) for x, y in zip(data['answer'], data['prediction'])]
        res = track_progress_rich(func, tups, nproc=16)

        data['hit'] = res
        dump(data, eval_file)

        score_file = eval_file.replace('.xlsx', '_score.json')
        score = {}
        score['overall'] = np.mean(data['hit'])

        # Results by Difficulty
        difficulties = set(data['difficulty'])
        for d in difficulties:
            score[f'Difficulty-{d}'] = np.mean(data[data['difficulty'] == d]['hit'])

        # Results by Year
        years = set(data['year'])
        for y in years:
            score[f'Year-{y}'] = np.mean(data[data['year'] == y]['hit'])

        # Results by Knowledge-L1
        points = set(data['knowledge_l1'])
        for p in points:
            score[f'Knowledge-L1-{p}'] = np.mean(data[data['knowledge_l1'] == p]['hit'])

        # Results by Knowledge-L2
        points = set(data['knowledge_l2'])
        for p in points:
            score[f'Knowledge-L2-{p}'] = np.mean(data[data['knowledge_l2'] == p]['hit'])

        dump(score, score_file)
        return score