ModelZoo / Qwen2-VL_pytorch / Commits

Commit bc5ebf0f, authored Dec 27, 2024 by luopl
Initial commit
Pipeline #2167 canceled with stages. Changes: 260. Pipelines: 1.

Showing 20 changed files with 3128 additions and 0 deletions (+3128 -0)
Changed files shown below:

VLMEvalKit/vlmeval/vlm/misc/minigpt4_7b_eval.yaml    +38   -0
VLMEvalKit/vlmeval/vlm/misc/minigptv2_eval.yaml      +36   -0
VLMEvalKit/vlmeval/vlm/mixsense.py                   +46   -0
VLMEvalKit/vlmeval/vlm/mmalaya.py                    +379  -0
VLMEvalKit/vlmeval/vlm/molmo.py                      +69   -0
VLMEvalKit/vlmeval/vlm/monkey.py                     +165  -0
VLMEvalKit/vlmeval/vlm/moondream.py                  +173  -0
VLMEvalKit/vlmeval/vlm/mplug_owl2.py                 +126  -0
VLMEvalKit/vlmeval/vlm/mplug_owl3.py                 +336  -0
VLMEvalKit/vlmeval/vlm/nvlm.py                       +148  -0
VLMEvalKit/vlmeval/vlm/omchat.py                     +159  -0
VLMEvalKit/vlmeval/vlm/omnilmm.py                    +183  -0
VLMEvalKit/vlmeval/vlm/open_flamingo.py              +100  -0
VLMEvalKit/vlmeval/vlm/ovis.py                       +307  -0
VLMEvalKit/vlmeval/vlm/paligemma.py                  +44   -0
VLMEvalKit/vlmeval/vlm/pandagpt.py                   +63   -0
VLMEvalKit/vlmeval/vlm/parrot.py                     +216  -0
VLMEvalKit/vlmeval/vlm/phi3_vision.py                +162  -0
VLMEvalKit/vlmeval/vlm/pixtral.py                    +70   -0
VLMEvalKit/vlmeval/vlm/points.py                     +308  -0
VLMEvalKit/vlmeval/vlm/misc/minigpt4_7b_eval.yaml  (new file, 0 → 100644)

model:
  arch: minigpt4
  model_type: pretrain_vicuna_7b
  max_txt_len: 160
  end_sym: "###"
  low_resource: True
  prompt_template: '###Human: {} ###Assistant: '
  ckpt: "please set this value to the path of pretrained checkpoint"

  # vit encoder
  image_size: 224
  drop_path_rate: 0
  use_grad_checkpoint: False
  vit_precision: "fp16"
  freeze_vit: True
  freeze_qformer: True

  # Q-Former
  num_query_token: 32

  # generation configs
  prompt: ""

  llama_model: "please set this value to the path of vicuna-7b-v0"

datasets:
  cc_sbu_align:
    vis_processor:
      train:
        name: "blip2_image_eval"
        image_size: 224
    text_processor:
      train:
        name: "blip_caption"

run:
  task: image_text_pretrain
VLMEvalKit/vlmeval/vlm/misc/minigptv2_eval.yaml  (new file, 0 → 100644)

model:
  arch: minigpt_v2
  model_type: pretrain
  max_txt_len: 160
  end_sym: "</s>"
  low_resource: True
  prompt_template: '[INST] {} [/INST]'
  ckpt: "please set this value to the path of pretrained checkpoint"
  lora_r: 64
  lora_alpha: 16

  # vit encoder
  image_size: 448
  drop_path_rate: 0
  use_grad_checkpoint: False
  vit_precision: "fp16"
  freeze_vit: True

  # generation configs
  prompt: ""

  # LLM
  llama_model: "please set this value to the path of llama2-chat-7b"

datasets:
  cc_sbu_align:
    vis_processor:
      train:
        name: "blip2_image_eval"
        image_size: 448
    text_processor:
      train:
        name: "blip_caption"

run:
  task: image_text_pretrain
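The two eval configs above are plain MiniGPT-4-style YAML files; the ckpt and llama_model fields are placeholders that must point at local checkpoints before evaluation. A minimal sketch of loading and sanity-checking such a config, assuming PyYAML is installed and the path below matches your checkout (both are assumptions, not part of the commit):

import yaml

# Hypothetical path; adjust to where the repository is checked out.
CFG_PATH = 'VLMEvalKit/vlmeval/vlm/misc/minigpt4_7b_eval.yaml'

with open(CFG_PATH) as f:
    cfg = yaml.safe_load(f)

# The commit ships placeholder strings for the checkpoint paths,
# so fail early if they were never filled in.
for key in ('ckpt', 'llama_model'):
    value = cfg['model'][key]
    if value.startswith('please set'):
        raise ValueError(f'model.{key} still holds the placeholder: {value!r}')

print(cfg['model']['arch'], cfg['model']['image_size'])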
VLMEvalKit/vlmeval/vlm/mixsense.py  (new file, 0 → 100644)

import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image
import warnings
from .base import BaseModel
from ..smp import *


class LLama3Mixsense(BaseModel):

    INSTALL_REQ = False
    INTERLEAVE = False

    def __init__(self, model_path='Zero-Vision/Llama-3-MixSenseV1_1', **kwargs):
        assert model_path is not None
        transformers.logging.set_verbosity_error()
        transformers.logging.disable_progress_bar()
        warnings.filterwarnings('ignore')
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path, trust_remote_code=True
        ).to('cuda').eval()
        self.kwargs = kwargs

    def generate_inner(self, message, dataset=None):
        prompt, image_path = self.message_to_promptimg(message)
        input_ids = self.model.text_process(prompt, self.tokenizer).to(device='cuda')
        image = Image.open(image_path).convert('RGB')
        image_tensor = self.model.image_process([image]).to(dtype=self.model.dtype, device='cuda')
        # generate
        with torch.inference_mode():
            output_ids = self.model.generate(
                input_ids,
                images=image_tensor,
                max_new_tokens=2048,
                use_cache=True,
                eos_token_id=[
                    self.tokenizer.eos_token_id,
                    self.tokenizer.convert_tokens_to_ids(['<|eot_id|>'])[0],
                ],
            )
        return self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
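LLama3Mixsense follows the BaseModel interface used by every wrapper in this commit: generate_inner receives a list of {'type': ..., 'value': ...} dicts, the same format the __main__ example at the end of mmalaya.py below uses. A minimal usage sketch, assuming the VLMEvalKit package root is on sys.path, a CUDA device is available, and the image path exists (all assumptions):

from vlmeval.vlm.mixsense import LLama3Mixsense

# Downloads 'Zero-Vision/Llama-3-MixSenseV1_1' unless a local path is given.
model = LLama3Mixsense()

message = [
    {'type': 'image', 'value': './assets/apple.jpg'},   # hypothetical local image
    {'type': 'text', 'value': 'Describe this image in one sentence.'},
]
print(model.generate_inner(message))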
VLMEvalKit/vlmeval/vlm/mmalaya.py  (new file, 0 → 100644)

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel
import warnings
from .base import BaseModel
from PIL import Image
from ..smp import *
from ..dataset import DATASET_TYPE
import pandas as pd
import string
import torchvision.transforms as T
import transformers
from torchvision.transforms.functional import InterpolationMode


class MMAlaya(BaseModel):

    INSTALL_REQ = False
    INTERLEAVE = False

    def __init__(self, model_path='DataCanvas/MMAlaya', **kwargs):
        assert model_path is not None
        self.model_path = model_path
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            model_path, device_map='cpu', trust_remote_code=True
        ).eval()
        # need initialize tokenizer
        model.initialize_tokenizer(self.tokenizer)
        self.model = model.cuda()
        self.kwargs = kwargs
        warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
        torch.cuda.empty_cache()

    def generate_inner(self, message, dataset=None):
        # read image
        prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
        image = Image.open(image_path).convert('RGB')
        # tokenize prompt, and preprocess image
        input_ids, image_tensor, stopping_criteria = self.model.prepare_for_inference(
            prompt, self.tokenizer, image, return_tensors='pt'
        )
        with torch.inference_mode():
            output_ids = self.model.generate(
                inputs=input_ids.cuda(),
                images=image_tensor.cuda(),
                do_sample=False,
                max_new_tokens=512,
                num_beams=1,
                use_cache=True,
                stopping_criteria=[stopping_criteria],
            )
        # truncate input_ids in generate_ids and then decode to text
        input_token_len = input_ids.shape[1]
        response = self.tokenizer.batch_decode(
            output_ids[:, input_token_len:].cpu(),
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )[0].strip()
        return response


IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)


def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose(
        [
            T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
            T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
            T.ToTensor(),
            T.Normalize(mean=MEAN, std=STD),
        ]
    )
    return transform


def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    best_ratio_diff = float('inf')
    best_ratio = (1, 1)
    area = width * height
    for ratio in target_ratios:
        target_aspect_ratio = ratio[0] / ratio[1]
        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
        if ratio_diff < best_ratio_diff:
            best_ratio_diff = ratio_diff
            best_ratio = ratio
        elif ratio_diff == best_ratio_diff:
            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                best_ratio = ratio
    return best_ratio


def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnail=False):
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    # calculate the existing image aspect ratio
    target_ratios = set(
        (i, j)
        for n in range(min_num, max_num + 1)
        for i in range(1, n + 1)
        for j in range(1, n + 1)
        if i * j <= max_num and i * j >= min_num
    )
    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

    # find the closest aspect ratio to the target
    target_aspect_ratio = find_closest_aspect_ratio(
        aspect_ratio, target_ratios, orig_width, orig_height, image_size
    )

    # calculate the target width and height
    target_width = image_size * target_aspect_ratio[0]
    target_height = image_size * target_aspect_ratio[1]
    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

    # resize the image
    resized_img = image.resize((target_width, target_height))
    processed_images = []
    for i in range(blocks):
        box = (
            (i % (target_width // image_size)) * image_size,
            (i // (target_width // image_size)) * image_size,
            ((i % (target_width // image_size)) + 1) * image_size,
            ((i // (target_width // image_size)) + 1) * image_size,
        )
        # split the image
        split_img = resized_img.crop(box)
        processed_images.append(split_img)
    assert len(processed_images) == blocks
    if use_thumbnail and len(processed_images) != 1:
        thumbnail_img = image.resize((image_size, image_size))
        processed_images.append(thumbnail_img)
    return processed_images


def load_image(image_file, input_size=448, max_num=6, upscale=False):
    image = Image.open(image_file).convert('RGB')
    if upscale:
        image = image.resize((image.width * 2, image.height * 2), Image.BILINEAR)
    transform = build_transform(input_size=input_size)
    images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    pixel_values = [transform(image) for image in images]
    pixel_values = torch.stack(pixel_values)
    return pixel_values


# This function is used to split InternVL2-Llama3-76B
def split_model(model_name):
    import math

    device_map = {}
    num_gpus = torch.cuda.device_count()
    rank, world_size = get_rank_and_world_size()
    num_gpus = num_gpus // world_size
    assert num_gpus >= 1
    if num_gpus == 1:
        return device_map

    num_layers = {
        'InternVL2-8B': 32, 'InternVL2-26B': 48,
        'InternVL2-40B': 60, 'InternVL2-Llama3-76B': 80
    }[model_name]
    # Since the first GPU will be used for ViT, treat it as 0.5 GPU.
    num_layers_per_gpu = math.ceil(num_layers / (num_gpus - 0.5))
    num_layers_per_gpu = [num_layers_per_gpu] * num_gpus
    num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
    layer_cnt = 0
    for i, num_layer in enumerate(num_layers_per_gpu):
        for j in range(num_layer):
            device_map[f'language_model.model.layers.{layer_cnt}'] = rank + world_size * i
            layer_cnt += 1
    device_map['vision_model'] = rank
    device_map['mlp1'] = rank
    device_map['language_model.model.tok_embeddings'] = rank
    device_map['language_model.model.embed_tokens'] = rank
    device_map['language_model.output'] = rank
    device_map['language_model.model.norm'] = rank
    device_map['language_model.lm_head'] = rank
    device_map[f'language_model.model.layers.{num_layers - 1}'] = rank
    return device_map


class MMAlaya2(BaseModel):
    """
    This implementation fine-tunes 20 LoRA modules based on the InternVL-Chat-V1-5 model.
    The fine-tuned LoRA modules are then merged with the InternVL-Chat-V1-5 model
    using the PEFT model merging method, TIES.
    The code is based on the implementation in `vlmeval/vlm/internvl_chat.py`.
    """

    INSTALL_REQ = False
    INTERLEAVE = True

    def __init__(
        self,
        model_path='DataCanvas/MMAlaya2',
        load_in_8bit=False,
        **kwargs,
    ):
        assert model_path is not None
        assert version_cmp(transformers.__version__, '4.36.2', 'ge')
        self.model_path = model_path
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=False)

        # Regular expression to match the pattern "Image" followed by a number, e.g. Image1
        self.pattern = r'Image(\d+)'
        # Replacement pattern to insert a hyphen between "Image" and the number, e.g. Image-1
        self.replacement = r'Image-\1'

        # Convert InternVL2 response to dataset format
        # e.g. Image1 -> Image-1
        # Regular expression to match the pattern "Image-" followed by a number
        self.reverse_pattern = r'Image-(\d+)'
        # Replacement pattern to remove the hyphen (Image-1 -> Image1)
        self.reverse_replacement = r'Image\1'

        device_map = split_model('InternVL2-26B')
        if len(device_map) == 0:
            device_map = {'': 'cuda'}
        self.model = AutoModel.from_pretrained(
            model_path,
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
            load_in_8bit=load_in_8bit,
            device_map=device_map
        ).eval()

        self.image_size = self.model.config.vision_config.image_size
        kwargs_default = dict(do_sample=False, max_new_tokens=1024, top_p=None, num_beams=1)
        kwargs_default.update(kwargs)
        self.kwargs = kwargs_default
        warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')

    def use_custom_prompt(self, dataset):
        assert dataset is not None
        if listinstr(['MMDU', 'MME-RealWorld', 'MME-RealWorld-CN'], dataset):
            # For Multi-Turn we don't have custom prompt
            return False
        else:
            return True

    def build_multi_choice_prompt(self, line, dataset=None):
        question = line['question']
        hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
        if hint is not None:
            question = hint + '\n' + question

        options = {
            cand: line[cand]
            for cand in string.ascii_uppercase
            if cand in line and not pd.isna(line[cand])
        }
        for key, item in options.items():
            question += f'\n{key}. {item}'
        prompt = question

        if len(options):
            prompt += (
                '\n请直接回答选项字母。' if cn_string(prompt) else
                "\nAnswer with the option's letter from the given choices directly."
            )
        else:
            prompt += (
                '\n请直接回答问题。' if cn_string(prompt) else
                '\nAnswer the question directly.'
            )

        return prompt

    def build_prompt(self, line, dataset=None):
        assert self.use_custom_prompt(dataset)
        assert dataset is None or isinstance(dataset, str)
        tgt_path = self.dump_image(line, dataset)

        if dataset is not None and listinstr(['MME'], dataset):
            question = line['question']
            prompt = question + ' Answer the question using a single word or phrase.'
        elif dataset is not None and listinstr(['HallusionBench'], dataset):
            question = line['question']
            prompt = (
                question + ' Please answer yes or no. Answer the question using a single word or phrase.'
            )
        elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ':
            prompt = self.build_multi_choice_prompt(line, dataset)
        elif dataset is not None and DATASET_TYPE(dataset) == 'VQA':
            if listinstr(['MathVista', 'MathVision', 'MathVerse'], dataset):
                prompt = line['question']
            elif listinstr(['LLaVABench'], dataset):
                question = line['question']
                prompt = question + '\nAnswer this question in detail.'
            elif listinstr(['MMVet'], dataset):
                prompt = line['question']
            else:
                question = line['question']
                prompt = question + '\nAnswer the question using a single word or phrase.'
        else:
            prompt = line['question']

        message = [dict(type='text', value=prompt)]
        message.extend([dict(type='image', value=s) for s in tgt_path])

        return message

    def set_max_num(self, dataset):
        if dataset is not None and listinstr(['ChartQA_TEST', 'MMMU_DEV_VAL'], dataset):
            self.max_num = 12
        elif dataset is not None and listinstr(['DocVQA_VAL', 'DocVQA_TEST'], dataset):
            self.max_num = 18
        elif dataset is not None and listinstr(
                ['InfoVQA_VAL', 'InfoVQA_TEST', 'OCRBench'], dataset):
            self.max_num = 24
        elif dataset is not None and listinstr(
                ['MMBench-Video', 'Video-MME', 'Video'], dataset):
            self.max_num = 1
        else:
            self.max_num = 6

    def generate_inner(self, message, dataset=None):
        self.set_max_num(dataset)
        image_num = len([x for x in message if x['type'] == 'image'])
        prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])

        if image_num > 1:
            image_path = [x['value'] for x in message if x['type'] == 'image']
            pixel_values_list = []
            max_num = max(1, self.max_num // image_num)
            for file_name in image_path:
                pixel_values_list.append(load_image(file_name, max_num=max_num).cuda().to(torch.bfloat16))
            pixel_values = torch.cat(pixel_values_list, dim=0)
        elif image_num == 1:
            image_path = [x['value'] for x in message if x['type'] == 'image'][0]
            pixel_values = (
                load_image(image_path, max_num=self.max_num).cuda().to(torch.bfloat16)
            )
        else:
            pixel_values = None

        with torch.no_grad():
            response = self.model.chat(
                self.tokenizer,
                pixel_values=pixel_values,
                question=prompt,
                generation_config=self.kwargs,
                # verbose=False,
            )

        return response


if __name__ == '__main__':
    model = MMAlaya2(max_new_tokens=1024, do_sample=False)
    response = model.generate_inner(
        [
            {'type': 'image', 'value': './assets/apple.jpg'},
            {'type': 'text', 'value': '请详细描述一下这张图片。'},
        ]
    )
    print(response)
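dynamic_preprocess above decides how many 448x448 tiles an image is cut into by picking the grid (i, j) with i*j <= max_num whose aspect ratio is closest to the input image. A standalone sketch that restates just that grid selection (omitting the area tie-break) for a hypothetical 1600x800 input; the numbers are illustrative, not from the commit:

# Standalone restatement of the grid selection in dynamic_preprocess above.
image_size, max_num = 448, 6
orig_width, orig_height = 1600, 800          # hypothetical input
aspect_ratio = orig_width / orig_height      # 2.0

target_ratios = sorted(
    {(i, j) for n in range(1, max_num + 1)
     for i in range(1, n + 1) for j in range(1, n + 1)
     if 1 <= i * j <= max_num},
    key=lambda x: x[0] * x[1],
)

best, best_diff = (1, 1), float('inf')
for ratio in target_ratios:
    diff = abs(aspect_ratio - ratio[0] / ratio[1])
    if diff < best_diff:
        best, best_diff = ratio, diff

blocks = best[0] * best[1]
print(best, blocks)   # (2, 1) -> 2 tiles, plus 1 thumbnail when use_thumbnail=True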
VLMEvalKit/vlmeval/vlm/molmo.py  (new file, 0 → 100644)

import torch
from PIL import Image
import os.path as osp
import sys
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE


class molmo(BaseModel):
    INSTALL_REQ = False
    INTERLEAVE = False

    def __init__(self, model_path='allenai/Molmo-7B-D-0924', **kwargs):
        try:
            from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
            import einops
        except Exception as e:
            logging.critical('Please install transformer and einops before using molmo.')
            raise e

        if '72b' not in model_path.lower():
            self.model = AutoModelForCausalLM.from_pretrained(
                model_path,
                trust_remote_code=True,
                torch_dtype=torch.bfloat16,
                device_map='cuda'
            )
        else:
            self.model = AutoModelForCausalLM.from_pretrained(
                model_path,
                trust_remote_code=True,
                torch_dtype=torch.bfloat16,
                device_map='auto'
            )
        self.processor = AutoProcessor.from_pretrained(
            model_path,
            trust_remote_code=True,
            torch_dtype=torch.bfloat16
        )
        self.kwargs = kwargs
        self.model_name = model_path

    def generate_inner(self, message, dataset=None):
        from transformers import GenerationConfig
        prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
        image = Image.open(image_path)
        if image.mode != "RGB":
            image = image.convert("RGB")
        # process the image and text
        inputs = self.processor.process(
            images=[image],
            text=prompt
        )

        # move inputs to the correct device and make a batch of size 1
        inputs = {k: v.to(self.model.device).unsqueeze(0) for k, v in inputs.items()}

        # generate output; maximum 200 new tokens; stop generation when <|endoftext|> is generated
        with torch.autocast(device_type="cuda", enabled=True, dtype=torch.bfloat16):
            output = self.model.generate_from_batch(
                inputs,
                GenerationConfig(max_new_tokens=200, stop_strings="<|endoftext|>"),
                tokenizer=self.processor.tokenizer
            )

        # only get generated tokens; decode them to text
        generated_tokens = output[0, inputs['input_ids'].size(1):]
        generated_text = self.processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)

        # print the generated text
        return generated_text
VLMEvalKit/vlmeval/vlm/monkey.py  (new file, 0 → 100644)

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import warnings
from .base import BaseModel
from ..dataset import DATASET_TYPE


class Monkey(BaseModel):

    INSTALL_REQ = False
    INTERLEAVE = False

    def __init__(self, model_path='echo840/Monkey', **kwargs):
        assert model_path is not None
        self.model_path = model_path
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            model_path, device_map='cpu', trust_remote_code=True).eval()
        self.model = model.cuda()
        self.kwargs = kwargs
        warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
        torch.cuda.empty_cache()

    def generate_vanilla(self, image_path, prompt):
        cur_prompt = f'<img>{image_path}</img> {prompt} Answer: '
        input_ids = self.tokenizer(cur_prompt, return_tensors='pt', padding='longest')
        attention_mask = input_ids.attention_mask
        input_ids = input_ids.input_ids

        output_ids = self.model.generate(
            input_ids=input_ids.cuda(),
            attention_mask=attention_mask.cuda(),
            do_sample=False,
            num_beams=1,
            max_new_tokens=512,
            min_new_tokens=1,
            length_penalty=1,
            num_return_sequences=1,
            output_hidden_states=True,
            use_cache=True,
            pad_token_id=self.tokenizer.eod_id,
            eos_token_id=self.tokenizer.eod_id,
        )
        response = self.tokenizer.decode(
            output_ids[0][input_ids.size(1):].cpu(),
            skip_special_tokens=True
        ).strip()
        return response

    def generate_multichoice(self, image_path, prompt):
        cur_prompt = f'<img>{image_path}</img>\n{prompt} Answer: '
        input_ids = self.tokenizer(cur_prompt, return_tensors='pt', padding='longest')
        attention_mask = input_ids.attention_mask
        input_ids = input_ids.input_ids

        output_ids = self.model.generate(
            input_ids=input_ids.cuda(),
            attention_mask=attention_mask.cuda(),
            do_sample=False,
            num_beams=1,
            max_new_tokens=10,
            min_new_tokens=1,
            length_penalty=1,
            num_return_sequences=1,
            output_hidden_states=True,
            use_cache=True,
            pad_token_id=self.tokenizer.eod_id,
            eos_token_id=self.tokenizer.eod_id,
        )
        response = self.tokenizer.decode(
            output_ids[0][input_ids.size(1):].cpu(),
            skip_special_tokens=True
        ).strip()
        return response

    def generate_inner(self, message, dataset=None):
        prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
        if dataset is None:
            return self.generate_vanilla(image_path, prompt)
        assert isinstance(dataset, str)
        if DATASET_TYPE(dataset) == 'MCQ' or DATASET_TYPE(dataset) == 'Y/N' or dataset == 'HallusionBench':
            return self.generate_multichoice(image_path, prompt)
        else:
            return self.generate_vanilla(image_path, prompt)


class MonkeyChat(BaseModel):

    INSTALL_REQ = False
    INTERLEAVE = False

    def __init__(self, model_path='echo840/Monkey-Chat', **kwargs):
        assert model_path is not None
        self.model_path = model_path
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            model_path, device_map='cpu', trust_remote_code=True).eval()
        self.model = model.cuda()
        self.kwargs = kwargs
        self.tokenizer.padding_side = 'left'
        self.tokenizer.pad_token_id = self.tokenizer.eod_id
        warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
        torch.cuda.empty_cache()

    def generate_vanilla(self, image_path, prompt):
        cur_prompt = f'<img>{image_path}</img> {prompt} Answer: '
        input_ids = self.tokenizer(cur_prompt, return_tensors='pt', padding='longest')
        attention_mask = input_ids.attention_mask
        input_ids = input_ids.input_ids

        output_ids = self.model.generate(
            input_ids=input_ids.cuda(),
            attention_mask=attention_mask.cuda(),
            do_sample=False,
            num_beams=1,
            max_new_tokens=512,
            min_new_tokens=1,
            length_penalty=1,
            num_return_sequences=1,
            output_hidden_states=True,
            use_cache=True,
            pad_token_id=self.tokenizer.eod_id,
            eos_token_id=self.tokenizer.eod_id,
        )
        response = self.tokenizer.decode(
            output_ids[0][input_ids.size(1):].cpu(),
            skip_special_tokens=True
        ).strip()
        return response

    def generate_multichoice(self, image_path, prompt):
        cur_prompt = f'<img>{image_path}</img>\n{prompt} Answer: '
        input_ids = self.tokenizer(cur_prompt, return_tensors='pt', padding='longest')
        attention_mask = input_ids.attention_mask
        input_ids = input_ids.input_ids

        output_ids = self.model.generate(
            input_ids=input_ids.cuda(),
            attention_mask=attention_mask.cuda(),
            do_sample=False,
            num_beams=1,
            max_new_tokens=10,
            min_new_tokens=1,
            length_penalty=1,
            num_return_sequences=1,
            output_hidden_states=True,
            use_cache=True,
            pad_token_id=self.tokenizer.eod_id,
            eos_token_id=self.tokenizer.eod_id,
        )
        response = self.tokenizer.decode(
            output_ids[0][input_ids.size(1):].cpu(),
            skip_special_tokens=True
        ).strip()
        return response

    def generate_inner(self, message, dataset=None):
        prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
        if dataset is None:
            return self.generate_vanilla(image_path, prompt)
        assert isinstance(dataset, str)
        if DATASET_TYPE(dataset) == 'MCQ' or DATASET_TYPE(dataset) == 'Y/N' or dataset == 'HallusionBench':
            return self.generate_multichoice(image_path, prompt)
        else:
            return self.generate_vanilla(image_path, prompt)
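Both Monkey classes embed the image by its file path directly in the text prompt using Qwen-VL-style <img> tags rather than passing pixel tensors. A short sketch of the string generate_vanilla builds, with a hypothetical path and question:

image_path = '/data/samples/cat.jpg'          # hypothetical
prompt = 'What animal is in the picture?'     # hypothetical

cur_prompt = f'<img>{image_path}</img> {prompt} Answer: '
print(cur_prompt)
# <img>/data/samples/cat.jpg</img> What animal is in the picture? Answer: 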
VLMEvalKit/vlmeval/vlm/moondream.py  (new file, 0 → 100644)

import torch
import re
from PIL import Image
from abc import abstractproperty
import sys
import os.path as osp
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE
import copy


class Moondream1(BaseModel):
    INSTALL_REQ = False
    INTERLEAVE = False

    def __init__(self, model_path='vikhyatk/moondream1', **kwargs):
        try:
            from transformers import AutoModelForCausalLM, CodeGenTokenizerFast as Tokenizer
        except Exception as e:
            logging.critical(
                "Please install Transformers version 4.36.2 by running: 'pip install transformers==4.36.2', "
                "please install torchvision>=0.16."
            )
            raise e

        assert osp.exists(model_path) or splitlen(model_path) == 2
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path,
            trust_remote_code=True,
            torch_dtype=torch.float16,
            device_map='cuda'
        )
        self.tokenizer = Tokenizer.from_pretrained(model_path)

        default_kwargs = dict(max_new_tokens=512)
        default_kwargs.update(kwargs)
        self.kwargs = default_kwargs

        warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
        torch.cuda.empty_cache()

    def generate_inner(self, message, dataset=None):
        prompt, img = self.message_to_promptimg(message)
        enc_image = self.model.encode_image(Image.open(img))
        prompt_wtmpl = f'<image>\n\nQuestion: {prompt}\n\nAnswer:'
        answer = self.model.generate(
            enc_image, prompt_wtmpl, eos_text='<END>', tokenizer=self.tokenizer, **self.kwargs
        )[0]
        cleaned_answer = re.sub('<$', '', re.sub('END$', '', answer)).strip()
        return cleaned_answer

    def use_custom_prompt(self, dataset):
        assert dataset is not None
        if listinstr(['MMMU'], dataset):
            return False
        if DATASET_TYPE(dataset) == 'MCQ' or dataset == 'MMVet':
            return True
        return False

    def build_prompt(self, line, dataset=None):
        assert dataset is None or isinstance(dataset, str)
        assert self.use_custom_prompt(dataset)
        tgt_path = self.dump_image(line, dataset)

        question = line['question']
        if dataset == 'MMVet':
            prompt = question + '\nAnswer the question directly. '
        elif DATASET_TYPE(dataset) == 'MCQ':
            options = {
                cand: line[cand]
                for cand in string.ascii_uppercase
                if cand in line and not pd.isna(line[cand])
            }
            options_prompt = ''
            for key, item in options.items():
                options_prompt += f'{key}. {item}\n'

            hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
            prompt = f'Hint: {hint}\n' if hint is not None else ''
            prompt += f'{question}\n'
            prompt += (
                f'{options_prompt}\nAnswer with the option’s letter from the given choices directly. '
                if len(options) else 'Answer the question directly. '
            )
        else:
            raise NotImplementedError

        message = [dict(type='text', value=prompt)]
        message.extend([dict(type='image', value=s) for s in tgt_path])
        return message


class Moondream2(BaseModel):
    INSTALL_REQ = False
    INTERLEAVE = False

    def __init__(self, model_path="vikhyatk/moondream2", revision="2024-08-26", **kwargs):
        try:
            from transformers import AutoModelForCausalLM, AutoTokenizer
        except Exception as e:
            logging.critical(
                '''Please install Transformers version 4.44 by running: "pip install transformers==4.44.0",
                please install torchvision>=0.16.'''
            )
            raise e

        assert osp.exists(model_path) or splitlen(model_path) == 2
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path,
            trust_remote_code=True,
            torch_dtype=torch.float16,
            device_map='cuda',
            revision=revision
        )
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)

        default_kwargs = dict(max_new_tokens=512)
        default_kwargs.update(kwargs)
        self.kwargs = default_kwargs

        warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
        torch.cuda.empty_cache()

    def generate_inner(self, message, dataset=None):
        prompt, img = self.message_to_promptimg(message)
        enc_image = self.model.encode_image(Image.open(img))
        prompt_wtmpl = f'<image>\n\nQuestion: {prompt}\n\nAnswer:'
        answer = self.model.generate(enc_image, prompt_wtmpl, tokenizer=self.tokenizer, **self.kwargs)[0]
        cleaned_answer = answer.strip()
        return cleaned_answer

    def use_custom_prompt(self, dataset):
        assert dataset is not None
        if listinstr(['MMMU'], dataset):
            return False
        if DATASET_TYPE(dataset) == 'MCQ' or dataset == 'MMVet':
            return True
        return False

    def build_prompt(self, line, dataset=None):
        assert dataset is None or isinstance(dataset, str)
        assert self.use_custom_prompt(dataset)
        tgt_path = self.dump_image(line, dataset)

        question = line['question']
        if dataset == 'MMVet':
            prompt = question + '\nAnswer the question directly. '
        elif DATASET_TYPE(dataset) == 'MCQ':
            options = {
                cand: line[cand]
                for cand in string.ascii_uppercase
                if cand in line and not pd.isna(line[cand])
            }
            options_prompt = ''
            for key, item in options.items():
                options_prompt += f'{key}. {item}\n'

            hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
            prompt = f'Hint: {hint}\n' if hint is not None else ''
            prompt += f'{question}\n'
            prompt += (
                f'{options_prompt}\nAnswer with the option’s letter from the given choices directly. '
                if len(options) else 'Answer the question directly. '
            )
        else:
            raise NotImplementedError

        message = [dict(type='text', value=prompt)]
        message.extend([dict(type='image', value=s) for s in tgt_path])
        return message
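The build_prompt methods above (and the near-identical ones in mplug_owl2.py and mplug_owl3.py below) assemble multiple-choice questions into a fixed Hint / question / options template. A small sketch of the resulting text for a hypothetical MCQ row; the row contents are illustrative, real lines come from the VLMEvalKit dataset TSVs:

import string
import pandas as pd

# Hypothetical dataset row.
line = {'question': 'What color is the sky?', 'hint': 'Look at the top of the image.',
        'A': 'Blue', 'B': 'Green'}

options = {c: line[c] for c in string.ascii_uppercase if c in line and not pd.isna(line[c])}
options_prompt = ''.join(f'{k}. {v}\n' for k, v in options.items())

hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
prompt = f'Hint: {hint}\n' if hint is not None else ''
prompt += f"{line['question']}\n"
prompt += (f'{options_prompt}\nAnswer with the option’s letter from the given choices directly. '
           if len(options) else 'Answer the question directly. ')
print(prompt)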
VLMEvalKit/vlmeval/vlm/mplug_owl2.py  (new file, 0 → 100644)

import sys
import torch
from PIL import Image
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE


class mPLUG_Owl2(BaseModel):

    INSTALL_REQ = True
    INTERLEAVE = False

    def __init__(self, model_path='MAGAer13/mplug-owl2-llama2-7b', **kwargs):
        try:
            from mplug_owl2.model.builder import load_pretrained_model
            from mplug_owl2.mm_utils import get_model_name_from_path
        except Exception as e:
            logging.critical('Please install mPLUG_Owl2 before using mPLUG_Owl2. ')
            raise e

        model_name = get_model_name_from_path(model_path)
        tokenizer, model, image_processor, context_len = load_pretrained_model(
            model_path, None, model_name, load_8bit=False, load_4bit=False, device='cpu')

        self.model = model.cuda()
        self.device = self.model.device
        self.image_processor = image_processor
        tokenizer.padding_side = 'left'
        tokenizer.pad_token_id = tokenizer.eos_token_id
        self.tokenizer = tokenizer
        self.context_len = context_len

        kwargs_default = dict(
            max_new_tokens=512, do_sample=False, num_beams=1,
            min_new_tokens=1, length_penalty=1, num_return_sequences=1)
        kwargs_default.update(kwargs)
        self.kwargs = kwargs_default
        warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')

    def use_custom_prompt(self, dataset):
        assert dataset is not None
        if listinstr(['MMMU'], dataset):
            return False
        if DATASET_TYPE(dataset) == 'MCQ' or dataset == 'MMVet':
            return True
        return False

    def build_prompt(self, line, dataset=None):
        assert dataset is None or isinstance(dataset, str)
        assert self.use_custom_prompt(dataset)
        tgt_path = self.dump_image(line, dataset)

        question = line['question']
        if dataset == 'MMVet':
            prompt = question + '\nAnswer the question directly. '
        elif DATASET_TYPE(dataset) == 'MCQ':
            options = {
                cand: line[cand]
                for cand in string.ascii_uppercase
                if cand in line and not pd.isna(line[cand])
            }
            options_prompt = ''
            for key, item in options.items():
                options_prompt += f'{key}. {item}\n'

            hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
            prompt = f'Hint: {hint}\n' if hint is not None else ''
            prompt += f'{question}\n'
            prompt += (
                f'{options_prompt}\nAnswer with the option’s letter from the given choices directly. '
                if len(options) else 'Answer the question directly. '
            )
        else:
            raise NotImplementedError

        message = [dict(type='text', value=prompt)]
        message.extend([dict(type='image', value=s) for s in tgt_path])
        return message

    def generate_inner(self, message, dataset=None):
        from mplug_owl2.constants import IMAGE_TOKEN_INDEX
        from mplug_owl2.mm_utils import process_images, tokenizer_image_token

        kwargs = cp.deepcopy(self.kwargs)
        if dataset in ['MMVet', 'LLaVABench']:
            kwargs['length_penalty'] = 0
        elif dataset is not None and DATASET_TYPE(dataset) == 'VQA':
            kwargs['length_penalty'] = 0
        elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ':
            kwargs['max_new_tokens'] = 10
        num_images = len([x for x in message if x['type'] == 'image'])
        assert num_images >= 0
        prompt_full = 'USER: '
        images = []
        if num_images == 1:
            prompt, image = self.message_to_promptimg(message, dataset=dataset)
            prompt_full += f'<|image|>{prompt}\nASSISTANT: '
            images.append(image)
        else:
            for msg in message:
                if msg['type'] == 'image':
                    images.append(msg['value'])
                    prompt_full += '<|image|>'
                elif msg['type'] == 'text':
                    prompt_full += msg['value']
            prompt_full += '\nASSISTANT: '

        def preproc_image(fname):
            image = Image.open(fname).convert('RGB')
            max_edge = max(image.size)
            image = image.resize((max_edge, max_edge))
            return image

        images = [preproc_image(fname) for fname in images]
        image_tensor = process_images(images, self.image_processor)
        image_tensor = image_tensor.to(self.device, dtype=torch.float16)

        input_ids = tokenizer_image_token(
            prompt_full, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt'
        ).unsqueeze(0).to(self.device)

        with torch.inference_mode():
            output_ids = self.model.generate(
                input_ids=input_ids,
                images=image_tensor,
                output_hidden_states=True,
                use_cache=True,
                **kwargs)
        answer = self.tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
        return answer.split('</s>')[0]
VLMEvalKit/vlmeval/vlm/mplug_owl3.py  (new file, 0 → 100644)

import torch
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE
from torchvision import transforms
from transformers import AutoTokenizer, AutoModel
import io
import random
import numpy as np
import math


def get_frame_indices(num_frames, vlen, sample='rand', fix_start=None, input_fps=1, max_num_frames=-1):
    if sample in ['rand', 'middle']:
        acc_samples = min(num_frames, vlen)
        # split the video into `acc_samples` intervals, and sample from each interval.
        intervals = np.linspace(start=0, stop=vlen, num=acc_samples + 1).astype(int)
        ranges = []
        for idx, interv in enumerate(intervals[:-1]):
            ranges.append((interv, intervals[idx + 1] - 1))
        if sample == 'rand':
            try:
                frame_indices = [random.choice(range(x[0], x[1])) for x in ranges]
            except:
                frame_indices = np.random.permutation(vlen)[:acc_samples]
                frame_indices.sort()
                frame_indices = list(frame_indices)
        elif fix_start is not None:
            frame_indices = [x[0] + fix_start for x in ranges]
        elif sample == 'middle':
            frame_indices = [(x[0] + x[1]) // 2 for x in ranges]
        else:
            raise NotImplementedError

        if len(frame_indices) < num_frames:
            # padded with last frame
            padded_frame_indices = [frame_indices[-1]] * num_frames
            padded_frame_indices[:len(frame_indices)] = frame_indices
            frame_indices = padded_frame_indices
    elif 'fps' in sample:
        # fps0.5, sequentially sample frames at 0.5 fps
        output_fps = float(sample[3:])
        duration = float(vlen) / input_fps
        delta = 1 / output_fps
        # gap between frames, this is also the clip length each frame represents
        frame_seconds = np.arange(0 + delta / 2, duration + delta / 2, delta)
        frame_indices = np.around(frame_seconds * input_fps).astype(int)
        frame_indices = [e for e in frame_indices if e < vlen]
        if max_num_frames > 0 and len(frame_indices) > max_num_frames:
            frame_indices = frame_indices[:max_num_frames]
            # frame_indices = np.linspace(0 + delta / 2, duration + delta / 2, endpoint=False, num=max_num_frames)
    elif 'interval' in sample:
        if num_frames == 1:
            frame_indices = [random.randint(0, vlen - 1)]
        else:
            # transform FPS
            interval = 8
            clip_length = num_frames * interval * input_fps / 30
            max_idx = max(vlen - clip_length, 0)
            start_idx = random.uniform(0, max_idx)
            end_idx = start_idx + clip_length - 1
            frame_indices = torch.linspace(start_idx, end_idx, num_frames)
            frame_indices = torch.clamp(frame_indices, 0, vlen - 1).long().tolist()
    else:
        raise ValueError
    return frame_indices


def get_frame_indices_start_end(num_frames, vlen, fps, start_time, end_time):
    start_idx = max(int(fps * start_time), 0) if start_time is not None and not math.isnan(start_time) else 0
    end_idx = min(int(fps * end_time), vlen) if end_time is not None and not math.isnan(end_time) else vlen
    clip_len = end_idx - start_idx
    acc_samples = min(num_frames, clip_len)
    # split the video into `acc_samples` intervals, and sample from each interval.
    intervals = np.linspace(start=start_idx, stop=end_idx, num=acc_samples + 1).astype(int)
    ranges = []
    for idx, interv in enumerate(intervals[:-1]):
        ranges.append((interv, intervals[idx + 1] - 1))
    try:
        frame_indices = [random.choice(range(x[0], x[1])) for x in ranges]
    except:
        frame_indices = np.random.permutation(list(range(start_idx, end_idx)))[:acc_samples]
        frame_indices.sort()
        frame_indices = list(frame_indices)
    if len(frame_indices) < num_frames:
        # padded with last frame
        padded_frame_indices = [frame_indices[-1]] * num_frames
        padded_frame_indices[:len(frame_indices)] = frame_indices
        frame_indices = padded_frame_indices
    return frame_indices


def read_frames_decord(
    video_path, width=None, height=None, num_frames=8, sample='rand',
    fix_start=None, max_num_frames=-1, start_time=None, end_time=None
):
    import decord
    decord.bridge.set_bridge('torch')
    if video_path.lower().endswith('.webm'):
        # a workaround for webm, large/auto num_threads will cause error.
        num_threads = 2
    else:
        num_threads = 0
    if width is not None and height is not None:
        video_reader = decord.VideoReader(video_path, width=width, height=height, num_threads=num_threads)
    else:
        video_reader = decord.VideoReader(video_path, num_threads=num_threads)
    vlen = len(video_reader)
    fps = video_reader.get_avg_fps()
    if start_time and end_time:
        frame_indices = get_frame_indices_start_end(num_frames, vlen, fps, start_time, end_time)
    else:
        frame_indices = get_frame_indices(
            num_frames, vlen, sample=sample, fix_start=fix_start,
            input_fps=fps, max_num_frames=max_num_frames
        )
    frames = video_reader.get_batch(frame_indices)
    if isinstance(frames, torch.Tensor):
        frames = frames.numpy()  # (T, H, W, C), torch.uint8
    else:
        print(frames.shape)
        frames = frames.asnumpy()
    timestamp = {
        'num_frames': len(frame_indices),
        'timestamp': ', '.join([str(round(f / fps, 1)) for f in frame_indices])
    }
    return frames, timestamp


class mPLUG_Owl3(BaseModel):
    # No separate model module is required, but the dependencies must be met.
    # https://github.com/X-PLUG/mPLUG-Owl/blob/main/mPLUG-Owl3/requirements.txt
    INSTALL_REQ = True
    INTERLEAVE = True
    INSTALL_REQ_TXT = 'https://github.com/X-PLUG/mPLUG-Owl/blob/main/mPLUG-Owl3/requirements.txt'

    def __init__(self, model_path=None, **kwargs):
        assert model_path is not None
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModel.from_pretrained(
            model_path, attn_implementation='sdpa', torch_dtype=torch.half, trust_remote_code=True)
        self.model.eval().cuda()
        self.processor = self.model.init_processor(self.tokenizer)
        self.logger = get_logger('mPLUG_Owl3')
        if self.INSTALL_REQ:
            self.logger.info(
                f'Please remember to meet the requirements first\n'
                f'Here: {self.INSTALL_REQ_TXT}'
            )

    def use_custom_prompt(self, dataset):
        assert dataset is not None
        if listinstr(['MMMU'], dataset):
            return False
        if listinstr(['MVBench', 'MMVet'], dataset):
            return True
        return False

    def save_video_into_images(self, line, num_frames=16, dataset_class=None):
        video_url = {
            'video': osp.join(line['prefix'], line['video']),
            'num_frames': num_frames,
            'bound': line.get('bound', None)
        }
        if osp.isdir(video_url['video']):
            frame_paths = []
            max_frame = len(os.listdir(video_url['video']))
            fps = 3
            if video_url['bound']:
                start, end = line['start'], line['end']
            else:
                start, end = -100000, 100000
            start_idx = max(1, round(start * fps))
            end_idx = min(round(end * fps), max_frame)
            seg_size = float(end_idx - start_idx) / num_frames
            frame_indices = np.array([
                int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
                for idx in range(num_frames)
            ])
            for frame_index in frame_indices:
                img = os.path.join(video_url['video'], f'{frame_index:05d}.jpg')
                frame_paths.append(img)
            return frame_paths

        if isinstance(video_url, dict):
            if video_url['bound']:
                start_time = line['start']
                end_time = line['end']
            else:
                start_time = None
                end_time = None
            num_frames = video_url.get('num_frames', num_frames)
            video_url = video_url['video']
        else:
            start_time = None
            end_time = None
            video_url = str(video_url)
        if not osp.exists(video_url):
            # for MVBench_MP4
            video_url = osp.join(dataset_class.data_root, video_url)
        video, timestamp = read_frames_decord(
            video_url, num_frames=num_frames, sample='middle', start_time=start_time, end_time=end_time)
        to_pil = transforms.ToPILImage()
        frames = [to_pil(video[ti]) for ti in range(video.shape[0])]
        lmu_root = LMUDataRoot()
        frame_root = osp.join(lmu_root, 'images', dataset_class.dataset_name, 'mplug_owl3')
        frame_root = osp.join(frame_root, video_url.split('/')[-1].split('.')[0])
        os.makedirs(frame_root, exist_ok=True)
        frame_tmpl = 'frame-{}-of-{}.jpg'
        frame_paths = [
            osp.join(frame_root, frame_tmpl.format(i, num_frames)) for i in range(1, num_frames + 1)]
        for im, pth in zip(frames, frame_paths):
            if not osp.exists(pth):
                im.save(pth)
        return frame_paths

    # Currently same to mPLUG_Owl2
    def build_prompt(self, line, dataset=None, num_frames=16, video_llm=False):
        if not isinstance(dataset, str):
            dataset_class = dataset
            dataset = dataset_class.dataset_name
        assert dataset is None or isinstance(dataset, str)
        assert self.use_custom_prompt(dataset)
        if dataset_class.MODALITY == 'VIDEO':
            if listinstr(['MVBench'], dataset):
                tgt_path = self.save_video_into_images(line, num_frames, dataset_class)
            else:
                tgt_path = dataset_class.save_video_into_images(line, num_frames)
            if type(line['candidates']) is not list:
                line['candidates'] = eval(line['candidates'])
            for idx, c in enumerate(line['candidates']):
                line[chr(ord('A') + idx)] = c
        else:
            tgt_path = self.dump_image(line, dataset)

        question = line['question']
        if dataset == 'MMVet':
            prompt = question + '\nAnswer the question directly. '
        elif listinstr(['MCQ', 'Video-MCQ'], DATASET_TYPE(dataset)):
            options = {
                cand: line[cand]
                for cand in string.ascii_uppercase
                if cand in line and not pd.isna(line[cand])
            }
            options_prompt = ''
            for key, item in options.items():
                options_prompt += f'{key}. {item}\n'

            hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
            prompt = f'Hint: {hint}\n' if hint is not None else ''
            prompt += f'{question}\n'
            prompt += (
                f'{options_prompt}\nAnswer with the option’s letter from the given choices directly. '
                if len(options) else 'Answer the question directly. '
            )
        else:
            raise NotImplementedError

        message = [dict(type='text', value=prompt)]
        message.extend([dict(type='image', value=s) for s in tgt_path])
        return message

    def preproc_image(self, fname, dataset=None):
        from PIL import Image
        image = Image.open(fname).convert('RGB')
        # resize to max_size
        max_size = 448 * 16
        if max(image.size) > max_size and not listinstr(['MVBench'], dataset):
            w, h = image.size
            if w > h:
                new_w = max_size
                new_h = int(h * max_size / w)
            else:
                new_h = max_size
                new_w = int(w * max_size / h)
            image = image.resize((new_w, new_h), resample=Image.BICUBIC)
        return image

    def generate_inner(self, message, dataset=None):
        num_images = len([x for x in message if x['type'] == 'image'])
        assert num_images >= 0
        images = []
        prompt_full = ''
        for msg in message:
            if msg['type'] == 'image':
                images.append(msg['value'])
                prompt_full += '<|image|>'
            elif msg['type'] == 'text':
                prompt_full += msg['value']

        needed_messages = [
            {'role': 'user', 'content': prompt_full},
            {'role': 'assistant', 'content': ''}
        ]

        images = [self.preproc_image(fname, dataset) for fname in images]

        inputs = self.processor(needed_messages, images=images, videos=None, cut_enable=False)

        inputs.to('cuda')
        if listinstr(['MVBench'], dataset):
            inputs.update({
                'tokenizer': self.tokenizer,
                'max_new_tokens': 100,
                'decode_text': True,
                'do_sample': True,
                'top_k': 1,
            })
        else:
            inputs.update({
                'tokenizer': self.tokenizer,
                'max_new_tokens': 1024,
                'decode_text': True,
            })

        g = self.model.generate(**inputs)
        return g[0]
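get_frame_indices above turns a frame count and video length into concrete frame indices; with sample='middle' it takes the midpoint of each of num_frames equal intervals. A standalone sketch restating just that branch for a hypothetical 90-frame clip sampled at 8 frames (illustrative numbers, not from the commit):

import numpy as np

num_frames, vlen = 8, 90                     # hypothetical clip
acc_samples = min(num_frames, vlen)

intervals = np.linspace(start=0, stop=vlen, num=acc_samples + 1).astype(int)
ranges = [(intervals[i], intervals[i + 1] - 1) for i in range(len(intervals) - 1)]
frame_indices = [(int(lo) + int(hi)) // 2 for lo, hi in ranges]
print(frame_indices)   # [5, 16, 27, 38, 50, 61, 72, 83]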
VLMEvalKit/vlmeval/vlm/nvlm.py  (new file, 0 → 100644)

import torch
from transformers import AutoTokenizer, AutoModel
import math
from PIL import Image
import torchvision.transforms as T
from torchvision.transforms.functional import InterpolationMode
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)


def split_model():
    device_map = {}
    num_gpus = torch.cuda.device_count()
    rank, world_size = get_rank_and_world_size()
    num_gpus = num_gpus // world_size

    num_layers = 80
    # Since the first GPU will be used for ViT, treat it as half a GPU.
    num_layers_per_gpu = math.ceil(num_layers / (num_gpus - 0.5))
    num_layers_per_gpu = [num_layers_per_gpu] * num_gpus
    num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
    layer_cnt = 0
    for i, num_layer in enumerate(num_layers_per_gpu):
        for j in range(num_layer):
            device_map[f'language_model.model.layers.{layer_cnt}'] = rank + i * world_size
            layer_cnt += 1
    device_map['vision_model'] = rank
    device_map['mlp1'] = rank
    device_map['language_model.model.embed_tokens'] = rank
    device_map['language_model.model.norm'] = rank
    device_map['language_model.model.rotary_emb'] = rank
    device_map['language_model.lm_head'] = rank
    device_map[f'language_model.model.layers.{num_layers - 1}'] = rank
    return device_map


def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform


def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    best_ratio_diff = float('inf')
    best_ratio = (1, 1)
    area = width * height
    for ratio in target_ratios:
        target_aspect_ratio = ratio[0] / ratio[1]
        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
        if ratio_diff < best_ratio_diff:
            best_ratio_diff = ratio_diff
            best_ratio = ratio
        elif ratio_diff == best_ratio_diff:
            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                best_ratio = ratio
    return best_ratio


def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    # calculate the existing image aspect ratio
    target_ratios = set(
        (i, j)
        for n in range(min_num, max_num + 1)
        for i in range(1, n + 1)
        for j in range(1, n + 1)
        if i * j <= max_num and i * j >= min_num
    )
    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

    # find the closest aspect ratio to the target
    target_aspect_ratio = find_closest_aspect_ratio(
        aspect_ratio, target_ratios, orig_width, orig_height, image_size
    )

    # calculate the target width and height
    target_width = image_size * target_aspect_ratio[0]
    target_height = image_size * target_aspect_ratio[1]
    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

    # resize the image
    resized_img = image.resize((target_width, target_height))
    processed_images = []
    for i in range(blocks):
        box = (
            (i % (target_width // image_size)) * image_size,
            (i // (target_width // image_size)) * image_size,
            ((i % (target_width // image_size)) + 1) * image_size,
            ((i // (target_width // image_size)) + 1) * image_size
        )
        # split the image
        split_img = resized_img.crop(box)
        processed_images.append(split_img)
    assert len(processed_images) == blocks
    if use_thumbnail and len(processed_images) != 1:
        thumbnail_img = image.resize((image_size, image_size))
        processed_images.append(thumbnail_img)
    return processed_images


def load_image(image_file, input_size=448, max_num=12):
    image = Image.open(image_file).convert('RGB')
    transform = build_transform(input_size=input_size)
    images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    pixel_values = [transform(image) for image in images]
    pixel_values = torch.stack(pixel_values)
    return pixel_values


class NVLM(BaseModel):
    INSTALL_REQ = False
    INTERLEAVE = False

    def __init__(self, model_path='nvidia/NVLM-D-72B', **kwargs):
        assert model_path is not None
        self.model_path = model_path
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=False)
        kwargs_default = dict(max_new_tokens=1024, do_sample=False)
        kwargs_default.update(kwargs)
        self.kwargs = kwargs_default
        self.model = AutoModel.from_pretrained(
            model_path,
            torch_dtype=torch.bfloat16,
            low_cpu_mem_usage=True,
            use_flash_attn=False,
            trust_remote_code=True,
            device_map=split_model()).eval()
        logging.info(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
        torch.cuda.empty_cache()

    def generate_inner(self, message, dataset=None):
        prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
        pixel_values = load_image(image_path, max_num=6).to(torch.bfloat16).cuda()
        response = self.model.chat(self.tokenizer, pixel_values, prompt, self.kwargs)
        return response.strip()
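split_model above spreads the 80 transformer layers of NVLM-D-72B across the visible GPUs and gives the first GPU roughly half a share because it also hosts the vision tower. A standalone sketch of the per-GPU layer budget for a hypothetical single-process machine with 8 visible GPUs (the GPU count is an assumption for illustration):

import math

num_gpus, num_layers = 8, 80                          # hypothetical: world_size=1, 8 GPUs
per_gpu = math.ceil(num_layers / (num_gpus - 0.5))    # 11
budget = [per_gpu] * num_gpus
budget[0] = math.ceil(budget[0] * 0.5)                # first GPU: 6
print(budget)   # [6, 11, 11, 11, 11, 11, 11, 11]
# The assignment loop stops after layer 79, so the last GPU actually receives 8 layers.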
VLMEvalKit/vlmeval/vlm/omchat.py
0 → 100644
View file @
bc5ebf0f
import
torch
from
PIL
import
Image
import
re
from
transformers
import
AutoModel
,
AutoProcessor
from
.base
import
BaseModel
from
..smp
import
*
from
..dataset
import
DATASET_TYPE
class
OmChat
(
BaseModel
):
INSTALL_REQ
=
True
INTERLEAVE
=
True
def
__init__
(
self
,
model_path
=
'omlab/omchat-v2.0-13B-single-beta_hf'
,
**
kwargs
):
# Recommend to install `transformers==4.44.0`
assert
model_path
is
not
None
self
.
model_path
= model_path
        print(f'load from {self.model_path}')
        model = AutoModel.from_pretrained(self.model_path, trust_remote_code=True, torch_dtype=torch.float16)
        self.model = model.cuda().eval()
        self.kwargs = kwargs
        self.processor = AutoProcessor.from_pretrained(self.model_path, trust_remote_code=True)
        torch.cuda.empty_cache()
        # system prompt
        self.default_system_prompt = 'You are a helpful assistant. Focus on accuracy and reliability in your response.'
        self.new1_system_prompt = 'You are a helpful assistant.'
        self.new2_system_prompt = (
            'Read the following question carefully, '
            'solve it step by step, '
            'and then output the final answer in the format of '
            "'Answer: single number or single word or phrase'.\n\n"
        )
        # suffix_prompt for MCQ
        self.mcq_suffix_prompt_en = 'Please select the correct answer from the options above.\n'
        self.mcq_suffix_prompt_cn = '请直接回答选项字母。\n'
        # suffix_prompt for Y/N
        self.yorn_suffix_prompt = ' Please answer yes or no. Answer the question using a single word or phrase.'

    def use_custom_prompt(self, dataset):
        assert dataset is not None
        if DATASET_TYPE(dataset) == 'MCQ' or DATASET_TYPE(dataset) == 'Y/N':
            return True
        return False

    def build_prompt(self, line, dataset=None):
        assert dataset is None or isinstance(dataset, str)
        assert self.use_custom_prompt(dataset)
        tgt_path = self.dump_image(line, dataset)

        if isinstance(line, int):
            line = self.data.iloc[line]

        question = line['question']
        if DATASET_TYPE(dataset) == 'MCQ':
            hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
            options = {
                cand: line[cand]
                for cand in string.ascii_uppercase
                if cand in line and not pd.isna(line[cand])
            }
            options_prompt = 'Options:\n'
            for key, item in options.items():
                options_prompt += f'{key}. {item}\n'
            prompt = ''
            if hint is not None:
                prompt += f'Hint: {hint}\n'
            prompt += f'Question: {question}\n'
            if len(options):
                prompt += options_prompt
                if not dataset.startswith('MMMU_'):
                    if not cn_string(prompt):
                        prompt += self.mcq_suffix_prompt_en
                    else:
                        prompt += self.mcq_suffix_prompt_cn
        elif DATASET_TYPE(dataset) == 'Y/N':
            prompt = question + self.yorn_suffix_prompt
        print(DATASET_TYPE(dataset))

        message = []
        if isinstance(tgt_path, list):
            message.extend([dict(type='image', value=p) for p in tgt_path])
        else:
            message = [dict(type='image', value=tgt_path)]
        message.append(dict(type='text', value=prompt))
        return message

    def message_to_promptimg(self, message, dataset=None):
        if dataset is None or listinstr(['MMMU'], dataset):
            prompt = '\n'.join(
                [re.sub(r'<image\s*\d+>', '<image>', x['value']) for x in message if x['type'] == 'text']
            )
            image = [x['value'] for x in message if x['type'] == 'image']
        else:
            prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
            image = [x['value'] for x in message if x['type'] == 'image']
        return prompt, image

    def generate_inner(self, message, dataset=None):

        def replace_last_dot(input_string):
            if input_string.endswith('.'):
                return input_string[:-1]
            else:
                return input_string

        prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
        image = [Image.open(img_path).convert('RGB') for img_path in image_path]
        default_kwargs = dict(max_new_tokens=1024, do_sample=False, temperature=0.0, top_p=1)

        if dataset is not None and listinstr(['MathVista_MINI'], dataset):
            system_prompt = self.new2_system_prompt
        elif dataset is not None and listinstr(['MMMU_DEV_VAL', 'MMStar'], dataset):
            system_prompt = self.new1_system_prompt
        else:
            system_prompt = self.default_system_prompt

        inputs = self.processor(
            text=prompt, system_prompt=system_prompt, images=image, return_tensors='pt'
        ).to('cuda')
        default_kwargs.update(self.kwargs)
        with torch.inference_mode():
            output_ids = self.model.generate(
                **inputs,
                eos_token_id=self.model.generation_config.eos_token_id,
                **default_kwargs
            )
        res = self.processor.tokenizer.decode(output_ids[0, inputs.input_ids.shape[1]:]).strip()
        if '<|im_end|>' in res:
            res = res.split('<|im_end|>')[0].strip()
        if dataset != 'MMMU_DEV_VAL':
            if res.startswith('Answer: '):
                res = res[len('Answer: '):]
            match = re.search(r'\nThe answer is:(.+)', res)
            if match:
                res = match.group(1).strip()
        # for OCRBench
        doc_match = re.search(r'<doc>(.*?)<\/doc>', res)
        if doc_match:
            res = doc_match.group(1).strip()
        res = replace_last_dot(res)
        return res
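The MMMU branch of message_to_promptimg above collapses numbered placeholders such as `<image 1>` into the generic `<image>` token before joining the text chunks. A minimal standalone sketch of that normalization, not part of the file above (the sample message list is hypothetical):

import re

msgs = [
    dict(type='text', value='Compare <image 1> with <image 2>.'),
    dict(type='image', value='img1.jpg'),
    dict(type='image', value='img2.jpg'),
]
# keep only text chunks, rewrite '<image N>' -> '<image>', join with newlines
prompt = '\n'.join(
    re.sub(r'<image\s*\d+>', '<image>', x['value']) for x in msgs if x['type'] == 'text'
)
print(prompt)  # Compare <image> with <image>.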
VLMEvalKit/vlmeval/vlm/omnilmm.py
0 → 100644
import torch
from PIL import Image
from transformers import AutoTokenizer

from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE

DEFAULT_IMAGE_TOKEN = '<image>'
DEFAULT_IMAGE_PATCH_TOKEN = '<im_patch>'
DEFAULT_IM_START_TOKEN = '<im_start>'
DEFAULT_IM_END_TOKEN = '<im_end>'


def init_omni_lmm(model_path):
    from omnilmm.model.omnilmm import OmniLMMForCausalLM
    from omnilmm.utils import disable_torch_init
    from omnilmm.model.utils import build_transform

    torch.backends.cuda.matmul.allow_tf32 = True
    disable_torch_init()
    tokenizer = AutoTokenizer.from_pretrained(model_path, model_max_length=2048)
    model = OmniLMMForCausalLM.from_pretrained(
        model_path, tune_clip=True, torch_dtype=torch.bfloat16, device_map='cpu'
    )
    model = model.to(device='cuda', dtype=torch.bfloat16)
    image_processor = build_transform(
        is_train=False, input_size=model.model.config.image_size, std_mode='OPENAI_CLIP'
    )
    mm_use_im_start_end = getattr(model.config, 'mm_use_im_start_end', False)
    assert mm_use_im_start_end
    tokenizer.add_tokens(
        [DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN],
        special_tokens=True,
    )
    vision_config = model.model.vision_config
    vision_config.im_patch_token = tokenizer.convert_tokens_to_ids([DEFAULT_IMAGE_PATCH_TOKEN])[0]
    vision_config.use_im_start_end = mm_use_im_start_end
    vision_config.im_start_token, vision_config.im_end_token = (
        tokenizer.convert_tokens_to_ids([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN])
    )
    image_token_len = model.model.config.num_query
    return model, image_processor, image_token_len, tokenizer


def expand_question_into_multimodal(question_text, image_token_len, im_st_token, im_ed_token, im_patch_token):
    if '<image>' in question_text[0]['content']:
        question_text[0]['content'] = question_text[0]['content'].replace(
            '<image>', im_st_token + im_patch_token * image_token_len + im_ed_token
        )
    else:
        question_text[0]['content'] = (
            im_st_token + im_patch_token * image_token_len + im_ed_token + '\n' + question_text[0]['content']
        )
    return question_text


def wrap_question_for_omni_lmm(question, image_token_len, tokenizer):
    from omnilmm.train.train_utils import omni_preprocess

    question = expand_question_into_multimodal(
        question, image_token_len, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, DEFAULT_IMAGE_PATCH_TOKEN,
    )
    conversation = question
    data_dict = omni_preprocess(sources=[conversation], tokenizer=tokenizer, generation=True)
    data_dict = dict(input_ids=data_dict['input_ids'][0], labels=data_dict['labels'][0])
    return data_dict


class OmniLMM12B(BaseModel):

    INSTALL_REQ = True
    INTERLEAVE = False

    def __init__(self, model_path, root, **kwargs) -> None:
        sys.path.append(root)
        model, img_processor, image_token_len, tokenizer = init_omni_lmm(model_path)
        self.model = model
        self.image_token_len = image_token_len
        self.image_transform = img_processor
        self.tokenizer = tokenizer
        self.model.eval()
        default_kwargs = dict(
            max_new_tokens=512,
            do_sample=False,
            output_scores=True,
            return_dict_in_generate=True,
            repetition_penalty=1.1,
        )
        default_kwargs.update(kwargs)
        self.kwargs = default_kwargs
        torch.cuda.empty_cache()

    def generate_inner(self, message, dataset=None):
        prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
        try:
            image = Image.open(image_path).convert('RGB')
        except:
            logger = get_logger('OmniLMM Inference')
            logger.error('Image Decode Error')
            return 'Image Decode Error'

        msgs = [dict(role='user', content=prompt)]
        input_ids = wrap_question_for_omni_lmm(msgs, self.image_token_len, self.tokenizer)['input_ids']
        input_ids = torch.as_tensor(input_ids)
        image = self.image_transform(image)

        with torch.inference_mode():
            output = self.model.generate_vllm(
                input_ids=input_ids.unsqueeze(0).cuda(),
                images=image.unsqueeze(0).half().cuda(),
                **self.kwargs,
            )
            response = self.tokenizer.decode(output.sequences[0], skip_special_tokens=True)
            response = response.strip()
            return response

    def use_custom_prompt(self, dataset):
        assert dataset is not None
        if DATASET_TYPE(dataset) == 'MCQ':
            return True
        return False

    def build_prompt(self, line, dataset=None):
        assert dataset is None or isinstance(dataset, str)
        assert self.use_custom_prompt(dataset)
        tgt_path = self.dump_image(line, dataset)

        question = line['question']
        options = {
            cand: line[cand]
            for cand in string.ascii_uppercase
            if cand in line and not pd.isna(line[cand])
        }
        options_prompt = 'Options:\n'
        for key, item in options.items():
            options_prompt += f'{key}. {item}\n'

        hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
        prompt = ''
        if hint is not None:
            prompt += f'Hint: {hint}\n'
        prompt += f'{question}\n'
        if len(options):
            prompt += options_prompt
        prompt = (
            """
Study the image carefully and pick the option associated with the correct answer.
Focus solely on selecting the option and avoid including any other content.\n
""" + prompt
        )

        message = [dict(type='text', value=prompt)]
        message.extend([dict(type='image', value=s) for s in tgt_path])
        return message
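The helper expand_question_into_multimodal above replaces the `<image>` placeholder with a run of patch tokens bracketed by start/end markers. A self-contained sketch of the same string construction, not part of the file above and using a toy image_token_len instead of the model's real num_query:

# '<image>' becomes <im_start> + <im_patch> * image_token_len + <im_end>
image_token_len = 3  # toy value for illustration
content = '<image>\nWhat is shown in the picture?'
expanded = content.replace('<image>', '<im_start>' + '<im_patch>' * image_token_len + '<im_end>')
print(expanded)
# <im_start><im_patch><im_patch><im_patch><im_end>
# What is shown in the picture?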
VLMEvalKit/vlmeval/vlm/open_flamingo.py
0 → 100644
import sys
import torch
from PIL import Image
import os.path as osp
import warnings
from .base import BaseModel
from ..smp import *
from huggingface_hub import snapshot_download


class OpenFlamingo(BaseModel):

    INSTALL_REQ = True
    INTERLEAVE = True

    def __init__(self, name, mpt_pth=None, ckpt_pth=None, **kwargs):
        if mpt_pth is None:
            raise ValueError(
                'Please set `mpt_pth` to the directory of MPT-7B, which is cloned from here: '
                'https://huggingface.co/mosaicml/mpt-7b. '
            )
        if ckpt_pth is None:
            raise ValueError(
                'Please set `ckpt_pth` to the openflamingo ckpt, which is the `checkpoint.pt` file downloaded '
                'from: https://huggingface.co/openflamingo/OpenFlamingo-9B-vitl-mpt7b/tree/main. '
            )
        else:
            if osp.exists(ckpt_pth):
                if ckpt_pth.endswith('checkpoint.pt'):
                    pass
                elif osp.isdir(ckpt_pth):
                    ckpt_pth = osp.join(ckpt_pth, 'checkpoint.pt')
                    if not osp.exists(ckpt_pth):
                        raise ValueError(f'File {ckpt_pth} does not exist. ')
            elif splitlen(ckpt_pth, '/') == 2:
                cache_path = get_cache_path(ckpt_pth)
                if cache_path is None:
                    snapshot_download(ckpt_pth)
                    cache_path = get_cache_path(ckpt_pth)
                if cache_path is None:
                    raise ValueError(f'Directory {cache_path} does not exist. ')
                else:
                    ckpt_pth = osp.join(cache_path, 'checkpoint.pt')

        self.name = name
        assert name in ['v2']
        self.mpt_pth = mpt_pth
        try:
            from open_flamingo import create_model_and_transforms
        except Exception as e:
            logging.critical('Please first install open_flamingo to use OpenFlamingo')
            raise e
        model, image_processor, tokenizer = create_model_and_transforms(
            clip_vision_encoder_path='ViT-L-14',
            clip_vision_encoder_pretrained='openai',
            lang_encoder_path=mpt_pth,
            tokenizer_path=mpt_pth,
            cross_attn_every_n_layers=4)
        ckpt = torch.load(ckpt_pth)
        model.load_state_dict(ckpt, strict=False)
        torch.cuda.empty_cache()
        self.model = model.eval().cuda()
        self.tokenizer = tokenizer
        self.tokenizer.padding_side = 'left'
        self.image_proc = image_processor
        kwargs_default = dict(max_new_tokens=512, num_beams=3)
        kwargs_default.update(kwargs)
        self.kwargs = kwargs_default
        warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')

    def generate_inner(self, message, dataset=None):
        vision_x = []
        prompt = ''
        for msg in message:
            if msg['type'] == 'image':
                img = Image.open(msg['value'])
                vision_x.append(self.image_proc(img).unsqueeze(0))
                prompt += '<image>'
            elif msg['type'] == 'text':
                prompt += msg['value']
        prompt += 'Answer: '
        vision_x = torch.cat(vision_x, dim=0) if len(vision_x) > 1 else vision_x[0]
        vision_x = vision_x.unsqueeze(1).unsqueeze(0)
        lang_x = self.tokenizer([prompt], return_tensors='pt')
        generated_text = self.model.generate(
            vision_x=vision_x.cuda(),
            lang_x=lang_x['input_ids'].cuda(),
            attention_mask=lang_x['attention_mask'].cuda(),
            **self.kwargs)
        generated_text = self.tokenizer.decode(generated_text[0])
        text = generated_text[len(prompt):].split('<|endofchunk|>')[0]
        return text
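generate_inner above stacks the per-image tensors and reshapes them into the layout open_flamingo expects, (batch, num_images, frames, channels, H, W). A self-contained sketch of that shaping, not part of the file above (the zero tensors stand in for preprocessed images):

import torch

frames = [torch.zeros(1, 3, 224, 224) for _ in range(2)]  # hypothetical preprocessed images
vision_x = torch.cat(frames, dim=0) if len(frames) > 1 else frames[0]  # (2, 3, 224, 224)
vision_x = vision_x.unsqueeze(1).unsqueeze(0)                          # (1, 2, 1, 3, 224, 224)
print(vision_x.shape)  # torch.Size([1, 2, 1, 3, 224, 224])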
VLMEvalKit/vlmeval/vlm/ovis.py
0 → 100644
import torch
from transformers import AutoModelForCausalLM

from .base import BaseModel
from ..dataset import DATASET_TYPE
from ..smp import *


class Ovis(BaseModel):
    INSTALL_REQ = False
    INTERLEAVE = True

    def __init__(self, model_path='AIDC-AI/Ovis1.5-Llama3-8B', **kwargs):
        assert model_path is not None
        # Recommend to install `transformers==4.43.2` and `torch==2.1.2`.
        self.model_path = model_path
        self.device = torch.cuda.current_device()
        self.dtype = torch.bfloat16
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_path,
            torch_dtype=self.dtype,
            multimodal_max_length=8192,
            trust_remote_code=True
        )
        self.model = self.model.eval().to(device=self.device)
        self.eos_token_id = self.model.generation_config.eos_token_id
        self.text_tokenizer = self.model.get_text_tokenizer()
        self.pad_token_id = self.text_tokenizer.pad_token_id
        self.visual_tokenizer = self.model.get_visual_tokenizer()
        self.conversation_formatter = self.model.get_conversation_formatter()
        self.image_placeholder = '<image>'
        self.gen_kwargs = dict(
            max_new_tokens=1024,
            do_sample=False,
            top_p=None,
            top_k=None,
            temperature=None,
            repetition_penalty=None,
            eos_token_id=self.eos_token_id,
            pad_token_id=self.pad_token_id,
            use_cache=True
        )
        self.gen_kwargs.update(kwargs)

    def use_custom_prompt(self, dataset):
        if DATASET_TYPE(dataset) == 'Y/N' or DATASET_TYPE(dataset) == 'MCQ':
            return True
        return False

    def build_prompt(self, line, dataset=None):
        assert self.use_custom_prompt(dataset)
        assert isinstance(dataset, str)
        tgt_path = self.dump_image(line, dataset)

        if DATASET_TYPE(dataset) == 'Y/N':
            prompt = self.build_yorn_prompt(line, dataset)
        elif DATASET_TYPE(dataset) == 'MCQ':
            prompt = self.build_multi_choice_prompt(line, dataset)
        else:
            raise RuntimeError(f'Invalid dataset type: {DATASET_TYPE(dataset)}')
        message = [dict(type='text', value=prompt)]
        message.extend([dict(type='image', value=s) for s in tgt_path])

        # interleave dataset
        if dataset.startswith('MMMU_'):
            from .. import MMMUDataset
            message = MMMUDataset.split_MMMU(message)

        return message

    def build_yorn_prompt(self, line, dataset=None):
        prompt = line['question']
        if listinstr(['HallusionBench'], dataset):
            prompt += ' Please answer yes or no.'
        prompt += '\n请用单个词或短语回答问题。' if cn_string(prompt) else '\nAnswer the question using a single word or phrase.'
        return prompt

    def build_multi_choice_prompt(self, line, dataset=None):
        question = line['question']
        hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
        if hint is not None:
            question = hint + '\n' + question

        options = {
            cand: line[cand]
            for cand in string.ascii_uppercase
            if cand in line and not pd.isna(line[cand])
        }
        for key, item in options.items():
            question += f'\n{key}. {item}'
        prompt = question

        if len(options):
            prompt += '\n请直接回答选项字母。' if cn_string(prompt) else "\nAnswer with the option's letter from the given choices directly."
        else:
            prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.'

        return prompt

    def generate_inner(self, message, dataset=None):
        prompt, input_ids, attention_mask, pixel_values = self.prepare_inputs(message)
        output_ids = self.model.generate(
            input_ids,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
            **self.gen_kwargs
        )
        response = self.text_tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()
        return response

    def prepare_inputs(self, message):
        # build query
        images = [x['value'] for x in message if x['type'] == 'image']
        texts = [x['value'] for x in message if x['type'] == 'text']
        if len(images) == 0:
            query = '\n'.join(texts)
        elif len(images) == 1 and len(texts) == 1:
            query = self.image_placeholder + '\n' + texts[0]
        else:
            # interleave sample
            chunks = [
                x['value'] if x['type'] == 'text' else self.image_placeholder for x in message
            ]
            query = '\n'.join(chunks)

        # format conversation
        prompt, input_ids = self.conversation_formatter.format_query(query)
        attention_mask = torch.ne(input_ids, self.text_tokenizer.pad_token_id)
        input_ids = input_ids.unsqueeze(0).to(device=self.device)
        attention_mask = attention_mask.unsqueeze(0).to(device=self.device)

        # preprocess images
        if len(images) == 0:
            pixel_values = [None]
        else:
            preprocessed_images = [
                self.visual_tokenizer.preprocess_image(Image.open(image)) for image in images
            ]
            pixel_values = [
                torch.cat(preprocessed_images, dim=0).to(device=self.device, dtype=self.dtype)
            ]

        return prompt, input_ids, attention_mask, pixel_values


class Ovis1_6(BaseModel):
    INSTALL_REQ = False
    INTERLEAVE = True

    def __init__(self, model_path='AIDC-AI/Ovis1.6-Gemma2-9B', **kwargs):
        assert model_path is not None
        # Recommend to install `python=3.10`, `transformers==4.44.2`, `torch==2.2.0`, and `numpy==1.24.3`
        self.model_path = model_path
        self.device = torch.cuda.current_device()
        self.dtype = torch.bfloat16
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_path,
            torch_dtype=self.dtype,
            multimodal_max_length=8192,
            trust_remote_code=True
        )
        self.model = self.model.eval().to(device=self.device)
        self.eos_token_id = self.model.generation_config.eos_token_id
        self.text_tokenizer = self.model.get_text_tokenizer()
        self.pad_token_id = self.text_tokenizer.pad_token_id
        self.visual_tokenizer = self.model.get_visual_tokenizer()
        self.max_partition = 9
        self.image_placeholder = '<image>'
        self.gen_kwargs = dict(
            max_new_tokens=1024,
            do_sample=False,
            top_p=None,
            top_k=None,
            temperature=None,
            repetition_penalty=None,
            eos_token_id=self.eos_token_id,
            pad_token_id=self.pad_token_id,
            use_cache=True
        )
        self.gen_kwargs.update(kwargs)

    def use_custom_prompt(self, dataset):
        if DATASET_TYPE(dataset) == 'Y/N' or DATASET_TYPE(dataset) == 'MCQ':
            return True
        return False

    def build_yorn_prompt(self, line, dataset=None):
        prompt = line['question'] + '\nAnswer the question using a single word or phrase.'
        return prompt

    def build_multi_choice_prompt(self, line, dataset=None):
        question = line['question']
        hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
        if hint is not None:
            question = hint + '\n' + question

        options = {
            cand: line[cand]
            for cand in string.ascii_uppercase
            if cand in line and not pd.isna(line[cand])
        }
        for key, item in options.items():
            question += f'\n{key}. {item}'
        prompt = question

        if len(options):
            prompt += "\nAnswer with the option's letter from the given choices directly."

        return prompt

    def build_prompt(self, line, dataset=None):
        assert self.use_custom_prompt(dataset)
        assert isinstance(dataset, str)
        tgt_path = self.dump_image(line, dataset)

        if DATASET_TYPE(dataset) == 'Y/N':
            prompt = self.build_yorn_prompt(line, dataset)
        elif DATASET_TYPE(dataset) == 'MCQ':
            prompt = self.build_multi_choice_prompt(line, dataset)
        else:
            raise RuntimeError(f'Invalid dataset type: {DATASET_TYPE(dataset)}')
        message = [dict(type='text', value=prompt)]
        message.extend([dict(type='image', value=s) for s in tgt_path])

        # interleave dataset
        if dataset.startswith('MMMU_'):
            from .. import MMMUDataset
            message = MMMUDataset.split_MMMU(message)

        return message

    def generate_inner(self, message, dataset=None):
        prompt, input_ids, attention_mask, pixel_values = self.prepare_inputs(message)
        output_ids = self.model.generate(
            input_ids,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
            **self.gen_kwargs
        )
        response = self.text_tokenizer.decode(output_ids[0], skip_special_tokens=True)
        return response

    def prepare_inputs(self, message):
        # build query
        images = [x['value'] for x in message if x['type'] == 'image']
        texts = [x['value'] for x in message if x['type'] == 'text']
        if len(images) == 0:
            query = '\n'.join(texts)
        elif len(images) == 1 and len(texts) == 1:
            query = self.image_placeholder + '\n' + texts[0]
        else:
            # interleaved sample
            chunks = [
                x['value'] if x['type'] == 'text' else self.image_placeholder for x in message
            ]
            query = '\n'.join(chunks)

        # preprocess inputs
        prompt, input_ids, pixel_values = self.model.preprocess_inputs(
            query, [Image.open(image) for image in images], max_partition=self.max_partition
        )

        # move to self.device
        attention_mask = torch.ne(input_ids, self.text_tokenizer.pad_token_id)
        input_ids = input_ids.unsqueeze(0).to(device=self.device)
        attention_mask = attention_mask.unsqueeze(0).to(device=self.device)
        pixel_values = [
            pixel_values.to(device=self.device, dtype=self.dtype) if pixel_values is not None else None
        ]

        return prompt, input_ids, attention_mask, pixel_values


class Ovis1_6_Plus(Ovis1_6):
    # Recommend to install `python=3.10`, `transformers==4.46.2`, `torch==2.4.0`, and `numpy==1.25.0`

    def build_mmmu_prompt(self, line, dataset: str) -> list[dict[str, str]]:
        import string
        import pandas as pd
        question = line['question']
        options = {
            cand: line[cand]
            for cand in string.ascii_uppercase
            if cand in line and not pd.isna(line[cand])
        }
        options_prompt = 'Options:\n'
        for key, item in options.items():
            options_prompt += f'{key}. {item}\n'
        hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
        prompt = ''
        if hint is not None:
            prompt += f'Hint: {hint}\n'
        prompt += f'Question: {question}\n'
        if len(options):
            prompt += options_prompt
            prompt += 'Please select the correct answer from the options above.'
        prompt = prompt.rstrip()
        return prompt

    def build_prompt(self, line, dataset=None):
        assert self.use_custom_prompt(dataset)
        assert isinstance(dataset, str)
        tgt_path = self.dump_image(line, dataset)

        if dataset.startswith('MMMU_'):
            prompt = self.build_mmmu_prompt(line, dataset)
        elif DATASET_TYPE(dataset) == 'Y/N':
            prompt = self.build_yorn_prompt(line, dataset)
        elif DATASET_TYPE(dataset) == 'MCQ':
            prompt = self.build_multi_choice_prompt(line, dataset)
        else:
            raise RuntimeError(f'Invalid dataset type: {DATASET_TYPE(dataset)}')

        message = [dict(type='image', value=s) for s in tgt_path] + [dict(type='text', value=prompt)]
        return message
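prepare_inputs above builds a single query string from an interleaved message: text chunks are kept, image chunks are replaced by the '<image>' placeholder, and everything is joined with newlines. A self-contained sketch of that construction, not part of the file above (the message list is hypothetical):

message = [
    dict(type='image', value='page1.png'),
    dict(type='text', value='What does the chart show?'),
    dict(type='image', value='page2.png'),
]
image_placeholder = '<image>'
chunks = [x['value'] if x['type'] == 'text' else image_placeholder for x in message]
query = '\n'.join(chunks)
print(query)
# <image>
# What does the chart show?
# <image>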
VLMEvalKit/vlmeval/vlm/paligemma.py
0 → 100644
from PIL import Image
import torch

from .base import BaseModel
from ..smp import *


class PaliGemma(BaseModel):
    INSTALL_REQ = False
    INTERLEAVE = False

    def __init__(self, model_path='google/paligemma-3b-mix-448', **kwargs):
        try:
            from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
        except Exception as e:
            logging.critical('Please install the latest version transformers.')
            raise e
        model = PaliGemmaForConditionalGeneration.from_pretrained(
            model_path,
            torch_dtype=torch.bfloat16,
            device_map='cpu',
            revision='bfloat16',
        ).eval()
        self.model = model.cuda()
        self.processor = AutoProcessor.from_pretrained(model_path)
        self.kwargs = kwargs

    def generate_inner(self, message, dataset=None):
        prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
        image = Image.open(image_path).convert('RGB')

        model_inputs = self.processor(text=prompt, images=image, return_tensors='pt').to('cuda')
        input_len = model_inputs['input_ids'].shape[-1]

        with torch.inference_mode():
            generation = self.model.generate(**model_inputs, max_new_tokens=512, do_sample=False)
            generation = generation[0][input_len:]
            res = self.processor.decode(generation, skip_special_tokens=True)
        return res
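generate_inner above decodes only the newly generated tokens: since the generated sequence echoes the prompt ids, it is sliced from input_len onward. A self-contained sketch of that pattern, not part of the file above (the tensors are hypothetical token ids):

import torch

input_ids = torch.tensor([[101, 7, 8, 9]])            # hypothetical prompt ids
generation = torch.tensor([[101, 7, 8, 9, 42, 43]])   # prompt ids followed by 2 new tokens
input_len = input_ids.shape[-1]
new_tokens = generation[0][input_len:]
print(new_tokens.tolist())  # [42, 43]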
VLMEvalKit/vlmeval/vlm/pandagpt.py
0 → 100644
import sys
import torch
import os.path as osp
import warnings
from .base import BaseModel
from ..smp import *


class PandaGPT(BaseModel):

    INSTALL_REQ = True
    INTERLEAVE = False

    def __init__(self, name, root=None, **kwargs):
        if root is None:
            raise ValueError('Please set `root` to PandaGPT code directory, which is cloned from here: ')

        assert name == 'PandaGPT_13B'
        self.name = name
        sys.path.append(osp.join(root, 'code'))
        try:
            from model.openllama import OpenLLAMAPEFTModel
        except Exception as e:
            logging.critical(
                'Please first install PandaGPT and set the root path to use PandaGPT, '
                'which is cloned from here: https://github.com/yxuansu/PandaGPT. '
            )
            raise e
        self.args = {
            'model': 'openllama_peft',
            'imagebind_ckpt_path': osp.join(root, 'pretrained_ckpt/imagebind_ckpt'),
            'vicuna_ckpt_path': osp.join(root, 'pretrained_ckpt/vicuna_ckpt/13b_v0'),
            'delta_ckpt_path': osp.join(root, 'pretrained_ckpt/pandagpt_ckpt/13b/pytorch_model.pt'),
            'stage': 2,
            'max_tgt_len': 512,
            'lora_r': 32,
            'lora_alpha': 32,
            'lora_dropout': 0.1,
        }
        model = OpenLLAMAPEFTModel(**self.args)
        delta_ckpt = torch.load(self.args['delta_ckpt_path'], map_location=torch.device('cpu'))
        model.load_state_dict(delta_ckpt, strict=False)
        torch.cuda.empty_cache()
        self.model = model.eval().half().cuda()
        kwargs_default = {'top_p': 0.9, 'do_sample': False, 'max_tgt_len': 128, 'temperature': 0.001}
        kwargs_default.update(kwargs)
        self.kwargs = kwargs_default
        warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')

    def generate_inner(self, message, dataset=None):
        prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
        struct = {
            'prompt': prompt,
            'image_paths': [image_path],
            'audio_paths': [],
            'video_paths': [],
            'thermal_paths': [],
            'modality_embeds': []
        }
        struct.update(self.kwargs)
        resp = self.model.generate(struct)
        return resp
VLMEvalKit/vlmeval/vlm/parrot.py
0 → 100644
import os
import torch
from PIL import Image
from abc import abstractproperty
from .base import BaseModel
from ..dataset import DATASET_TYPE
from ..smp import *


class Parrot(BaseModel):
    INSTALL_REQ = False
    INTERLEAVE = False

    def __init__(self, model_path='AIDC-AI/Parrot-7B', **kwargs):
        try:
            from parrot.model.parrot_arch import ParrotMetaForCausalLM
            from parrot.utils.constants import DEFAULT_IMAGE_TOKEN, BEGIN_LINE, END_LINE
            from parrot.model.conversation_formatter import ConversationFormatter
            from parrot.utils.mm_utils import process_images
        except Exception as e:
            logging.critical('Please install Parrot before using Parrot')
            logging.critical('Please install Parrot from https://github.com/AIDC-AI/Parrot')
            logging.critical('Using `pip install -e . --no-deps` in the Parrot directory')
            logging.critical('Recommend to install transformers==4.39.0')
            raise e

        self.process_images = process_images
        self.ConversationFormatter = ConversationFormatter
        self.DEFAULT_IMAGE_TOKEN = DEFAULT_IMAGE_TOKEN
        self.BEGIN_LINE = BEGIN_LINE
        self.END_LINE = END_LINE

        try:
            model_name = 'parrot_qwen2'
            model, tokenizer, conversation_formatter = ParrotMetaForCausalLM.build(
                model_name, model_path, mm_vision_tower='openai/clip-vit-large-patch14-336'
            )
            self.model = model.cuda()
            self.vision_tower = self.model.get_vision_tower()
            self.tokenizer = tokenizer
            self.conversation_formatter = conversation_formatter
            self.image_processor = self.model.get_vision_tower().image_processor
        except Exception as e:
            logging.critical('Error when loading Parrot model:')
            raise e

        self.kwargs = dict(
            do_sample=False,
            num_beams=1,
            max_new_tokens=512,
            repetition_penalty=None,
            use_cache=True,
            eos_token_id=self.tokenizer.eos_token_id,
            pad_token_id=self.tokenizer.pad_token_id
        )
        if int(os.environ.get('LOCAL_RANK', '0')) == 0:
            print(f'Following kwargs {self.kwargs} will be used as generation config.')
        self.count = 0

    def use_custom_prompt(self, dataset):
        if DATASET_TYPE(dataset) == 'Y/N' or DATASET_TYPE(dataset) == 'MCQ':
            return True
        return False

    def build_prompt(self, line, dataset=None):
        assert self.use_custom_prompt(dataset)
        assert isinstance(dataset, str)
        tgt_path = self.dump_image(line, dataset)

        if DATASET_TYPE(dataset) == 'Y/N':
            prompt = self.built_yorn_prompt(line, dataset)
        elif DATASET_TYPE(dataset) == 'MCQ':
            prompt = self.build_multi_choice_prompt(line, dataset)
        else:
            raise ValueError(f'Invalid dataset type: {DATASET_TYPE(dataset)}')

        message = [dict(type='text', value=prompt)]
        message.extend([dict(type='image', value=p) for p in tgt_path])
        return message

    def built_yorn_prompt(self, line, dataset=None):
        prompt = line['question']
        previous_suffixs = [' Please answer yes or no.', ' Yes or No', ' Answer in one sentence.']
        for previous_suffix in previous_suffixs:
            if prompt.endswith(previous_suffix):
                prompt = prompt[:-len(previous_suffix)]
                break
        prompt += '\n请直接回答Yes或No。请用单个词或短语回答问题。' if cn_string(prompt) \
            else '\nPlease strictly answer Yes or No. Answer the question using a single word or phrase.'
        return prompt

    def build_multi_choice_prompt(self, line, dataset=None):
        question = line['question']
        hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
        if hint is not None:
            question = hint + '\n' + question

        options = {
            cand: line[cand]
            for cand in string.ascii_uppercase
            if cand in line and not pd.isna(line[cand])
        }
        for key, item in options.items():
            question += f'\n{key}. {item}'
        prompt = question

        if len(options):
            default_prompt = "\nAnswer with the option's letter from the given choices directly."
            if dataset[-3:] == '_cn' or cn_string(prompt):
                default_prompt = '\n请直接用给定选项中的选项字母回答。'
            elif dataset[-3:] == '_pt':
                default_prompt = '\nResponda diretamente com a letra da opção das escolhas dadas.'
            elif dataset[-3:] == '_ar':
                default_prompt = '\nأجب مباشرةً بحرف الخيار من الاختيارات المعطاة.'
            elif dataset[-3:] == '_ru':
                default_prompt = '\nОтветьте буквой варианта из предложенных вариантов напрямую.'
            elif dataset[-3:] == '_tr':
                default_prompt = '\nVerilen seçeneklerden doğrudan seçeneğin harfi ile cevap verin.'
            prompt += default_prompt
            # prompt += (
            #     '\n请直接回答选项字母。' if cn_string(prompt) else
            #     "\nAnswer with the option's letter from the given choices directly."
            # )
        else:
            prompt += '\n请用单个词或短语回答问题。' if cn_string(prompt) else '\nAnswer the question using a single word or phrase.'

        return prompt

    def process_answer_prefix(self, answer, prefixes):
        for prefix in prefixes:
            if prefix in answer.lower():
                return answer[answer.lower().find(prefix) + len(prefix):]
        return answer

    def generate_inner(self, message, dataset=None):
        query, image_paths = self.prepare_inputs(message)
        images_list = [Image.open(image_path).convert('RGB') for image_path in image_paths]
        args = abstractproperty()
        args.image_aspect_ratio = 'pad'
        image_tensors = self.process_images(images_list, self.image_processor, args).cuda()
        prompt, input_ids = self.conversation_formatter.format_query(query)
        input_ids = input_ids.unsqueeze(0).cuda()

        with torch.inference_mode():
            kwargs = dict(images=image_tensors)
            kwargs.update(self.kwargs)
            output_ids = self.model.generate(input_ids, **kwargs)

        input_token_len = input_ids.shape[1]
        n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
        if n_diff_input_output > 0:
            print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
        response = self.tokenizer.batch_decode(
            output_ids[:, input_token_len:], skip_special_tokens=True
        )[0].strip(string.whitespace)

        answer = response
        if query.endswith("Answer with the option's letter from the given choices directly.") \
                or query.endswith('请直接回答选项字母。'):
            qtype = 'multiple-choice'
            while True:
                answer = answer.strip(string.punctuation + string.whitespace)
                if len(answer) > 1:
                    if answer[0] in string.ascii_uppercase and answer[1] in string.whitespace + string.punctuation:
                        answer = answer[0]
                        break
                    elif answer[-1] in string.ascii_uppercase and answer[-2] in string.whitespace + string.punctuation:
                        answer = answer[-1]
                        break
                    elif listinstr(['answer is', 'answer:'], answer.lower()):
                        answer = self.process_answer_prefix(answer, ['answer is', 'answer:'])
                        answer = self.process_answer_prefix(answer, ['option'])
                    else:
                        break
                else:
                    break
        else:
            qtype = 'open'

        if self.count % 50 == 0 and int(os.environ.get('LOCAL_RANK', '0')) == 0:
            print(f'\n{self.BEGIN_LINE}')
            print(f'image_paths: {image_paths}\n')
            print(f'prompt: {prompt}\n')
            print(f'qtype: {qtype}\n')
            print(f'output: {response}\n')
            print(f'answer: {answer}\n')
            print(f'{self.END_LINE}\n', flush=True)
        self.count += 1
        return answer

    def prepare_inputs(self, message):
        prompt = ''
        image_paths = []
        image_count = 0
        text_count = 0
        pure_text = ''
        for msg in message:
            if msg['type'] == 'text':
                text_count += 1
                prompt += msg['value']
                pure_text += msg['value']
            elif msg['type'] == 'image':
                image_count += 1
                prompt += self.DEFAULT_IMAGE_TOKEN
                image_paths.append(msg['value'])
        if image_count == 1 and text_count == 1:
            prompt = self.DEFAULT_IMAGE_TOKEN + '\n' + pure_text
        return prompt, image_paths
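The multiple-choice branch of generate_inner above strips punctuation and peels common prefixes such as "answer is" until a bare option letter remains. A simplified, self-contained sketch of that extraction, not part of the file above (the raw response string is hypothetical):

import string

answer = 'The answer is: B.'
answer = answer.strip(string.punctuation + string.whitespace)
if 'answer is' in answer.lower():
    # drop everything up to and including the 'answer is' prefix
    answer = answer[answer.lower().find('answer is') + len('answer is'):]
answer = answer.strip(string.punctuation + string.whitespace)
print(answer)  # B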
VLMEvalKit/vlmeval/vlm/phi3_vision.py
0 → 100644
from PIL import Image
import torch
from .base import BaseModel
from ..smp import *


class Phi3Vision(BaseModel):

    INSTALL_REQ = False
    INTERLEAVE = False

    def __init__(self, model_path='microsoft/Phi-3-vision-128k-instruct', **kwargs):
        try:
            from transformers import AutoProcessor, AutoModelForCausalLM
        except Exception as e:
            logging.critical('Please install the latest version transformers.')
            raise e
        model = AutoModelForCausalLM.from_pretrained(
            model_path, device_map='cuda', trust_remote_code=True, torch_dtype='auto'
        ).eval()
        processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
        self.model = model
        self.processor = processor
        self.kwargs = kwargs

    def generate_inner(self, message, dataset=None):
        prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
        image = Image.open(image_path).convert('RGB')
        messages = [
            {'role': 'user', 'content': f'<|image_1|>\n{prompt}'}
        ]
        prompt = self.processor.tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = self.processor(prompt, [image], return_tensors='pt').to('cuda')

        generation_args = {
            'max_new_tokens': 500,
            'temperature': 0.0,
            'do_sample': False,
        }
        generation_args.update(self.kwargs)

        generate_ids = self.model.generate(
            **inputs,
            eos_token_id=self.processor.tokenizer.eos_token_id,
            **generation_args
        )
        generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
        response = self.processor.batch_decode(
            generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )[0]
        return response

    def chat_inner(self, message, dataset=None):
        messages = []
        image_cnt = 1
        image_list = []
        for msg in message:
            content = ''
            # If message is just text in the conversation
            if len(msg['content']) == 1 and msg['content'][0]['type'] == 'text':
                msg_new = {'role': msg['role'], 'content': msg['content'][0]['value']}
                messages.append(msg_new)
                continue
            # If both image & text is present
            for x in msg['content']:
                if x['type'] == 'text':
                    content += x['value']
                elif x['type'] == 'image':
                    image = Image.open(x['value']).convert('RGB')
                    content += f'<|image_{image_cnt}|>\n'
                    image_list.append(image)
                    image_cnt += 1
            msg_new = {'role': msg['role'], 'content': content}
            messages.append(msg_new)

        prompt = self.processor.tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = self.processor(prompt, image_list, return_tensors='pt').to('cuda')

        generation_args = {
            'max_new_tokens': 500,
            'temperature': 0.0,
            'do_sample': False,
        }
        generation_args.update(self.kwargs)

        generate_ids = self.model.generate(
            **inputs,
            eos_token_id=self.processor.tokenizer.eos_token_id,
            **generation_args
        )
        generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
        response = self.processor.batch_decode(
            generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )[0]
        return response


class Phi3_5Vision(BaseModel):

    INSTALL_REQ = False
    INTERLEAVE = False

    def __init__(self, model_path='microsoft/Phi-3.5-vision-instruct', **kwargs):
        try:
            from transformers import AutoProcessor, AutoModelForCausalLM
        except Exception as e:
            logging.critical('Please install the latest version transformers.')
            raise e
        model = AutoModelForCausalLM.from_pretrained(
            model_path, device_map='cuda', trust_remote_code=True,
            torch_dtype='auto', _attn_implementation='flash_attention_2'
        ).eval()
        # for best performance, use num_crops=4 for multi-frame, num_crops=16 for single-frame.
        processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True, num_crops=4)
        self.model = model
        self.processor = processor
        self.kwargs = kwargs

    def generate_inner(self, message, dataset=None):
        prompt = '\n'.join([msg['value'] for msg in message if msg['type'] == 'text'])
        images = [Image.open(msg['value']).convert('RGB') for msg in message if msg['type'] == 'image']
        num_images = len(images)
        placeholder = ''
        for i in range(1, num_images + 1):
            placeholder += f'<|image_{i}|>\n'
        messages = [
            {'role': 'user', 'content': placeholder + prompt}
        ]
        prompt = self.processor.tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        inputs = self.processor(prompt, images, return_tensors='pt').to('cuda')

        generation_args = {
            'max_new_tokens': 1000,
            'temperature': 0.0,
            'do_sample': False,
        }
        generation_args.update(self.kwargs)

        generate_ids = self.model.generate(
            **inputs,
            eos_token_id=self.processor.tokenizer.eos_token_id,
            **generation_args
        )
        # remove input tokens
        generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
        response = self.processor.batch_decode(
            generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )[0]
        return response
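Phi3_5Vision.generate_inner above prefixes the user prompt with one numbered placeholder per image, `<|image_1|>` through `<|image_N|>`. A self-contained sketch of that loop, not part of the file above (the image count is hypothetical):

num_images = 3  # hypothetical number of images in the message
placeholder = ''
for i in range(1, num_images + 1):
    placeholder += f'<|image_{i}|>\n'
print(placeholder)
# <|image_1|>
# <|image_2|>
# <|image_3|>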
VLMEvalKit/vlmeval/vlm/pixtral.py
0 → 100644
import torch
from PIL import Image
from .base import BaseModel
from ..smp import *
import warnings
from huggingface_hub import snapshot_download


class Pixtral(BaseModel):
    INSTALL_REQ = False
    INTERLEAVE = True

    def __init__(self, model_path='mistralai/Pixtral-12B-2409', **kwargs):
        self.model_path = model_path
        try:
            from mistral_inference.transformer import Transformer
            from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
        except ImportError as err:
            logging.critical('Please install `mistral-inference` and `mistral_common`')
            raise err

        if os.path.exists(model_path):
            cache_path = model_path
        else:
            if get_cache_path(model_path) is None:
                snapshot_download(repo_id=model_path)
            cache_path = get_cache_path(self.model_path)

        self.tokenizer = MistralTokenizer.from_file(f'{cache_path}/tekken.json')
        model = Transformer.from_folder(cache_path, device='cpu')
        model.cuda()
        self.model = model
        self.max_tokens = 512

    def generate_inner(self, message, dataset=None):
        try:
            from mistral_inference.generate import generate
            from mistral_common.protocol.instruct.messages import UserMessage, TextChunk, ImageURLChunk
            from mistral_common.protocol.instruct.request import ChatCompletionRequest
        except ImportError as err:
            logging.critical('Please install `mistral-inference` and `mistral_common`')
            raise err

        msg_new = []
        for msg in message:
            tp, val = msg['type'], msg['value']
            if tp == 'text':
                msg_new.append(TextChunk(text=val))
            elif tp == 'image':
                b64 = encode_image_file_to_base64(val)
                image_url = f'data:image/jpeg;base64,{b64}'
                msg_new.append(ImageURLChunk(image_url=image_url))

        completion_request = ChatCompletionRequest(messages=[UserMessage(content=msg_new)])
        encoded = self.tokenizer.encode_chat_completion(completion_request)
        images = encoded.images
        tokens = encoded.tokens

        out_tokens, _ = generate(
            [tokens],
            self.model,
            images=[images],
            max_tokens=self.max_tokens,
            temperature=0,
            eos_id=self.tokenizer.instruct_tokenizer.tokenizer.eos_id
        )
        result = self.tokenizer.decode(out_tokens[0])
        return result
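Image chunks above are passed to the tokenizer as base64 data URLs. A self-contained sketch of that URL construction, not part of the file above; encode_image_file_to_base64 comes from this repo's ..smp module, so plain base64 is used here instead, and the image path is hypothetical:

import base64

with open('example.jpg', 'rb') as f:  # hypothetical local image
    b64 = base64.b64encode(f.read()).decode('utf-8')
image_url = f'data:image/jpeg;base64,{b64}'
print(image_url[:40] + '...')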
VLMEvalKit/vlmeval/vlm/points.py
0 → 100644
import transformers
from PIL import Image
import torch
import re
from .base import BaseModel
from ..dataset import DATASET_TYPE
from ..smp import cn_string, listinstr
import pandas as pd
import string
from typing import List


class POINTS(BaseModel):
    """Official implementation of POINTS: Improving Your Vision-language Model with Affordable Strategies  # noqa

    Paper link: https://arxiv.org/abs/2409.04828
    POINTS is a vision-language model developed by researchers at WeChat AI. This model represents the
    inaugural version in our series of multimodal models, known as WePOINTS.

    Args:
        model_path (str): The path or the name (the unique huggingface id) of the model.
    """

    def __init__(self, model_path: str, **kwargs) -> None:
        from transformers import AutoModelForCausalLM, AutoTokenizer
        from transformers import CLIPImageProcessor
        version = transformers.__version__
        use_fast = True
        if 'yi' in model_path.lower():
            assert version == '4.38.2', f'The version of transformers for Yi-1.5 should be 4.38.2, but got {version}.'  # noqa
            use_fast = False
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=use_fast)
        self.model = AutoModelForCausalLM.from_pretrained(model_path,
                                                          trust_remote_code=True,  # noqa
                                                          device_map='cuda').to(torch.bfloat16)
        self.image_processor = CLIPImageProcessor.from_pretrained(model_path)

    def use_custom_prompt(self, dataset: str) -> bool:
        """Whether to use custom prompt for the dataset.

        Args:
            dataset (str): The name of the dataset.

        Returns:
            bool: Whether to use custom prompt for the dataset.
        """
        if DATASET_TYPE(dataset) == 'MCQ':
            return True
        return False

    def build_prompt(self, line: str, dataset: str) -> List[dict]:
        """Build prompt for multi-choice dataset.

        Args:
            line (str): one line of the dataset.
            dataset (str): The name of the dataset.

        Returns:
            List[dict]: A list of elements constructed for current line.
        """
        assert self.use_custom_prompt(dataset)
        assert isinstance(dataset, str)
        tgt_path = self.dump_image(line, dataset)

        question = line['question']
        hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
        if hint is not None:
            question = hint + '\n' + question

        options = {
            cand: line[cand]
            for cand in string.ascii_uppercase
            if cand in line and not pd.isna(line[cand])
        }
        for key, item in options.items():
            question += f'\n{key}. {item}'
        prompt = question

        if len(options):
            prompt += (
                '\n请直接回答选项字母。' if cn_string(prompt) else  # noqa
                "\nAnswer with the option\'s letter from the given choices directly."  # noqa
            )
        else:
            prompt += '\n请直接回答问题。' if cn_string(  # noqa
                prompt) else '\nAnswer the question directly.'
        message = [dict(type='image', value=s) for s in tgt_path]
        message.append(dict(type='text', value=prompt))
        return message

    def generate_inner(self, message: List[dict], dataset: str = None) -> str:
        """Generate response for the given message.

        Args:
            message (List[dict]): A list of elements constructed for
                current line.
            dataset (str): The name of the dataset.

        Returns:
            str: The generated response.
        """
        prompt, image_path = self.message_to_promptimg(message)
        catty = True  # whether to use catty
        if dataset == 'HallusionBench':
            prompt = prompt + \
                ' Please answer yes or no. Answer the question using a single word or phrase.'  # noqa
        elif dataset == 'MMVet':
            prompt = prompt + ' Answer this question in detail.'
            catty = False
        else:
            # use default setting
            pass

        if dataset is None:
            max_splits = 8
        elif listinstr(['MMBench', 'OCRBench'], dataset):
            max_splits = 12
        else:
            max_splits = 8

        image = Image.open(image_path).convert('RGB')
        generation_config = {
            'max_new_tokens': 1024,
            'temperature': 0.0,
            'top_p': 0.0,
            'num_beams': 1,
        }
        response = self.model.chat(
            image, prompt, self.tokenizer, self.image_processor, catty, generation_config, max_splits
        )
        return response


class POINTSV15(BaseModel):
    """Official implementation of POINTSv1.5

    This implementation is based on the official implementation of POINTSv1.5
    (https://github.com/WePOINTS/WePOINTS)

    Args:
        model_path (str): The path or the name (the unique huggingface id)
            of the model.
    """

    def __init__(self, model_path: str, **kwargs) -> None:
        from transformers import AutoModelForCausalLM, AutoTokenizer
        from transformers import QuantoConfig
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        quant_config = QuantoConfig(modules_to_not_convert=['vision_encoder'])
        self.model = AutoModelForCausalLM.from_pretrained(model_path,
                                                          trust_remote_code=True,  # noqa
                                                          device_map='cuda',
                                                          torch_dtype=torch.bfloat16,
                                                          quantization_config=quant_config)
        try:
            from wepoints.utils.images import Qwen2ImageProcessorForPOINTSV15
        except ImportError:
            print('Please install WePOINTS, and refer to https://github.com/WePOINTS/WePOINTS')
        self.image_processor = Qwen2ImageProcessorForPOINTSV15.from_pretrained(model_path)  # noqa

    def use_custom_prompt(self, dataset: str) -> bool:
        """Whether to use custom prompt for the dataset.

        Args:
            dataset (str): The name of the dataset.

        Returns:
            bool: Whether to use custom prompt for the dataset.
        """
        if DATASET_TYPE(dataset) == 'MCQ':
            return True
        return False

    def build_prompt(self, line: str, dataset: str) -> List[dict]:
        """Build prompt for multi-choice dataset.

        Args:
            line (str): one line of the dataset.
            dataset (str): The name of the dataset.

        Returns:
            List[dict]: A list of elements constructed for current line.
        """
        assert self.use_custom_prompt(dataset)
        assert isinstance(dataset, str)
        tgt_path = self.dump_image(line, dataset)

        question = line['question']
        hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
        if hint is not None:
            question = hint + '\n' + question

        options = {
            cand: line[cand]
            for cand in string.ascii_uppercase
            if cand in line and not pd.isna(line[cand])
        }
        for key, item in options.items():
            question += f'\n{key}. {item}'
        prompt = question

        if len(options):
            prompt += (
                '\n请直接回答选项字母。' if cn_string(prompt) else  # noqa
                "\nAnswer with the option\'s letter from the given choices directly."  # noqa
            )
        else:
            prompt += '\n请直接回答问题。' if cn_string(  # noqa
                prompt) else '\nAnswer the question directly.'
        message = [dict(type='image', value=s) for s in tgt_path]
        message.append(dict(type='text', value=prompt))
        return message

    def set_image_processor(self, dataset: str) -> None:
        """Set the image processor for the dataset.

        Args:
            dataset (str): The name of the dataset.
        """
        if dataset in ['OCRBench']:
            self.image_processor.min_pixels = 280 * 280
        elif dataset in ['MMMU_DEV_VAL']:
            self.image_processor.min_pixels = 1280 * 28 * 28
            self.image_processor.max_pixels = 16384 * 28 * 28
        elif dataset in ['MathVista_MINI']:
            self.image_processor.min_pixels = 56 * 56
        elif dataset in ['MMVet', 'HallusionBench', 'MMBench_TEST_EN_V11', 'MMBench_TEST_CN_V11']:
            self.image_processor.min_pixels = 1280 * 28 * 28
        else:
            self.image_processor.min_pixels = 840 * 840

    def construct_messages(self, prompt: str, image_paths: List[str]) -> List[dict]:
        """Construct messages for the given prompt and image paths.

        Args:
            prompt (str): The prompt for the generation.
            image_paths (List[str]): A list of image paths.

        Returns:
            List[dict]: A list of elements constructed for current line.
        """
        content = []
        for image_path in image_paths:
            content.append(
                dict(type='image', image=image_path)
            )
        content.append(
            dict(type='text', text=prompt)
        )
        messages = [
            {
                'role': 'user',
                'content': content
            }
        ]
        return messages

    def generate_inner(self, message: List[dict], dataset: str = None) -> str:
        """Generate response for the given message.

        Args:
            message (List[dict]): A list of elements constructed for
                current line.
            dataset (str): The name of the dataset.

        Returns:
            str: The generated response.
        """
        self.set_image_processor(dataset)
        prompt, image_paths = self.message_to_promptimg(message)
        image_paths = [image_paths]
        if dataset == 'HallusionBench':
            prompt = prompt + \
                ' Please answer yes or no. Answer the question using a single word or phrase.'  # noqa
        elif dataset == 'MMVet':
            prompt = prompt + ' Answer this question in detail.'
        else:
            # use default setting
            pass
        pattern = r'<image \d+>'
        prompt = re.sub(pattern, '\n', prompt)
        messages = self.construct_messages(prompt, image_paths)
        generation_config = {
            'max_new_tokens': 1024,
            'temperature': 0.0,
            'top_p': 0.0,
            'num_beams': 1,
        }
        response = self.model.chat(
            messages, self.tokenizer, self.image_processor, generation_config
        )
        return response
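POINTSV15.set_image_processor above picks per-dataset pixel budgets for the image processor. A self-contained sketch of the same branching, not part of the file above; SimpleNamespace stands in for the real WePOINTS image processor and the dataset name is hypothetical:

from types import SimpleNamespace

image_processor = SimpleNamespace(min_pixels=None, max_pixels=None)  # stand-in processor
dataset = 'MMMU_DEV_VAL'  # hypothetical dataset name
if dataset in ['OCRBench']:
    image_processor.min_pixels = 280 * 280
elif dataset in ['MMMU_DEV_VAL']:
    image_processor.min_pixels = 1280 * 28 * 28
    image_processor.max_pixels = 16384 * 28 * 28
else:
    image_processor.min_pixels = 840 * 840
print(image_processor.min_pixels)  # 1003520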