ModelZoo / Qwen2-VL_pytorch / Commits

Commit bc5ebf0f, authored Dec 27, 2024 by luopl

Initial commit

Pipeline #2167 canceled with stages. Changes: 260. Pipelines: 1.
Showing 20 changed files with 4357 additions and 0 deletions (+4357, -0).
VLMEvalKit/vlmeval/vlm/h2ovl_mississippi.py                    +117  -0
VLMEvalKit/vlmeval/vlm/idefics.py                              +309  -0
VLMEvalKit/vlmeval/vlm/instructblip.py                         +57   -0
VLMEvalKit/vlmeval/vlm/internvl/__init__.py                    +3    -0
VLMEvalKit/vlmeval/vlm/internvl/internvl_chat.py               +353  -0
VLMEvalKit/vlmeval/vlm/internvl/utils.py                       +349  -0
VLMEvalKit/vlmeval/vlm/janus.py                                +136  -0
VLMEvalKit/vlmeval/vlm/kosmos.py                               +114  -0
VLMEvalKit/vlmeval/vlm/llama_vision.py                         +204  -0
VLMEvalKit/vlmeval/vlm/llava/__init__.py                       +4    -0
VLMEvalKit/vlmeval/vlm/llava/llava.py                          +900  -0
VLMEvalKit/vlmeval/vlm/llava/llava_xtuner.py                   +239  -0
VLMEvalKit/vlmeval/vlm/mantis.py                               +201  -0
VLMEvalKit/vlmeval/vlm/mgm.py                                  +158  -0
VLMEvalKit/vlmeval/vlm/minicpm_v.py                            +471  -0
VLMEvalKit/vlmeval/vlm/minigpt4.py                             +85   -0
VLMEvalKit/vlmeval/vlm/minimonkey.py                           +534  -0
VLMEvalKit/vlmeval/vlm/misc/blip2_instruct_vicuna13b.yaml      +43   -0
VLMEvalKit/vlmeval/vlm/misc/blip2_instruct_vicuna7b.yaml       +43   -0
VLMEvalKit/vlmeval/vlm/misc/minigpt4_13b_eval.yaml             +37   -0
VLMEvalKit/vlmeval/vlm/h2ovl_mississippi.py (new file, mode 100644)

import torch
from transformers import AutoTokenizer, AutoModel
import warnings
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE
import pandas as pd
import string


class H2OVLChat(BaseModel):

    INSTALL_REQ = False
    INTERLEAVE = True

    def __init__(self, model_path='h2oai/h2ovl-mississippi-2b', **kwargs):
        assert model_path is not None
        self.model_path = model_path
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=False)
        device = torch.cuda.current_device()
        self.device = device
        self.model = AutoModel.from_pretrained(
            model_path,
            torch_dtype=torch.bfloat16,
            trust_remote_code=True).eval()
        self.model = self.model.to(device)
        self.image_size = self.model.config.vision_config.image_size

        kwargs_default = dict(do_sample=False, max_new_tokens=1024, top_p=None, num_beams=1)
        kwargs_default.update(kwargs)
        self.kwargs = kwargs_default
        warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')

    def use_custom_prompt(self, dataset):
        return True

    def build_multi_choice_prompt(self, line, dataset=None):
        question = line['question']
        hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
        if hint is not None:
            question = hint + '\n' + question

        options = {
            cand: line[cand]
            for cand in string.ascii_uppercase
            if cand in line and not pd.isna(line[cand])
        }
        for key, item in options.items():
            question += f'\n{key}. {item}'
        prompt = question

        if len(options):
            prompt += '\n请直接回答选项字母。' if cn_string(prompt) else \
                "\nAnswer with the option's letter from the given choices directly."
        else:
            prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.'

        return prompt

    def build_prompt(self, line, dataset=None):
        assert self.use_custom_prompt(dataset)
        assert dataset is None or isinstance(dataset, str)
        tgt_path = self.dump_image(line, dataset)

        if dataset is not None and listinstr(['MME'], dataset):
            question = line['question']
            prompt = question + ' Answer the question using a single word or phrase.'
        elif dataset is not None and listinstr(['HallusionBench'], dataset):
            question = line['question']
            prompt = question + ' Please answer yes or no. Answer the question using a single word or phrase.'
        elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ':
            prompt = self.build_multi_choice_prompt(line, dataset)
        elif dataset is not None and DATASET_TYPE(dataset) == 'VQA':
            if 'MathVista' in dataset:
                prompt = line['question']
            elif listinstr(['LLaVABench'], dataset):
                question = line['question']
                prompt = question + '\nAnswer this question in detail.'
            elif listinstr(['MMVet'], dataset):
                prompt = line['question']
            else:
                question = line['question']
                prompt = question + '\nAnswer the question using a single word or phrase.'
        else:
            prompt = line['question']

        message = [dict(type='text', value=prompt)]
        message.extend([dict(type='image', value=s) for s in tgt_path])
        return message

    def generate_inner(self, message, dataset=None):
        image_num = len([x for x in message if x['type'] == 'image'])
        question = ''
        image_files = [x['value'] for x in message if x['type'] == 'image']

        if image_num == 1:
            question = '<image>\n' + '\n'.join([x['value'] for x in message if x['type'] == 'text'])
        elif image_num > 1:
            text_part = ' '.join([x['value'] for x in message if x['type'] == 'text'])
            image_part = ' '.join([f'<image-{i + 1}>: <image>' for i in range(image_num)])
            question = image_part + '\n' + text_part
        else:
            question = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
            image_files = None

        response, history = self.model.chat(
            self.tokenizer,
            image_files=image_files,
            question=question,
            generation_config=self.kwargs,
            max_tiles=6,
            history=None,
            return_history=True
        )
        return response
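For orientation while reading this commit, a minimal usage sketch of the wrapper above (not part of the diff): generate_inner expects a list of dicts whose type is 'text' or 'image'. The image path below is a placeholder, and running it assumes a CUDA device plus the h2oai/h2ovl-mississippi-2b checkpoint and an installed VLMEvalKit.

# Illustrative sketch only, not part of this commit; the image path is hypothetical.
from vlmeval.vlm.h2ovl_mississippi import H2OVLChat

model = H2OVLChat(model_path='h2oai/h2ovl-mississippi-2b', max_new_tokens=256)
message = [
    dict(type='text', value='Describe this picture in one sentence.'),
    dict(type='image', value='/path/to/example.jpg'),   # placeholder path
]
print(model.generate_inner(message))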
VLMEvalKit/vlmeval/vlm/idefics.py (new file, mode 100644)

import torch
import os.path as osp
import warnings
from .base import BaseModel
from ..smp import splitlen, listinstr
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq
from transformers.image_utils import load_image


class IDEFICS(BaseModel):

    INSTALL_REQ = False
    INTERLEAVE = True

    def __init__(self, model_path='HuggingFaceM4/idefics-9b-instruct', **kwargs):
        assert osp.exists(model_path) or splitlen(model_path) == 2
        from transformers import IdeficsForVisionText2Text, AutoProcessor
        self.model = IdeficsForVisionText2Text.from_pretrained(
            model_path, torch_dtype=torch.bfloat16, device_map='auto')
        self.processor = AutoProcessor.from_pretrained(model_path)
        kwargs_default = {'max_new_tokens': 512}
        kwargs_default.update(kwargs)
        self.kwargs = kwargs_default
        self.file_root = osp.dirname(__file__)
        warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')

    def generate_inner(self, message, dataset=None):
        prompts = (
            ['Users:']
            + [msg['value'] if msg['type'] == 'text' else Image.open(msg['value']) for msg in message]
            + ['<end_of_utterance>', '\nAssistant: ']
        )
        inputs = self.processor(
            prompts, add_end_of_utterance_token=False, return_tensors='pt').to('cuda')
        exit_condition = self.processor.tokenizer(
            '<end_of_utterance>', add_special_tokens=False).input_ids
        bad_words_ids = self.processor.tokenizer(
            ['<image>', '<fake_token_around_image>'], add_special_tokens=False).input_ids

        generated_ids = self.model.generate(
            **inputs, eos_token_id=exit_condition, bad_words_ids=bad_words_ids, **self.kwargs,
        )
        generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True)
        text = generated_text[0].split('\nAssistant: ')[-1]
        return text


class IDEFICS2(BaseModel):

    INSTALL_REQ = True
    INTERLEAVE = True

    def __init__(self, model_path='HuggingFaceM4/idefics2-8b', **kwargs):
        assert model_path is not None
        self.model_path = model_path
        if 'Idefics3' in self.model_path.lower():
            warnings.warn('Install transfomers from source: PR https://github.com/open-compass/VLMEvalKit/pull/379')
            warnings.warn('Reference: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3')
        self.processor = AutoProcessor.from_pretrained(model_path)
        model = AutoModelForVision2Seq.from_pretrained(
            model_path,
            torch_dtype=torch.bfloat16,
            _attn_implementation='flash_attention_2',
            device_map='cpu')
        self.model = model.to('cuda')
        kwargs_default = {'max_new_tokens': 1024}
        kwargs_default.update(kwargs)
        self.kwargs = kwargs_default
        warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
        torch.cuda.empty_cache()

    def _process(self, formatted_messages, formatted_images):
        inputs = self.processor(
            text=formatted_messages, images=formatted_images, return_tensors='pt')
        inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
        return inputs

    def build_prompt_default(self, message, add_brief=False, add_yes_or_no=False, change_the_img_place=False):
        if change_the_img_place:
            new_message = []
            for s in message:
                if s['type'] == 'image':
                    new_message.append(s)
            for s in message:
                if s['type'] == 'text':
                    new_message.append(s)
            message = new_message
        prompt, images = 'User:', []
        for msg in message:
            if msg['type'] == 'image':
                img = load_image(msg['value'])
                images.append(img)
                prompt += '<image>'
            elif msg['type'] == 'text':
                prompt += msg['value'].strip()
        if add_brief:
            prompt += '\nGive a very brief answer.'
        if add_yes_or_no:
            prompt += '\nAnswer yes or no.'
        prompt += '<end_of_utterance>\nAssistant:'
        return prompt, images

    def build_prompt_puremcq(self, message):
        replace_mapping = {
            '\nOptions:': '\nChoices:',
            'Please select the correct answer from the options above.': 'Answer with the letter.',
        }

        prompt, images = 'User:', []
        for msg in message:
            if msg['type'] == 'image':
                img = load_image(msg['value'])
                images.append(img)
                prompt += '<image>'
            elif msg['type'] == 'text':
                instruction = msg['value'].strip()
                for k, v in replace_mapping.items():
                    instruction = instruction.replace(k, v)
                prompt += instruction
        prompt += '<end_of_utterance>\nAssistant: Answer:'
        return prompt, images

    def build_prompt_mt(self, message):
        prompt, images = '', []
        for msg in message:
            if msg['role'] == 'user':
                prompt += 'User: '
            elif msg['role'] == 'assistant':
                prompt += 'Assistant: '
            for item in msg['content']:
                if item['type'] == 'image':
                    img = load_image(item['value'])
                    images.append(img)
                    prompt += '<image>'
                elif item['type'] == 'text':
                    prompt += item['value'].strip()
            prompt += '<end_of_utterance>\n'
        return prompt + 'Assistant: '

    def build_prompt_mmbench(self, message):
        replace_mapping = {
            '\nOptions:': '\nChoices:',
            'Please select the correct answer from the options above.': 'Answer with a letter.',
        }

        prompt, images = 'User:', []
        for msg in message:
            if msg['type'] == 'image':
                img = load_image(msg['value'])
                images.append(img)
                prompt += '<image>'
            elif msg['type'] == 'text':
                instruction = msg['value'].strip()
                for k, v in replace_mapping.items():
                    instruction = instruction.replace(k, v)
                # Swap hint and question
                if instruction.startswith('Hint:'):
                    hint, question = instruction.split('\nQuestion:')
                    question, choices = question.split('\nChoices:')
                    instruction = (
                        'Question:' + question + '\n' + hint + '\nChoices:' + choices
                    )
                prompt += instruction
        prompt += '<end_of_utterance>\nAssistant: Answer:'
        return prompt, images

    def build_prompt_mmmu(self, message):
        replace_mapping = {
            'Question:': '',
            'Please select the correct answer from the options above.': 'Answer with the letter.',
            '\nOptions:': '\nChoices:',
        }

        prompt, images, img_counter = 'User: Question: ', [], 1
        for msg in message:
            if msg['type'] == 'image':
                prompt += f'<image {img_counter}>:<image>\n'
                img_counter += 1
        img_counter = 1

        for msg in message:
            if msg['type'] == 'image':
                img = load_image(msg['value'])
                images.append(img)
                prompt += f' <image {img_counter}> '
                img_counter += 1
            elif msg['type'] == 'text':
                instruction = msg['value'].strip()
                for k, v in replace_mapping.items():
                    instruction = instruction.replace(k, v)
                prompt += instruction.strip()
        prompt += '<end_of_utterance>\nAssistant:'
        if 'A.' in prompt and 'B.' in prompt:
            prompt += ' Answer:'
        return prompt, images

    def build_prompt_mathvista(self, message):
        replace_mapping = {
            '(A) ': 'A. ',
            '(B) ': 'B. ',
            '(C) ': 'C. ',
            '(D) ': 'D. ',
            '(E) ': 'E. ',
            '(F) ': 'F. ',
            '(G) ': 'G. ',
            '(H) ': 'H. ',
            '\nOptions:': '\nChoices:',
            'Hint: ': '',
        }

        prompt, images = 'User:', []
        for msg in message:
            if msg['type'] == 'image':
                img = load_image(msg['value'])
                images.append(img)
                prompt += '<image>'
            elif msg['type'] == 'text':
                instruction = msg['value'].strip()
                for k, v in replace_mapping.items():
                    instruction = instruction.replace(k, v)
                prompt += instruction.strip()
        if 'A.' in prompt and 'B.' in prompt:
            prompt += '\nAnswer with the letter.'
        prompt += '<end_of_utterance>\nAssistant:'
        if 'A.' in prompt and 'B.' in prompt:
            prompt += ' Answer:'
        return prompt, images

    def chat_inner(self, message, dataset=None):
        formatted_messages, formatted_images = self.build_prompt_mt(message)
        inputs = self._process(formatted_messages, formatted_images)
        generated_ids = self.model.generate(**inputs, **self.kwargs)
        generated_text = self.processor.batch_decode(
            generated_ids[:, inputs['input_ids'].size(1):], skip_special_tokens=True)[0]
        response = generated_text.strip()
        # print(dataset, " | ", formatted_messages.replace("\n", "\\n"), " | ", response.replace("\n", "\\n"))
        return response

    def generate_inner(self, message, dataset=None):
        if dataset in [
            'MMBench_DEV_EN', 'MMBench_DEV_EN_V11', 'MMBench_TEST_EN', 'MMBench_TEST_EN_V11',
            'MMBench_DEV_CN', 'MMBench_DEV_CN_V11', 'MMBench_TEST_CN', 'MMBench_TEST_CN_V11',
            'MMBench', 'MMBench_V11', 'MMBench_CN', 'MMBench_CN_V11'
        ]:
            formatted_messages, formatted_images = self.build_prompt_mmbench(message)
        elif dataset in ['MMMU_DEV_VAL', 'MMMU_TEST']:
            formatted_messages, formatted_images = self.build_prompt_mmmu(message)
        elif dataset in ['MathVista_MINI']:
            formatted_messages, formatted_images = self.build_prompt_mathvista(message)
        elif dataset in [
            'MME', 'MMVet', 'OCRVQA_TEST', 'OCRVQA_TESTCORE', 'TextVQA_VAL',
            'ChartQA_TEST', 'DocVQA_VAL', 'DocVQA_TEST', 'InfoVQA_VAL', 'InfoVQA_TEST',
        ]:
            formatted_messages, formatted_images = self.build_prompt_default(message, add_brief=True)
        elif dataset == 'HallusionBench':
            formatted_messages, formatted_images = self.build_prompt_default(message, add_yes_or_no=True)
        elif dataset in [
            'MMStar', 'SEEDBench_IMG', 'AI2D_TEST', 'ScienceQA_VAL', 'ScienceQA_TEST',
        ]:
            formatted_messages, formatted_images = self.build_prompt_puremcq(message)
        elif listinstr(['MLVU', 'TempCompass', 'MVBench'], dataset):
            formatted_messages, formatted_images = self.build_prompt_default(message, change_the_img_place=True)
        else:
            formatted_messages, formatted_images = self.build_prompt_default(message)

        inputs = self._process(formatted_messages, formatted_images)
        generated_ids = self.model.generate(**inputs, **self.kwargs)
        generated_text = self.processor.batch_decode(
            generated_ids[:, inputs['input_ids'].size(1):], skip_special_tokens=True)[0]
        response = generated_text.strip()
        # print(dataset, " | ", formatted_messages.replace("\n", "\\n"), " | ", response.replace("\n", "\\n"))
        return response
VLMEvalKit/vlmeval/vlm/instructblip.py (new file, mode 100644)

import torch
from PIL import Image
import os.path as osp
import sys
from .base import BaseModel
from ..smp import *


class InstructBLIP(BaseModel):

    INSTALL_REQ = True
    INTERLEAVE = False

    def __init__(self, name):
        self.config_map = {
            'instructblip_7b': 'misc/blip2_instruct_vicuna7b.yaml',
            'instructblip_13b': 'misc/blip2_instruct_vicuna13b.yaml',
        }

        self.file_path = __file__
        config_root = osp.dirname(self.file_path)

        try:
            from lavis.models import load_preprocess
            from omegaconf import OmegaConf
            from lavis.common.registry import registry
        except Exception as e:
            logging.critical('Please install lavis before using InstructBLIP. ')
            raise e

        assert name in self.config_map
        cfg_path = osp.join(config_root, self.config_map[name])
        cfg = OmegaConf.load(cfg_path)

        model_cfg = cfg.model
        assert osp.exists(model_cfg.llm_model) or splitlen(model_cfg.llm_model) == 2
        model_cls = registry.get_model_class(name='blip2_vicuna_instruct')
        model = model_cls.from_config(model_cfg)
        model.eval()

        self.device = torch.device('cuda') if torch.cuda.is_available() else 'cpu'
        device = self.device
        model.to(device)
        self.model = model
        self.kwargs = {'max_length': 512}

        preprocess_cfg = cfg.preprocess
        vis_processors, _ = load_preprocess(preprocess_cfg)
        self.vis_processors = vis_processors

    def generate_inner(self, message, dataset=None):
        prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
        vis_processors = self.vis_processors
        raw_image = Image.open(image_path).convert('RGB')
        image_tensor = vis_processors['eval'](raw_image).unsqueeze(0).to(self.device)
        outputs = self.model.generate(dict(image=image_tensor, prompt=prompt))
        return outputs[0]
VLMEvalKit/vlmeval/vlm/internvl/__init__.py (new file, mode 100644)

from .internvl_chat import InternVLChat

__all__ = ['InternVLChat']
VLMEvalKit/vlmeval/vlm/internvl/internvl_chat.py (new file, mode 100644)

import math
import pandas as pd
import random
import re
import string
import torch
import torch.distributed as dist
import torchvision.transforms as T
import transformers
import warnings
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoTokenizer, AutoConfig, AutoModel, CLIPImageProcessor

from .utils import (build_multi_choice_prompt,
                    build_video_prompt,
                    build_mpo_prompt,
                    build_mcq_cot_prompt,
                    build_qa_cot_prompt,
                    mpo_post_processing,
                    reorganize_prompt,
                    split_model,
                    load_image)
from .utils import mpo_prompt_with_final_answer, mpo_prompt_without_final_answer
from ..base import BaseModel
from ...dataset import DATASET_TYPE, DATASET_MODALITY
from ...smp import *


class InternVLChat(BaseModel):

    INSTALL_REQ = False
    INTERLEAVE = True

    def __init__(self,
                 model_path='OpenGVLab/InternVL-Chat-V1-5',
                 load_in_8bit=False,
                 use_mpo_prompt=False,
                 version='V1.0',
                 **kwargs):
        assert model_path is not None
        assert version_cmp(transformers.__version__, '4.37.2', 'ge')

        self.use_mpo_prompt = use_mpo_prompt
        self.model_path = model_path
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=False)

        # Regular expression to match the pattern 'Image' followed by a number, e.g. Image1
        self.pattern = r'Image(\d+)'
        # Replacement pattern to insert a hyphen between 'Image' and the number, e.g. Image-1
        self.replacement = r'Image-\1'

        # Convert InternVL2 response to dataset format
        # e.g. Image1 -> Image-1
        # Regular expression to match the pattern 'Image-' followed by a number
        self.reverse_pattern = r'Image-(\d+)'
        # Replacement pattern to remove the hyphen (Image-1 -> Image1)
        self.reverse_replacement = r'Image\1'

        if auto_split_flag():
            device_map, visible_devices = split_model(model_path=model_path)
            self.device = visible_devices[0]
            self.model = AutoModel.from_pretrained(
                model_path,
                torch_dtype=torch.bfloat16,
                load_in_8bit=load_in_8bit,
                trust_remote_code=True,
                low_cpu_mem_usage=True,
                device_map=device_map).eval()
        else:
            self.model = AutoModel.from_pretrained(
                model_path,
                torch_dtype=torch.bfloat16,
                load_in_8bit=load_in_8bit,
                trust_remote_code=True,
                low_cpu_mem_usage=True).eval().cuda()
            self.device = 'cuda'

        self.image_size = self.model.config.vision_config.image_size
        self.version = version
        kwargs_default = dict(do_sample=False, max_new_tokens=4096, top_p=None)
        kwargs_default.update(kwargs)
        self.kwargs = kwargs_default
        warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')

    def use_custom_prompt(self, dataset):
        assert dataset is not None
        if listinstr(['MMDU', 'MME-RealWorld', 'MME-RealWorld-CN'], dataset):
            # For Multi-Turn we don't have custom prompt
            return False
        if DATASET_MODALITY(dataset) == 'VIDEO':
            # For Video benchmarks we don't have custom prompt at here
            return False
        else:
            return True

    def build_prompt(self, line, dataset=None):
        assert self.use_custom_prompt(dataset)
        assert dataset is None or isinstance(dataset, str)
        tgt_path = self.dump_image(line, dataset)

        if dataset is not None and DATASET_TYPE(dataset) == 'Y/N':
            question = line['question']
            if listinstr(['MME'], dataset):
                prompt = question + ' Answer the question using a single word or phrase.'
            elif listinstr(['HallusionBench', 'AMBER'], dataset):
                prompt = question + ' Please answer yes or no. Answer the question using a single word or phrase.'
            else:
                prompt = question
        elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ':
            prompt = build_multi_choice_prompt(line, dataset)
            if os.getenv('USE_COT') == '1':
                prompt = build_mcq_cot_prompt(line, prompt)
        elif dataset is not None and DATASET_TYPE(dataset) == 'VQA':
            question = line['question']
            if listinstr(['LLaVABench', 'WildVision'], dataset):
                prompt = question + '\nAnswer this question in detail.'
            elif listinstr(['OCRVQA', 'TextVQA', 'ChartQA', 'DocVQA', 'InfoVQA', 'OCRBench',
                            'DUDE', 'SLIDEVQA', 'GQA', 'MMLongBench_DOC'], dataset):
                prompt = question + '\nAnswer the question using a single word or phrase.'
            elif listinstr(['MathVista', 'MathVision', 'VCR', 'MTVQA', 'MMVet', 'MathVerse',
                            'MMDU', 'CRPE', 'MIA-Bench', 'MM-Math', 'DynaMath', 'QSpatial'], dataset):
                prompt = question
                if os.getenv('USE_COT') == '1':
                    prompt = build_qa_cot_prompt(line, prompt)
            else:
                prompt = question + '\nAnswer the question using a single word or phrase.'
        else:
            # VQA_ex_prompt: OlympiadBench, VizWiz
            prompt = line['question']
            if os.getenv('USE_COT') == '1':
                prompt = build_qa_cot_prompt(line, prompt)

        message = [dict(type='text', value=prompt)]
        message.extend([dict(type='image', value=s) for s in tgt_path])

        if self.use_mpo_prompt:
            message = build_mpo_prompt(message, line, dataset)
        return message

    def set_max_num(self, dataset):
        # The total limit on the number of images processed, set to avoid Out-of-Memory issues.
        self.total_max_num = 64
        if dataset is None:
            self.max_num = 6
            return None
        res_12_datasets = ['ChartQA_TEST', 'MMMU_DEV_VAL', 'MMMU_TEST', 'MME-RealWorld',
                           'VCR_EN', 'VCR_ZH', 'OCRVQA']
        res_18_datasets = ['DocVQA_VAL', 'DocVQA_TEST', 'DUDE', 'MMLongBench_DOC', 'SLIDEVQA']
        res_24_datasets = ['InfoVQA_VAL', 'InfoVQA_TEST', 'OCRBench', 'HRBench4K', 'HRBench8K']
        if DATASET_MODALITY(dataset) == 'VIDEO':
            self.max_num = 1
        elif listinstr(res_12_datasets, dataset):
            self.max_num = 12
        elif listinstr(res_18_datasets, dataset):
            self.max_num = 18
        elif listinstr(res_24_datasets, dataset):
            self.max_num = 24
        else:
            self.max_num = 6

    def generate_v1_2(self, message, dataset=None):
        self.INTERLEAVE = False
        prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
        image = Image.open(image_path).convert('RGB')
        image = image.resize((self.image_size, self.image_size))
        image_processor = CLIPImageProcessor.from_pretrained(self.model_path)
        pixel_values = image_processor(images=image, return_tensors='pt').pixel_values
        pixel_values = pixel_values.to(torch.bfloat16).to(self.device)
        with torch.no_grad():
            response = self.model.chat(
                self.tokenizer,
                pixel_values=pixel_values,
                question=prompt,
                generation_config=self.kwargs)
        return response

    def generate_v1_5(self, message, dataset=None):
        image_num = len([x for x in message if x['type'] == 'image'])
        max_num = max(1, min(self.max_num, self.total_max_num // image_num))
        prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])

        if DATASET_MODALITY(dataset) == 'VIDEO':
            prompt = build_video_prompt(prompt, dataset)

        if image_num > 1:
            image_path = [x['value'] for x in message if x['type'] == 'image']
            pixel_values_list = []
            for file_name in image_path:
                pixel_values_list.append(load_image(file_name, max_num=max_num).to(self.device).to(torch.bfloat16))
            pixel_values = torch.cat(pixel_values_list, dim=0)
        elif image_num == 1:
            image_path = [x['value'] for x in message if x['type'] == 'image'][0]
            pixel_values = load_image(image_path, max_num=max_num).to(self.device).to(torch.bfloat16)
        else:
            pixel_values = None
        with torch.no_grad():
            response = self.model.chat(
                self.tokenizer,
                pixel_values=pixel_values,
                question=prompt,
                generation_config=self.kwargs,
                verbose=True)
        return response

    def generate_v2(self, message, dataset=None):
        image_num = len([x for x in message if x['type'] == 'image'])
        max_num = max(1, min(self.max_num, self.total_max_num // image_num))
        prompt = reorganize_prompt(message, image_num, dataset=dataset)

        if dataset is not None and DATASET_MODALITY(dataset) == 'VIDEO':
            prompt = build_video_prompt(prompt, dataset)

        if image_num > 1:
            image_path = [x['value'] for x in message if x['type'] == 'image']
            num_patches_list, pixel_values_list = [], []
            for image_idx, file_name in enumerate(image_path):
                upscale_flag = image_idx == 0 and dataset is not None and listinstr(['MMMU'], dataset)
                curr_pixel_values = load_image(
                    file_name, max_num=max_num, upscale=upscale_flag).to(self.device).to(torch.bfloat16)
                num_patches_list.append(curr_pixel_values.size(0))
                pixel_values_list.append(curr_pixel_values)
            pixel_values = torch.cat(pixel_values_list, dim=0)
        elif image_num == 1:
            image_path = [x['value'] for x in message if x['type'] == 'image'][0]
            upscale_flag = dataset is not None and listinstr(['MMMU'], dataset)
            pixel_values = load_image(
                image_path, max_num=max_num, upscale=upscale_flag).to(self.device).to(torch.bfloat16)
            num_patches_list = [pixel_values.size(0)]
        else:
            pixel_values = None
            num_patches_list = []

        with torch.no_grad():
            response = self.model.chat(
                self.tokenizer,
                pixel_values=pixel_values,
                num_patches_list=num_patches_list,
                question=prompt,
                generation_config=self.kwargs,
                verbose=True)

        if self.use_mpo_prompt:
            response = mpo_post_processing(response, dataset)
        return response

    def generate_inner(self, message, dataset=None):
        self.set_max_num(dataset)
        print(f'InternVL model version: {self.version}')
        if self.version in ['V1.1', 'V1.2']:
            return self.generate_v1_2(message, dataset)
        elif self.version == 'V1.5':
            return self.generate_v1_5(message, dataset)
        elif self.version == 'V2.0':
            return self.generate_v2(message, dataset)
        else:
            raise ValueError(f'Unsupported version: {self.version}')

    def build_history(self, message):
        # Global Variables
        image_path = []
        image_cnt = 0

        def concat_tilist(tilist):
            nonlocal image_cnt  # Declare image_cnt as nonlocal to modify it
            prompt = ''
            for item in tilist:
                # Substitute the pattern in the text
                if item['type'] == 'text':
                    prompt += re.sub(self.pattern, self.replacement, item['value'])
                elif item['type'] == 'image':
                    image_cnt += 1
                    prompt += '<image>\n'
                    image_path.append(item['value'])
            return prompt

        # Only previous messages
        assert len(message) % 2 == 0
        history = []
        for i in range(len(message) // 2):
            m1, m2 = message[2 * i], message[2 * i + 1]
            assert m1['role'] == 'user' and m2['role'] == 'assistant'
            history.append((concat_tilist(m1['content']), concat_tilist(m2['content'])))

        return history, image_path, image_cnt

    def chat_inner_v2(self, message, dataset=None):
        if len(message) > 1:
            history, image_path, image_cnt = self.build_history(message[:-1])
        else:
            history, image_path, image_cnt = None, [], 1
        current_msg = message[-1]
        question = ''

        # If message is just text in the conversation
        if len(current_msg['content']) == 1 and current_msg['content'][0]['type'] == 'text':
            question = current_msg['content'][0]['value']
            question = re.sub(self.pattern, self.replacement, question)  # Fix pattern as per InternVL
        else:
            for msg in current_msg['content']:
                if msg['type'] == 'text':
                    question += re.sub(self.pattern, self.replacement, msg['value'])
                elif msg['type'] == 'image':
                    image_cnt += 1
                    question += '<image>\n'
                    image_path.append(msg['value'])

        if image_cnt > 1:
            num_patches_list = []
            pixel_values_list = []
            for image_idx, file_name in enumerate(image_path):
                upscale_flag = image_idx == 0 and dataset is not None and listinstr(['MMMU_DEV_VAL'], dataset)
                curr_pixel_values = load_image(
                    file_name, max_num=self.max_num, upscale=upscale_flag).to(self.device).to(torch.bfloat16)
                num_patches_list.append(curr_pixel_values.size(0))
                pixel_values_list.append(curr_pixel_values)
            pixel_values = torch.cat(pixel_values_list, dim=0)
        elif image_cnt == 1:
            upscale_flag = listinstr(['MMMU_DEV_VAL'], dataset)
            pixel_values = load_image(
                image_path, max_num=self.max_num, upscale=upscale_flag).to(self.device).to(torch.bfloat16)
            num_patches_list = [pixel_values.size(0)]
        else:
            pixel_values = None
            num_patches_list = []

        response, history = self.model.chat(
            self.tokenizer,
            pixel_values=pixel_values,
            num_patches_list=num_patches_list,
            question=question,
            generation_config=self.kwargs,
            history=history,
            return_history=True
        )

        response = re.sub(self.reverse_pattern, self.reverse_replacement, response)

        return response

    def chat_inner(self, message, dataset=None):
        self.set_max_num(dataset)

        if self.version in ['V1.1', 'V1.2']:
            raise ValueError(f'Unsupported version for Multi-Turn: {self.version}')
        elif self.version == 'V1.5':
            raise ValueError(f'Unsupported version for Multi-Turn: {self.version}')
        elif self.version == 'V2.0':
            kwargs_default = dict(do_sample=False, max_new_tokens=512, top_p=None, num_beams=1)
            self.kwargs = kwargs_default
            return self.chat_inner_v2(message, dataset)
        else:
            raise ValueError(f'Unsupported version for Multi-Turn: {self.version}')
VLMEvalKit/vlmeval/vlm/internvl/utils.py (new file, mode 100644)

import math
import pandas as pd
import random
import re
import string
import torch
import torch.distributed as dist
import torchvision.transforms as T
import transformers
import warnings
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoTokenizer, AutoConfig, AutoModel, CLIPImageProcessor

from ..base import BaseModel
from ...dataset import DATASET_TYPE, DATASET_MODALITY
from ...smp import *

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)


def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform


def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    best_ratio_diff = float('inf')
    best_ratio = (1, 1)
    area = width * height
    for ratio in target_ratios:
        target_aspect_ratio = ratio[0] / ratio[1]
        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
        if ratio_diff < best_ratio_diff:
            best_ratio_diff = ratio_diff
            best_ratio = ratio
        elif ratio_diff == best_ratio_diff:
            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                best_ratio = ratio
    return best_ratio


def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnail=False):
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    # calculate the existing image aspect ratio
    target_ratios = set(
        (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1)
        if i * j <= max_num and i * j >= min_num)
    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

    # find the closest aspect ratio to the target
    target_aspect_ratio = find_closest_aspect_ratio(
        aspect_ratio, target_ratios, orig_width, orig_height, image_size)

    # calculate the target width and height
    target_width = image_size * target_aspect_ratio[0]
    target_height = image_size * target_aspect_ratio[1]
    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

    # resize the image
    resized_img = image.resize((target_width, target_height))
    processed_images = []
    for i in range(blocks):
        box = (
            (i % (target_width // image_size)) * image_size,
            (i // (target_width // image_size)) * image_size,
            ((i % (target_width // image_size)) + 1) * image_size,
            ((i // (target_width // image_size)) + 1) * image_size
        )
        # split the image
        split_img = resized_img.crop(box)
        processed_images.append(split_img)
    assert len(processed_images) == blocks
    if use_thumbnail and len(processed_images) != 1:
        thumbnail_img = image.resize((image_size, image_size))
        processed_images.append(thumbnail_img)
    return processed_images


def load_image(image_file, input_size=448, max_num=6, upscale=False):
    image = Image.open(image_file).convert('RGB')
    if upscale:
        image = image.resize((image.width * 2, image.height * 2), Image.BILINEAR)
    transform = build_transform(input_size=input_size)
    images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    pixel_values = [transform(image) for image in images]
    pixel_values = torch.stack(pixel_values)
    return pixel_values


def get_local_rank_and_local_world_size():
    if not dist.is_available():
        return 0, 1
    if not dist.is_initialized():
        return 0, 1

    if 'SLURM_LOCALID' in os.environ:
        local_rank = int(os.environ['SLURM_LOCALID'])
        local_world_size = int(os.environ['SLURM_NTASKS_PER_NODE'])
        return local_rank, local_world_size

    if 'LOCAL_RANK' in os.environ and 'LOCAL_WORLD_SIZE' in os.environ:
        return int(os.environ['LOCAL_RANK']), int(os.environ['LOCAL_WORLD_SIZE'])

    raise NotImplementedError(
        "Fail to get local_rank and local_world_size! "
        "Please ensure that you set the environment variable "
        "`LOCAL_RANK` and `LOCAL_WORLD_SIZE`"
    )


def split_model(model_path):
    num_gpus_per_node = 8
    rank, world_size = get_rank_and_world_size()
    try:
        local_rank, local_world_size = get_local_rank_and_local_world_size()
    except:
        local_rank = rank

    if 'GPUS_PER_PROCESS' in os.environ:
        gpus_per_process = int(os.environ['GPUS_PER_PROCESS'])
    else:
        gpus_per_process = 8  # default to use 8 GPUs for one model

    start_gpu = local_rank * gpus_per_process
    end_gpu = start_gpu + gpus_per_process

    assert end_gpu <= num_gpus_per_node, f"Process {local_rank} tries to access GPU {end_gpu}, " \
        f"but only {num_gpus_per_node} GPUs are available per node."

    visible_devices = list(range(start_gpu, end_gpu))

    device_map = {}
    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)

    num_gpus_for_vit = 0.5
    num_layers = config.llm_config.num_hidden_layers
    num_layers_per_gpu = math.ceil(num_layers / (len(visible_devices) - num_gpus_for_vit))
    num_layers_per_gpu = [num_layers_per_gpu] * len(visible_devices)
    num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)

    layer_cnt = 0
    for i, num_layer in enumerate(num_layers_per_gpu):
        for j in range(num_layer):
            device_map[f'language_model.model.layers.{layer_cnt}'] = visible_devices[i]
            layer_cnt += 1

    device_map['vision_model'] = visible_devices[0]
    device_map['mlp1'] = visible_devices[0]
    device_map['language_model.model.tok_embeddings'] = visible_devices[0]
    device_map['language_model.model.embed_tokens'] = visible_devices[0]
    device_map['language_model.output'] = visible_devices[0]
    device_map['language_model.model.norm'] = visible_devices[0]
    device_map['language_model.lm_head'] = visible_devices[0]
    device_map[f'language_model.model.layers.{num_layers - 1}'] = visible_devices[0]

    return device_map, visible_devices


def split_model_old(model_name):
    import math
    device_map = {}
    num_gpus = torch.cuda.device_count()
    rank, world_size = get_rank_and_world_size()
    num_gpus = num_gpus // world_size

    num_layers_map = {
        'InternVL2-8B': 32,
        'InternVL2-26B': 48,
        'InternVL2-40B': 60,
        'InternVL2-Llama3-76B': 80
    }

    if model_name not in num_layers_map:
        return 'cuda'
    num_layers = num_layers_map[model_name]
    # Since the first GPU will be used for ViT, treat it as 0.5 GPU.
    num_layers_per_gpu = math.ceil(num_layers / (num_gpus - 0.5))
    num_layers_per_gpu = [num_layers_per_gpu] * num_gpus
    num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
    layer_cnt = 0
    for i, num_layer in enumerate(num_layers_per_gpu):
        for j in range(num_layer):
            device_map[f'language_model.model.layers.{layer_cnt}'] = rank + world_size * i
            layer_cnt += 1
    device_map['vision_model'] = rank
    device_map['mlp1'] = rank
    device_map['language_model.model.tok_embeddings'] = rank
    device_map['language_model.model.embed_tokens'] = rank
    device_map['language_model.output'] = rank
    device_map['language_model.model.norm'] = rank
    device_map['language_model.lm_head'] = rank
    device_map['language_model.model.rotary_emb'] = rank
    device_map[f'language_model.model.layers.{num_layers - 1}'] = rank
    return device_map


def build_mcq_cot_prompt(line, prompt):
    cot_prompt = (
        "Answer the preceding multiple choice question. The last line of your response should follow "
        "this format: 'Answer: \\boxed{$LETTER}' (without quotes), where LETTER is one of the options. "
        "If you are uncertain or the problem is too complex, make a reasoned guess based on the "
        "information provided. Avoid repeating steps indefinitely—provide your best guess even if "
        "unsure. Think step by step logically, considering all relevant information before answering."
    )
    prompt = prompt.replace("Answer with the option's letter from the given choices directly.", '').strip()
    prompt = prompt + '\n' + cot_prompt

    return prompt


def build_qa_cot_prompt(line, prompt):
    cot_prompt = (
        "Answer the preceding question. The last line of your response should follow this format: "
        "'Answer: \\boxed{$FINAL_ANSWER}' (without quotes), where 'FINAL_ANSWER' is your conclusion "
        "based on the reasoning provided. If you are uncertain or the problem is too complex, make "
        "a reasoned guess based on the information provided. Avoid repeating steps indefinitely—"
        "provide your best guess even if unsure. Think step by step logically, considering all "
        "relevant information before answering."
    )
    prompt = prompt + '\n' + cot_prompt

    return prompt


def build_multi_choice_prompt(line, dataset=None):
    question = line['question']
    hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
    if hint is not None:
        question = hint + '\n' + question

    options = {
        cand: line[cand]
        for cand in string.ascii_uppercase
        if cand in line and not pd.isna(line[cand])
    }
    for key, item in options.items():
        question += f'\n{key}. {item}'
    prompt = question

    if len(options):
        prompt += '\n请直接回答选项字母。' if cn_string(prompt) else \
            "\nAnswer with the option's letter from the given choices directly."
    else:
        prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.'

    return prompt


def build_video_prompt(prompt, dataset=None, max_frames=64):
    for start in range(0, max_frames, 8):
        images_to_remove = ''.join([f'<Image-{i}>' for i in range(start + 1, start + 9)])
        prompt = prompt.replace(images_to_remove, '')
    for i in range(max_frames):
        prompt = prompt.replace(f'Image-{i + 1}', f'Frame-{i + 1}')
    if listinstr(['MMBench-Video'], dataset):
        prompt = prompt.replace('\nAnswer:', '')
    elif listinstr(['Video-MME'], dataset):
        prompt = prompt.replace('\nAnswer:', '')
        prompt += "\nAnswer with the option's letter from the given choices directly."
    elif listinstr(['MVBench'], dataset):
        prompt = prompt.replace('Best option:(', '')

    return prompt


def reorganize_prompt(message, image_num, dataset=None):
    if dataset is not None and listinstr(['MUIRBench'], dataset):
        prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
        images_to_remove = ' '.join(['<image>'] * image_num)
        prompt = prompt.replace(images_to_remove, '')
        for i in range(image_num):
            prompt = prompt.replace('<image>', f'<Image-{i + 1}>', 1)
        prompt = ''.join([f'Image-{i + 1}: <image>\n' for i in range(image_num)]) + prompt
    elif image_num == 1:
        prompt = '<image>\n' + '\n'.join([x['value'] for x in message if x['type'] == 'text'])
    else:
        prompt, image_idx = '', 1
        for x in message:
            if x['type'] == 'text':
                prompt += x['value']
            elif x['type'] == 'image':
                prompt += f'<Image-{image_idx}>'
                image_idx += 1
        prompt = ''.join([f'Image-{i + 1}: <image>\n' for i in range(image_num)]) + prompt
        images_to_remove = ''.join([f'<Image-{i + 1}>' for i in range(image_num)])
        prompt = prompt.replace(images_to_remove, '')
    return prompt


mpo_prompt_with_final_answer = (
    "Your task is to answer the question below. "
    "Give step by step reasoning before you answer, and when you're ready to answer, "
    "please use the format \"Final answer: ..\""
    "\n\n"
    "Question:"
    "\n\n"
    "{question}"
)

mpo_prompt_without_final_answer = (
    "Your task is to answer the question below. "
    "Give step by step reasoning. "
    "\n\n"
    "Question:"
    "\n\n"
    "{question}"
)


def mpo_post_processing(response, dataset):
    def extract_answer(text):
        match = re.search(r'(Final answer:|Answer:)\s*(.*)', text, re.IGNORECASE)
        if match:
            return match.group(2).strip()
        return text

    if dataset is not None and (DATASET_TYPE(dataset) in ['Y/N', 'MCQ'] or listinstr(['CRPE'], dataset)):
        response = extract_answer(response).strip()
    return response


def build_mpo_prompt(message, line, dataset):
    if not listinstr(['LLaVABench'], dataset):
        if listinstr(['MMVet'], dataset):
            cot_prompt = mpo_prompt_without_final_answer
        else:
            cot_prompt = mpo_prompt_with_final_answer

        question_orig = line['question']

        if listinstr(['MathVerse', 'MathVision'], dataset):
            question_orig = question_orig.split('Question:', 1)[-1].strip()
            question_orig = question_orig.replace('Choices:\n', '').strip()

        prompt = cot_prompt.format(question=question_orig)
    else:
        prompt = line['question']
    message[0]['value'] = prompt
    return message
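To make the tiling behaviour of the helpers above concrete, here is a minimal sketch (not part of the diff) that runs dynamic_preprocess and load_image on a synthetic image. It assumes VLMEvalKit is installed so the module added in this commit is importable; the temporary file path is only for illustration.

# Illustrative sketch only, not part of this commit.
from PIL import Image
from vlmeval.vlm.internvl.utils import dynamic_preprocess, load_image

# A 1000x700 RGB image has aspect ratio ~1.43, so with max_num=6 the closest tiling
# grid is 3x2 tiles of 448x448; use_thumbnail=True appends one extra thumbnail tile.
img = Image.new('RGB', (1000, 700), color=(127, 127, 127))
tiles = dynamic_preprocess(img, max_num=6, image_size=448, use_thumbnail=True)
print(len(tiles))              # 7 (3x2 grid + thumbnail)

img.save('/tmp/demo.jpg')      # temporary path, for illustration only
pixel_values = load_image('/tmp/demo.jpg', max_num=6)
print(pixel_values.shape)      # torch.Size([7, 3, 448, 448])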
VLMEvalKit/vlmeval/vlm/janus.py (new file, mode 100644)

import sys
import torch
from transformers import AutoModelForCausalLM, AutoConfig
import warnings
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE


class Janus(BaseModel):
    INSTALL_REQ = True
    INTERLEAVE = True

    def check_install(self):
        try:
            import janus
        except Exception as e:
            logging.critical(
                'Please first install janus from source codes in: https://github.com/deepseek-ai/Janus')
            raise e

    def __init__(self, model_path='deepseek-ai/Janus-1.3B', **kwargs):
        self.check_install()
        assert model_path is not None
        self.model_path = model_path
        from janus.models import VLChatProcessor

        self.vl_chat_processor = VLChatProcessor.from_pretrained(model_path)
        self.tokenizer = self.vl_chat_processor.tokenizer

        model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
        self.model = model.to(torch.bfloat16).cuda().eval()

        torch.cuda.empty_cache()
        default_kwargs = dict(
            max_new_tokens=512,
            do_sample=False,
            use_cache=True,
            output_logits=False,
            output_scores=False,
            return_dict_in_generate=False)

        default_kwargs.update(kwargs)
        self.kwargs = default_kwargs
        warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')

    def prepare_inputs(self, message):
        def prepare_itlist(msgs):
            content, images = '', []
            for s in msgs:
                if s['type'] == 'image':
                    images.append(s['value'])
                    content += '<image_placeholder>'
                elif s['type'] == 'text':
                    content += s['value']
            return content, images

        conversation = []
        if 'role' not in message[0]:
            content, images = prepare_itlist(message)
            conversation.append(dict(role='User', content=content, images=images))
        else:
            role_map = {'user': 'User', 'assistant': 'Assistant'}
            for msgs in message:
                role = role_map[msgs['role']]
                content, images = prepare_itlist(msgs['content'])
                conversation.append(dict(role=role, content=content, images=images))
        conversation.append(dict(role='Assistant', content=''))
        return conversation

    def generate_inner(self, message, dataset=None):
        if dataset is None or not ('MMVet' in dataset):
            self.vl_chat_processor.system_prompt = ""
        else:
            self.vl_chat_processor.system_prompt = "You are a helpful assistant. Please answer truthfully and write out your thinking step by step to be sure you get the right answer."  # noqa: E501

        conversation = self.prepare_inputs(message)
        from janus.utils.io import load_pil_images
        pil_images = load_pil_images(conversation)
        prepare_inputs = self.vl_chat_processor(
            conversations=conversation, images=pil_images, force_batchify=True)
        prepare_inputs = prepare_inputs.to(self.model.device, dtype=torch.bfloat16)
        inputs_embeds = self.model.prepare_inputs_embeds(**prepare_inputs)

        outputs = self.model.language_model.generate(
            inputs_embeds=inputs_embeds,
            attention_mask=prepare_inputs.attention_mask,
            pad_token_id=self.tokenizer.eos_token_id,
            bos_token_id=self.tokenizer.bos_token_id,
            eos_token_id=self.tokenizer.eos_token_id,
            **self.kwargs)

        answer = self.tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
        return answer

    def chat_inner(self, message, dataset=None):
        return self.generate_inner(message, dataset=dataset)

    def use_custom_prompt(self, dataset):
        assert dataset is not None
        if DATASET_TYPE(dataset) == 'Y/N' or DATASET_TYPE(dataset) == 'MCQ' or dataset == 'MMVet':
            return True
        return False

    def build_prompt(self, line, dataset=None):
        assert dataset is None or isinstance(dataset, str)
        assert self.use_custom_prompt(dataset)
        tgt_path = self.dump_image(line, dataset)
        question = line['question']

        if DATASET_TYPE(dataset) == 'Y/N':
            if dataset == 'POPE':
                question = question.replace(" Please answer yes or no.", "")
            prompt = '\n' + question + "\nAnswer the question using a single word or phrase."
        elif DATASET_TYPE(dataset) == 'MCQ':
            options = {
                cand: line[cand]
                for cand in string.ascii_uppercase
                if cand in line and not pd.isna(line[cand])
            }
            options_prompt = ''
            for key, item in options.items():
                options_prompt += f'{key}. {item}\n'

            hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
            prompt = f'\nHint: {hint}\n' if hint is not None else '\n'
            prompt += f'{question}\n'
            prompt += (
                f"{options_prompt}\nAnswer with the option's letter from the given choices directly."
                if len(options) else 'Answer the question directly. '
            )
        elif dataset == 'MMVet':
            prompt = '\n' + question
        else:
            raise NotImplementedError

        message = [dict(type='image', value=s) for s in tgt_path]
        message.extend([dict(type='text', value=prompt)])
        return message
VLMEvalKit/vlmeval/vlm/kosmos.py (new file, mode 100644)

import torch
import re
from PIL import Image
from abc import abstractproperty
import sys
import os.path as osp
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE
import copy


class Kosmos2(BaseModel):

    INSTALL_REQ = True
    INTERLEAVE = True

    def __init__(self, model_path='microsoft/kosmos-2-patch14-224', **kwargs):
        try:
            from transformers import AutoProcessor, Kosmos2ForConditionalGeneration
        except Exception as e:
            logging.critical(
                "Please install Transformers version 4.45.1 by running: pip install transformers==4.45.1")
            raise e

        assert osp.exists(model_path) or splitlen(model_path) == 2
        self.model = (
            Kosmos2ForConditionalGeneration.from_pretrained(model_path, torch_dtype=torch.float16)
            .to(torch.device('cuda'))
        )
        self.processor = AutoProcessor.from_pretrained(model_path)
        default_kwargs = dict(max_new_tokens=512, use_cache=True)

        default_kwargs.update(kwargs)
        self.kwargs = default_kwargs
        warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
        torch.cuda.empty_cache()

    def generate_inner(self, message, dataset=None):
        TASK_TOKEN = '<grounding> '
        QEUSTION_TOKEN = 'Question: '
        ANSWER_TOKEN = 'Answer: '
        images = []
        prompt = ''
        prompt += TASK_TOKEN
        for s in message:
            if s['type'] == 'image':
                images.append(s['value'])
            elif s['type'] == 'text':
                prompt += QEUSTION_TOKEN
                prompt += s['value']
        prompt += ANSWER_TOKEN
        images = [Image.open(s) for s in images]

        inputs = self.processor(text=prompt, images=images[0], return_tensors='pt').to(torch.device('cuda'))
        generated_ids = self.model.generate(
            pixel_values=inputs['pixel_values'],
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            image_embeds=None,
            image_embeds_position_mask=inputs['image_embeds_position_mask'],
            **self.kwargs
        )
        generated_text = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        processed_text = self.processor.post_process_generation(generated_text, cleanup_and_extract=True)[0]
        cleaned_answer = re.sub(r'(Question:.*?Answer:|Question:.*)', '', processed_text).strip()
        return cleaned_answer

    def use_custom_prompt(self, dataset):
        assert dataset is not None
        if listinstr(['MMMU'], dataset):
            return False
        if DATASET_TYPE(dataset) == 'MCQ' or dataset == 'MMVet':
            return True
        return False

    def build_prompt(self, line, dataset=None):
        assert dataset is None or isinstance(dataset, str)
        assert self.use_custom_prompt(dataset)
        tgt_path = self.dump_image(line, dataset)
        question = line['question']

        if dataset == 'MMVet':
            prompt = question + '\nAnswer the question directly. '
        elif DATASET_TYPE(dataset) == 'MCQ':
            options = {
                cand: line[cand]
                for cand in string.ascii_uppercase
                if cand in line and not pd.isna(line[cand])
            }
            options_prompt = ''
            for key, item in options.items():
                options_prompt += f'{key}. {item}\n'

            hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
            prompt = f'Hint: {hint}\n' if hint is not None else ''
            prompt += f'{question}\n'
            prompt += (
                f'{options_prompt}\nAnswer with the option’s letter from the given choices directly. '
                if len(options) else 'Answer the question directly. '
            )
        else:
            raise NotImplementedError

        message = [dict(type='text', value=prompt)]
        message.extend([dict(type='image', value=s) for s in tgt_path])
        return message
VLMEvalKit/vlmeval/vlm/llama_vision.py
0 → 100644
View file @
bc5ebf0f
import torch
from PIL import Image
import os.path as osp
import sys
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE


class llama_vision(BaseModel):

    INSTALL_REQ = False
    INTERLEAVE = False

    # This function is used to split Llama-3.2-90B
    def split_model(self):
        import math
        device_map = {}
        num_gpus = torch.cuda.device_count()
        rank, world_size = get_rank_and_world_size()
        num_gpus = num_gpus // world_size

        num_layers = 100
        # GPU0: -5, GPU-1: -7
        total_cost = num_layers + 5 + 7

        # Since the first GPU will be used for ViT, treat it as 0.8 GPU.
        num_layers_per_gpu = total_cost // num_gpus
        num_layers_per_gpu = [num_layers_per_gpu] * num_gpus
        # The total number of GPUs might be odd
        num_layers_per_gpu[-1] = total_cost - sum(num_layers_per_gpu[:-1])
        num_layers_per_gpu[0] -= 5
        num_layers_per_gpu[-1] -= 7

        layer_cnt = 0
        for i, num_layer in enumerate(num_layers_per_gpu):
            for j in range(num_layer):
                device_map[f'language_model.model.layers.{layer_cnt}'] = rank + world_size * i
                layer_cnt += 1

        device_map['vision_model'] = rank
        device_map['language_model.model.embed_tokens'] = rank
        device_map['language_model.model.rotary_emb'] = rank
        device_map['language_model.model.norm'] = rank + world_size * (num_gpus - 1)
        device_map['language_model.lm_head'] = rank + world_size * (num_gpus - 1)
        device_map['multi_modal_projector'] = rank + world_size * (num_gpus - 1)
        return device_map

    def __init__(self, model_path='meta-llama/Llama-3.2-11B-Vision-Instruct', **kwargs):
        try:
            from transformers import MllamaForConditionalGeneration, AutoProcessor
        except Exception as e:
            logging.critical('Please install transformers>=4.45.0 before using llama_vision.')
            raise e

        rank, world_size = get_rank_and_world_size()

        if '11b' in model_path.lower() and auto_split_flag():
            assert world_size == 1, 'We only support world_size == 1 when AUTO_SPLIT is set for Llama-3.2-11B'
            logging.warning('Currently, we only support to split the 11B model across all GPUs.')
            self.model = MllamaForConditionalGeneration.from_pretrained(
                model_path,
                torch_dtype=torch.bfloat16,
                device_map='auto',
            ).eval()
        elif '90b' in model_path.lower():
            device_map = self.split_model()
            self.model = MllamaForConditionalGeneration.from_pretrained(
                model_path,
                torch_dtype=torch.bfloat16,
                device_map=device_map,
            ).eval()
        else:
            self.model = MllamaForConditionalGeneration.from_pretrained(
                model_path,
                torch_dtype=torch.bfloat16,
                device_map='cpu',
            ).cuda().eval()

        self.device = 'cuda'
        self.processor = AutoProcessor.from_pretrained(model_path)
        if 'Instruct' in model_path:
            kwargs_default = dict(do_sample=True, temperature=0.6, top_p=0.9)
        else:
            kwargs_default = dict(do_sample=False, max_new_tokens=512, temperature=0.0, top_p=None, num_beams=1)
        kwargs.update(kwargs_default)
        print(f'Following kwargs received: {kwargs}, will use as generation config. ')
        self.kwargs = kwargs
        self.model_name = model_path

    def use_custom_prompt(self, dataset):
        if dataset is None:
            return False
        if listinstr(['AI2D', 'MMMU', 'MathVista', 'ChartQA', 'DocVQA'], dataset):
            # For certain datasets we use a custom prompt
            return True
        else:
            return False

    def build_prompt(self, line, dataset=None):
        assert self.use_custom_prompt(dataset)
        assert dataset is None or isinstance(dataset, str)
        tgt_path = self.dump_image(line, dataset)

        question = line['question']
        options = {
            cand: line[cand]
            for cand in string.ascii_uppercase
            if cand in line and not pd.isna(line[cand])
        }
        if listinstr(['AI2D'], dataset):
            self.kwargs['max_new_tokens'] = 400
            for key, item in options.items():
                question += f'\n{key}. {item}'
            if '11B' in self.model_name:
                prompt = (
                    f'Look at the scientific diagram carefully and answer the following question: {question}\n'
                    f'Think step by step and finally respond to the question '
                    f"with only the correct option number as \"FINAL ANSWER\"."
                    f"<cot_start>Let's think step by step."
                )
            elif '90B' in self.model_name:
                prompt = (
                    f'Look at the scientific diagram carefully and answer the following question: {question}\n'
                    f'Respond only with the correct option digit.'
                )
        elif listinstr(['MMMU'], dataset):
            self.kwargs['max_new_tokens'] = 2048
            options = '\n'.join([f'{key}. {item}' for key, item in options.items()])
            prompt = (
                f'Look at the image carefully and solve the following question step-by-step. '
                f'Question: {question} Options: {options} Indicate the correct answer at the end.'
            )
            for i in range(len(tgt_path)):
                prompt = prompt.replace(f'<image {i + 1}>', '')
        elif listinstr(['MathVista'], dataset):
            self.kwargs['max_new_tokens'] = 2048
            prompt = f'{question}'
        elif listinstr(['ChartQA'], dataset):
            self.kwargs['max_new_tokens'] = 512
            if '11B' in self.model_name:
                prompt = (
                    f'You are provided a chart image and will be asked a question. '
                    f'You have to think through your answer and provide a step-by-step solution. '
                    f'Once you have the solution, write the final answer in at most a few words at the end '
                    f"with the phrase \"FINAL ANSWER:\". "
                    f"The question is: {question}<cot_start>Let's think step by step."
                )
            elif '90B' in self.model_name:
                prompt = (
                    f'You are provided a chart image and will be asked a question. '
                    f'Follow these steps carefully:\n'
                    f'Step 1: Analyze the question to understand what specific data or information is being asked for. '
                    f'Focus on whether the question is asking for a specific number or category '
                    f'from the chart image.\n'
                    f'Step 2: Identify any numbers, categories, or groups mentioned in the question '
                    f'and take note of them. Focus on detecting and matching them directly to the image.\n'
                    f'Step 3: Study the image carefully and find the relevant data corresponding to the categories '
                    f'or numbers mentioned. Avoid unnecessary assumptions or calculations; '
                    f'simply read the correct data from the image.\n'
                    f'Step 4: Develop a clear plan to solve the question by locating the right data. '
                    f'Focus only on the specific category or group that matches the question.\n'
                    f'Step 5: Use step-by-step reasoning to ensure you are referencing the correct numbers '
                    f'or data points from the image, avoiding unnecessary extra steps or interpretations.\n'
                    f"Step 6: Provide the final answer, starting with \"FINAL ANSWER:\" "
                    f'and using as few words as possible, '
                    f'simply stating the number or data point requested.\n\n'
                    f"The question is: {question}<cot_start>Let's think step by step."
                )
        elif listinstr(['DocVQA'], dataset):
            self.kwargs['max_new_tokens'] = 512
            prompt = (
                f'Read the text in the image carefully and answer the question '
                f'with the text as seen exactly in the image. '
                f'For yes/no questions, just respond Yes or No. '
                f'If the answer is numeric, just respond with the number and nothing else. '
                f'If the answer has multiple words, just respond with the words and absolutely nothing else. '
                f'Never respond in a sentence or a phrase.\nQuestion: {question}'
            )
        else:
            raise NotImplementedError(f'Dataset {dataset} not supported.')

        message = [dict(type='text', value=prompt)]
        message.extend([dict(type='image', value=s) for s in tgt_path])
        return message

    def generate_inner(self, message, dataset=None):
        prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
        image = Image.open(image_path)
        messages = [
            {'role': 'user', 'content': [
                {'type': 'image'},
                {'type': 'text', 'text': prompt}
            ]}
        ]
        input_text = self.processor.apply_chat_template(messages, add_generation_prompt=True)
        inputs = self.processor(image, input_text, return_tensors='pt').to(self.device)
        if not self.use_custom_prompt(dataset):
            if dataset is not None and DATASET_TYPE(dataset) in ['MCQ', 'Y/N']:
                self.kwargs['max_new_tokens'] = 128
            else:
                self.kwargs['max_new_tokens'] = 512
        output = self.model.generate(**inputs, **self.kwargs)
        return self.processor.decode(output[0][inputs['input_ids'].shape[1]:]).replace('<|eot_id|>', '')
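The layer-budget arithmetic inside `split_model` above is easier to see in isolation. Below is a minimal sketch of just that step, assuming 8 GPUs and the same 100-decoder-layer budget as above; the helper name `plan_layer_split` is invented for illustration and does not exist in this repository.

def plan_layer_split(num_gpus, num_layers=100, first_gpu_cost=5, last_gpu_cost=7):
    # Treat the extra modules on the first/last GPU as costing a few "layers".
    total_cost = num_layers + first_gpu_cost + last_gpu_cost
    per_gpu = [total_cost // num_gpus] * num_gpus
    per_gpu[-1] = total_cost - sum(per_gpu[:-1])   # absorb the rounding remainder
    per_gpu[0] -= first_gpu_cost                   # room for vision_model + embed_tokens
    per_gpu[-1] -= last_gpu_cost                   # room for norm, lm_head, projector
    return per_gpu


if __name__ == '__main__':
    # 8 GPUs -> [9, 14, 14, 14, 14, 14, 14, 7]; the counts sum to 100 decoder layers.
    print(plan_layer_split(num_gpus=8))

The first GPU therefore hosts fewer decoder layers (it also holds the vision tower and embeddings), and the last GPU hosts the norm, lm_head and projector, mirroring the device_map assignments in the method above.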
VLMEvalKit/vlmeval/vlm/llava/__init__.py
0 → 100644
View file @
bc5ebf0f
from .llava import LLaVA, LLaVA_Next, LLaVA_Next2, LLaVA_OneVision, LLaVA_OneVision_HF
from .llava_xtuner import LLaVA_XTuner

__all__ = ['LLaVA', 'LLaVA_Next', 'LLaVA_XTuner', 'LLaVA_Next2', 'LLaVA_OneVision', 'LLaVA_OneVision_HF']
VLMEvalKit/vlmeval/vlm/llava/llava.py
0 → 100644
View file @
bc5ebf0f
import
torch
from
PIL
import
Image
from
abc
import
abstractproperty
import
sys
import
os.path
as
osp
from
..base
import
BaseModel
from
...smp
import
*
from
...dataset
import
DATASET_TYPE
,
DATASET_MODALITY
import
copy
import
requests
class
LLaVA
(
BaseModel
):
INSTALL_REQ
=
True
INTERLEAVE
=
True
def
__init__
(
self
,
model_path
=
"liuhaotian/llava_v1.5_7b"
,
**
kwargs
):
try
:
from
llava.model.builder
import
load_pretrained_model
from
llava.mm_utils
import
get_model_name_from_path
except
Exception
as
err
:
logging
.
critical
(
"Please install llava from https://github.com/haotian-liu/LLaVA"
)
raise
err
assert
osp
.
exists
(
model_path
)
or
splitlen
(
model_path
)
==
2
self
.
system_prompt
=
(
"A chat between a curious human and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the human's questions. "
)
self
.
stop_str
=
"</s>"
if
model_path
==
"Lin-Chen/ShareGPT4V-7B"
:
model_name
=
"llava-v1.5-7b"
elif
model_path
==
"Lin-Chen/ShareGPT4V-13B"
:
model_name
=
"llava-v1.5-13b"
else
:
model_name
=
get_model_name_from_path
(
model_path
)
try
:
self
.
tokenizer
,
self
.
model
,
self
.
image_processor
,
self
.
context_len
=
(
load_pretrained_model
(
model_path
=
model_path
,
model_base
=
None
,
model_name
=
model_name
,
device
=
"cpu"
,
device_map
=
"cpu"
,
)
)
except
Exception
as
err
:
if
"ShareGPT4V"
in
model_path
:
import
llava
logging
.
critical
(
"Please manually remove the encoder type check in "
f
"
{
llava
.
__path__
[
0
]
}
/model/multimodal_encoder/builder.py "
"Line 8 to use the ShareGPT4V model. "
)
else
:
logging
.
critical
(
"Unknown error when loading LLaVA model."
)
raise
err
self
.
model
=
self
.
model
.
cuda
()
self
.
conv_mode
=
"llava_v1"
kwargs_default
=
dict
(
do_sample
=
False
,
temperature
=
0
,
max_new_tokens
=
512
,
top_p
=
None
,
num_beams
=
1
,
use_cache
=
True
,
)
# noqa E501
kwargs_default
.
update
(
kwargs
)
self
.
kwargs
=
kwargs_default
warnings
.
warn
(
f
"Following kwargs received:
{
self
.
kwargs
}
, will use as generation config. "
)
def
use_custom_prompt
(
self
,
dataset
):
assert
dataset
is
not
None
if
DATASET_TYPE
(
dataset
)
==
"MCQ"
:
return
True
return
False
def
build_prompt
(
self
,
line
,
dataset
=
None
):
assert
self
.
use_custom_prompt
(
dataset
)
assert
dataset
is
None
or
isinstance
(
dataset
,
str
)
tgt_path
=
self
.
dump_image
(
line
,
dataset
)
question
=
line
[
"question"
]
hint
=
line
[
"hint"
]
if
(
"hint"
in
line
and
not
pd
.
isna
(
line
[
"hint"
]))
else
None
if
hint
is
not
None
:
question
=
hint
+
"
\n
"
+
question
options
=
{
cand
:
line
[
cand
]
for
cand
in
string
.
ascii_uppercase
if
cand
in
line
and
not
pd
.
isna
(
line
[
cand
])
}
for
key
,
item
in
options
.
items
():
question
+=
f
"
\n
{
key
}
.
{
item
}
"
prompt
=
question
if
len
(
options
):
prompt
+=
(
"
\n
请直接回答选项字母。"
if
cn_string
(
prompt
)
else
"
\n
Answer with the option's letter from the given choices directly."
)
else
:
prompt
+=
(
"
\n
请直接回答问题。"
if
cn_string
(
prompt
)
else
"
\n
Answer the question directly."
)
message
=
[
dict
(
type
=
"image"
,
value
=
s
)
for
s
in
tgt_path
]
message
.
append
(
dict
(
type
=
"text"
,
value
=
prompt
))
return
message
def
concat_tilist
(
self
,
message
):
text
,
images
=
""
,
[]
for
item
in
message
:
if
item
[
"type"
]
==
"text"
:
text
+=
item
[
"value"
]
elif
item
[
"type"
]
==
"image"
:
text
+=
" <image> "
images
.
append
(
item
[
"value"
])
return
text
,
images
def
chat_inner
(
self
,
message
,
dataset
=
None
):
from
llava.mm_utils
import
(
process_images
,
tokenizer_image_token
,
KeywordsStoppingCriteria
,
)
from
llava.constants
import
IMAGE_TOKEN_INDEX
prompt
=
self
.
system_prompt
images
=
[]
for
utter
in
message
:
prompt
+=
"USER: "
if
utter
[
"role"
]
==
"user"
else
"ASSISTANT: "
content
,
images_sub
=
self
.
concat_tilist
(
utter
[
"content"
])
prompt
+=
content
images
.
extend
(
images_sub
)
prompt
+=
" "
if
utter
[
"role"
]
==
"user"
else
self
.
stop_str
assert
message
[
-
1
][
"role"
]
==
"user"
,
message
prompt
+=
"ASSISTANT: "
images
=
[
Image
.
open
(
s
).
convert
(
"RGB"
)
for
s
in
images
]
args
=
abstractproperty
()
args
.
image_aspect_ratio
=
"pad"
image_tensor
=
process_images
(
images
,
self
.
image_processor
,
args
).
to
(
"cuda"
,
dtype
=
torch
.
float16
)
input_ids
=
(
tokenizer_image_token
(
prompt
,
self
.
tokenizer
,
IMAGE_TOKEN_INDEX
,
return_tensors
=
"pt"
)
.
unsqueeze
(
0
)
.
cuda
()
)
keywords
=
[
self
.
stop_str
]
stopping_criteria
=
KeywordsStoppingCriteria
(
keywords
,
self
.
tokenizer
,
input_ids
)
with
torch
.
inference_mode
():
output_ids
=
self
.
model
.
generate
(
input_ids
,
images
=
image_tensor
,
stopping_criteria
=
[
stopping_criteria
],
**
self
.
kwargs
,
)
output
=
self
.
tokenizer
.
batch_decode
(
output_ids
,
skip_special_tokens
=
True
)[
0
].
strip
()
return
output
def
generate_inner
(
self
,
message
,
dataset
=
None
):
from
llava.mm_utils
import
(
process_images
,
tokenizer_image_token
,
KeywordsStoppingCriteria
,
)
from
llava.constants
import
IMAGE_TOKEN_INDEX
# Support interleave text and image
content
,
images
=
self
.
concat_tilist
(
message
)
images
=
[
Image
.
open
(
s
).
convert
(
"RGB"
)
for
s
in
images
]
args
=
abstractproperty
()
args
.
image_aspect_ratio
=
"pad"
if
images
:
image_tensor
=
process_images
(
images
,
self
.
image_processor
,
args
).
to
(
"cuda"
,
dtype
=
torch
.
float16
)
else
:
image_tensor
=
None
prompt
=
self
.
system_prompt
+
"USER: "
+
content
+
" ASSISTANT: "
input_ids
=
(
tokenizer_image_token
(
prompt
,
self
.
tokenizer
,
IMAGE_TOKEN_INDEX
,
return_tensors
=
"pt"
)
.
unsqueeze
(
0
)
.
cuda
()
)
keywords
=
[
self
.
stop_str
]
stopping_criteria
=
KeywordsStoppingCriteria
(
keywords
,
self
.
tokenizer
,
input_ids
)
with
torch
.
inference_mode
():
output_ids
=
self
.
model
.
generate
(
input_ids
,
images
=
image_tensor
,
stopping_criteria
=
[
stopping_criteria
],
**
self
.
kwargs
,
)
output
=
self
.
tokenizer
.
batch_decode
(
output_ids
,
skip_special_tokens
=
True
)[
0
].
strip
()
return
output
class
LLaVA_Next
(
BaseModel
):
INSTALL_REQ
=
False
INTERLEAVE
=
True
def
__init__
(
self
,
model_path
=
"llava-hf/llava-v1.6-vicuna-7b-hf"
,
**
kwargs
):
import
transformers
from
transformers
import
(
LlavaNextProcessor
,
LlavaNextForConditionalGeneration
,
AutoProcessor
,
LlavaForConditionalGeneration
,
)
self
.
model_path
=
model_path
if
"34b"
in
model_path
.
lower
():
self
.
processor
=
LlavaNextProcessor
.
from_pretrained
(
self
.
model_path
,
use_fast
=
False
)
elif
"interleave"
in
model_path
.
lower
():
self
.
processor
=
AutoProcessor
.
from_pretrained
(
self
.
model_path
)
else
:
self
.
processor
=
LlavaNextProcessor
.
from_pretrained
(
self
.
model_path
)
flash_attn_flag
=
False
try
:
import
flash_attn
flash_attn_flag
=
True
except
ImportError
:
pass
if
flash_attn_flag
:
if
"interleave"
in
model_path
.
lower
():
model
=
LlavaForConditionalGeneration
.
from_pretrained
(
self
.
model_path
,
torch_dtype
=
torch
.
float16
,
low_cpu_mem_usage
=
True
,
use_flash_attention_2
=
True
,
)
else
:
model
=
LlavaNextForConditionalGeneration
.
from_pretrained
(
self
.
model_path
,
torch_dtype
=
torch
.
float16
,
low_cpu_mem_usage
=
True
,
use_flash_attention_2
=
True
,
)
else
:
if
"interleave"
in
model_path
.
lower
():
model
=
LlavaForConditionalGeneration
.
from_pretrained
(
self
.
model_path
,
torch_dtype
=
torch
.
float16
,
low_cpu_mem_usage
=
True
)
else
:
model
=
LlavaNextForConditionalGeneration
.
from_pretrained
(
self
.
model_path
,
torch_dtype
=
torch
.
float16
,
low_cpu_mem_usage
=
True
)
model
=
model
.
eval
()
self
.
model
=
model
.
cuda
()
kwargs_default
=
dict
(
do_sample
=
False
,
temperature
=
0
,
max_new_tokens
=
512
,
top_p
=
None
,
num_beams
=
1
)
kwargs_default
.
update
(
kwargs
)
self
.
kwargs
=
kwargs_default
warnings
.
warn
(
f
"Following kwargs received:
{
self
.
kwargs
}
, will use as generation config. "
)
def
apply_prompt_template
(
self
,
prompt
):
model_path
=
self
.
model_path
.
lower
()
if
"mistral"
in
model_path
:
template
=
"[INST] PLACEHOLDER [/INST]"
elif
"vicuna"
in
model_path
:
template
=
(
"A chat between a curious human and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the human's questions. "
"USER: PLACEHOLDER ASSISTANT:"
)
elif
"34b"
in
model_path
:
template
=
(
"<|im_start|>system
\n
Answer the questions.<|im_end|><|im_start|>user
\n
PLACEHOLDER<|im_end|>"
"<|im_start|>assistant
\n
"
)
else
:
raise
NotImplementedError
(
f
"Prompt template for
{
model_path
}
not implemented."
)
prompt
=
template
.
replace
(
"PLACEHOLDER"
,
f
"<image>
\n
{
prompt
}
"
)
return
prompt
def
output_process
(
self
,
answer
):
if
"<s>"
in
answer
:
answer
=
answer
.
replace
(
"<s>"
,
""
).
strip
()
if
"[/INST]"
in
answer
:
answer
=
answer
.
split
(
"[/INST]"
)[
1
].
strip
()
elif
"ASSISTANT:"
in
answer
:
answer
=
answer
.
split
(
"ASSISTANT:"
)[
1
].
strip
()
elif
"assistant
\n
"
in
answer
:
answer
=
answer
.
split
(
"assistant
\n
"
)[
1
].
strip
()
elif
"<|end_header_id|>
\n\n
"
in
answer
:
answer
=
answer
.
split
(
"<|end_header_id|>
\n\n
"
)[
2
].
strip
()
if
"</s>"
in
answer
:
answer
=
answer
.
split
(
"</s>"
)[
0
].
strip
()
elif
"<|im_end|>"
in
answer
:
answer
=
answer
.
split
(
"<|im_end|>"
)[
0
].
strip
()
elif
"<|eot_id|>"
in
answer
:
answer
=
answer
.
split
(
"<|eot_id|>"
)[
0
].
strip
()
return
answer
def
use_custom_prompt
(
self
,
dataset
):
assert
dataset
is
not
None
if
DATASET_TYPE
(
dataset
)
==
"MCQ"
:
return
True
return
False
def
build_prompt
(
self
,
line
,
dataset
=
None
):
assert
self
.
use_custom_prompt
(
dataset
)
assert
dataset
is
None
or
isinstance
(
dataset
,
str
)
tgt_path
=
self
.
dump_image
(
line
,
dataset
)
question
=
line
[
"question"
]
hint
=
line
[
"hint"
]
if
(
"hint"
in
line
and
not
pd
.
isna
(
line
[
"hint"
]))
else
None
if
hint
is
not
None
:
question
=
hint
+
"
\n
"
+
question
options
=
{
cand
:
line
[
cand
]
for
cand
in
string
.
ascii_uppercase
if
cand
in
line
and
not
pd
.
isna
(
line
[
cand
])
}
for
key
,
item
in
options
.
items
():
question
+=
f
"
\n
{
key
}
.
{
item
}
"
prompt
=
question
if
len
(
options
):
prompt
+=
(
"
\n
请直接回答选项字母。"
if
cn_string
(
prompt
)
else
"
\n
Answer with the option's letter from the given choices directly."
)
else
:
prompt
+=
(
"
\n
请直接回答问题。"
if
cn_string
(
prompt
)
else
"
\n
Answer the question directly."
)
message
=
[
dict
(
type
=
"image"
,
value
=
s
)
for
s
in
tgt_path
]
message
.
append
(
dict
(
type
=
"text"
,
value
=
prompt
))
return
message
def
generate_inner
(
self
,
message
,
dataset
=
None
):
content
,
images
=
[],
[]
for
msg
in
message
:
if
msg
[
"type"
]
==
"text"
:
content
.
append
({
"type"
:
msg
[
"type"
],
"text"
:
msg
[
"value"
]})
else
:
content
.
append
({
"type"
:
"image"
})
images
.
append
(
Image
.
open
(
msg
[
"value"
]).
convert
(
"RGB"
))
conversation
=
[
{
"role"
:
"user"
,
"content"
:
content
,
}
]
prompt
=
self
.
processor
.
apply_chat_template
(
conversation
,
add_generation_prompt
=
True
)
inputs
=
self
.
processor
(
prompt
,
images
,
return_tensors
=
"pt"
).
to
(
"cuda"
,
torch
.
float16
)
output
=
self
.
model
.
generate
(
**
inputs
,
**
self
.
kwargs
)
answer
=
self
.
processor
.
decode
(
output
[
0
],
skip_special_token
=
True
)
answer
=
self
.
output_process
(
answer
)
return
answer
class
LLaVA_Next2
(
BaseModel
):
INSTALL_REQ
=
True
INTERLEAVE
=
True
DEFAULT_IMAGE_TOKEN
=
"<image>"
IMAGE_TOKEN_INDEX
=
-
200
def
__init__
(
self
,
model_path
=
"lmms-lab/llama3-llava-next-8b"
,
**
kwargs
):
assert
model_path
is
not
None
try
:
from
llava.model.builder
import
load_pretrained_model
from
llava.conversation
import
conv_templates
,
SeparatorStyle
from
llava.mm_utils
import
(
get_model_name_from_path
,
tokenizer_image_token
,
KeywordsStoppingCriteria
,
)
except
Exception
as
err
:
logging
.
critical
(
"Please `pip install git+https://github.com/LLaVA-VL/LLaVA-NeXT.git`"
)
raise
err
model_name
=
get_model_name_from_path
(
model_path
)
tokenizer
,
model
,
image_processor
,
_
=
load_pretrained_model
(
model_path
,
None
,
model_name
,
device_map
=
None
)
model
.
cuda
().
eval
()
model
.
tie_weights
()
if
"llama3"
in
model_path
.
lower
():
conv_mode
=
"llava_llama_3"
elif
"qwen"
in
model_path
.
lower
():
conv_mode
=
"qwen_1_5"
self
.
conv_template
=
conv_mode
self
.
conv_templates
=
conv_templates
self
.
tokenizer
=
tokenizer
self
.
model
=
model
self
.
image_processor
=
image_processor
self
.
tokenizer_image_token
=
tokenizer_image_token
self
.
KeywordStoppingCriteria
=
KeywordsStoppingCriteria
self
.
SeparatorStyle
=
SeparatorStyle
def
generate_inner
(
self
,
message
,
dataset
=
None
):
content
,
images
=
""
,
[]
for
msg
in
message
:
if
msg
[
"type"
]
==
"text"
:
content
+=
msg
[
"value"
]
else
:
images
.
append
(
Image
.
open
(
msg
[
"value"
]).
convert
(
"RGB"
))
content
+=
self
.
DEFAULT_IMAGE_TOKEN
+
"
\n
"
preprocess
=
self
.
image_processor
.
preprocess
image_tokenizer
=
self
.
tokenizer_image_token
image_tensor
=
[
preprocess
(
f
,
return_tensors
=
"pt"
)[
"pixel_values"
][
0
].
half
().
cuda
()
for
f
in
images
]
image_tensor
=
torch
.
stack
(
image_tensor
)
conv
=
copy
.
deepcopy
(
self
.
conv_templates
[
self
.
conv_template
])
conv
.
append_message
(
conv
.
roles
[
0
],
content
)
conv
.
append_message
(
conv
.
roles
[
1
],
None
)
prompt_question
=
conv
.
get_prompt
()
input_ids
=
image_tokenizer
(
prompt_question
,
self
.
tokenizer
,
self
.
IMAGE_TOKEN_INDEX
,
return_tensors
=
"pt"
)
input_ids
=
input_ids
.
unsqueeze
(
0
).
cuda
()
stop_str
=
conv
.
sep
if
conv
.
sep_style
!=
self
.
SeparatorStyle
.
TWO
else
conv
.
sep2
keywords
=
[
stop_str
]
stopping_criteria
=
self
.
KeywordStoppingCriteria
(
keywords
,
self
.
tokenizer
,
input_ids
)
cont
=
self
.
model
.
generate
(
input_ids
,
images
=
image_tensor
,
do_sample
=
False
,
temperature
=
0
,
max_new_tokens
=
512
,
stopping_criteria
=
[
stopping_criteria
],
)
text_outputs
=
self
.
tokenizer
.
batch_decode
(
cont
,
skip_special_tokens
=
True
)[
0
]
return
text_outputs
class
LLaVA_OneVision
(
BaseModel
):
INSTALL_REQ
=
True
INTERLEAVE
=
True
VIDEO_LLM
=
True
DEFAULT_IMAGE_TOKEN
=
"<image>"
IMAGE_TOKEN_INDEX
=
-
200
# This function is used to split InternVL2-Llama3-76B
def
split_model
(
self
,
model_path
):
import
math
device_map
=
{}
num_gpus
=
torch
.
cuda
.
device_count
()
rank
,
world_size
=
get_rank_and_world_size
()
num_gpus
=
num_gpus
//
world_size
if
"72b"
not
in
model_path
.
lower
():
return
None
# embed_tokens, vision_tower, mm_projector, lm_head are treated as 2 layers
num_layers
=
80
+
8
num_layers_per_gpu
=
math
.
ceil
(
num_layers
/
num_gpus
)
num_layers_per_gpu
=
[
num_layers_per_gpu
]
*
num_gpus
num_layers_per_gpu
[
0
]
-=
6
num_layers_per_gpu
[
-
1
]
-=
2
layer_cnt
=
0
for
i
,
num_layer
in
enumerate
(
num_layers_per_gpu
):
for
j
in
range
(
num_layer
):
device_map
[
f
"model.layers.
{
layer_cnt
}
"
]
=
rank
+
world_size
*
i
layer_cnt
+=
1
last_gpu
=
rank
+
world_size
*
(
num_gpus
-
1
)
device_map
[
"model.image_newline"
]
=
rank
device_map
[
"model.embed_tokens"
]
=
rank
device_map
[
"model.norm"
]
=
rank
device_map
[
"model.vision_tower"
]
=
rank
device_map
[
"model.vision_resampler"
]
=
rank
device_map
[
"model.mm_projector"
]
=
rank
device_map
[
"lm_head"
]
=
last_gpu
return
device_map
def
__init__
(
self
,
model_path
=
"lmms-lab/llava-onevision-qwen2-7b-si"
,
**
kwargs
):
assert
model_path
is
not
None
try
:
from
llava.model.builder
import
load_pretrained_model
from
llava.conversation
import
conv_templates
,
SeparatorStyle
from
llava.mm_utils
import
(
get_model_name_from_path
,
process_images
,
tokenizer_image_token
,
KeywordsStoppingCriteria
,
)
# noqa: E501
except
Exception
as
err
:
logging
.
critical
(
"Please `pip install git+https://github.com/LLaVA-VL/LLaVA-NeXT.git`"
)
raise
err
video_kwargs_default
=
dict
(
overwrite
=
True
,
mm_spatial_pool_mode
=
"average"
,
force_sample
=
True
)
video_kwargs_default
.
update
(
kwargs
)
self
.
video_kwargs
=
video_kwargs_default
overwrite_config
=
None
if
"video"
in
model_path
.
lower
():
if
self
.
video_kwargs
[
"overwrite"
]:
overwrite_config
=
{}
overwrite_config
[
"mm_spatial_pool_mode"
]
=
self
.
video_kwargs
[
"mm_spatial_pool_mode"
]
rank
,
world_size
=
get_rank_and_world_size
()
model_name
=
get_model_name_from_path
(
model_path
)
device_map
=
self
.
split_model
(
model_path
)
if
device_map
is
None
:
if
auto_split_flag
():
assert
world_size
==
1
,
'Only support world_size == 1 when AUTO_SPLIT set for non-72B LLaVA-OneVision'
logging
.
warning
(
'Currently, we only support to split the non-72B model across all GPUs.'
)
tokenizer
,
model
,
image_processor
,
_
=
load_pretrained_model
(
model_path
,
None
,
model_name
,
device_map
=
"auto"
,
overwrite_config
=
overwrite_config
,
)
else
:
tokenizer
,
model
,
image_processor
,
_
=
load_pretrained_model
(
model_path
,
None
,
model_name
,
device_map
=
"cpu"
,
overwrite_config
=
overwrite_config
,
)
model
.
cuda
()
else
:
tokenizer
,
model
,
image_processor
,
_
=
load_pretrained_model
(
model_path
,
None
,
model_name
,
device_map
=
device_map
,
overwrite_config
=
overwrite_config
,
)
model
.
eval
()
model
.
tie_weights
()
if
"llava"
in
model_path
.
lower
():
conv_mode
=
"qwen_1_5"
if
'llava-video'
in
model_path
.
lower
():
self
.
nframe
=
64
else
:
self
.
nframe
=
16
if
"72b"
in
model_path
.
lower
():
self
.
nframe
=
32
if
"video"
in
model_path
.
lower
():
self
.
force_sample
=
self
.
video_kwargs
[
"force_sample"
]
else
:
self
.
force_sample
=
False
self
.
conv_template
=
conv_mode
self
.
conv_templates
=
conv_templates
self
.
tokenizer
=
tokenizer
self
.
model
=
model
self
.
image_processor
=
image_processor
self
.
tokenizer_image_token
=
tokenizer_image_token
self
.
process_images
=
(
process_images
# Store process_images as a class attribute
)
self
.
KeywordStoppingCriteria
=
KeywordsStoppingCriteria
self
.
SeparatorStyle
=
SeparatorStyle
def
generate_inner_image
(
self
,
message
,
dataset
=
None
):
content
,
images
=
""
,
[]
image_sizes
=
[]
# Store image sizes
for
msg
in
message
:
if
msg
[
"type"
]
==
"text"
:
content
+=
msg
[
"value"
]
else
:
img
=
Image
.
open
(
msg
[
"value"
]).
convert
(
"RGB"
)
images
.
append
(
img
)
image_sizes
.
append
(
img
.
size
)
# Store the size of each image
content
+=
self
.
DEFAULT_IMAGE_TOKEN
+
"
\n
"
# Process images using the class attribute self.process_images
image_tensor
=
self
.
process_images
(
images
,
self
.
image_processor
,
self
.
model
.
config
)
image_tensor
=
[
_image
.
to
(
dtype
=
torch
.
float16
,
device
=
"cuda"
)
for
_image
in
image_tensor
]
conv
=
copy
.
deepcopy
(
self
.
conv_templates
[
self
.
conv_template
])
conv
.
append_message
(
conv
.
roles
[
0
],
content
)
conv
.
append_message
(
conv
.
roles
[
1
],
None
)
prompt_question
=
conv
.
get_prompt
()
input_ids
=
self
.
tokenizer_image_token
(
prompt_question
,
self
.
tokenizer
,
self
.
IMAGE_TOKEN_INDEX
,
return_tensors
=
"pt"
)
input_ids
=
input_ids
.
unsqueeze
(
0
).
cuda
()
stop_str
=
conv
.
sep
if
conv
.
sep_style
!=
self
.
SeparatorStyle
.
TWO
else
conv
.
sep2
keywords
=
[
stop_str
]
stopping_criteria
=
self
.
KeywordStoppingCriteria
(
keywords
,
self
.
tokenizer
,
input_ids
)
# Pass image sizes along with other parameters
cont
=
self
.
model
.
generate
(
input_ids
,
images
=
image_tensor
,
image_sizes
=
image_sizes
,
# Pass the image sizes here
do_sample
=
False
,
temperature
=
0
,
max_new_tokens
=
512
,
stopping_criteria
=
[
stopping_criteria
],
)
text_outputs
=
self
.
tokenizer
.
batch_decode
(
cont
,
skip_special_tokens
=
True
)[
0
]
return
text_outputs
def
generate_inner_video
(
self
,
message
,
dataset
=
None
):
content
,
text_content
,
visual_content
,
videos
=
""
,
""
,
""
,
[]
for
msg
in
message
:
if
msg
[
"type"
]
==
"text"
:
text_content
+=
msg
[
"value"
]
else
:
videos
.
append
(
msg
[
"value"
])
visual_content
+=
self
.
DEFAULT_IMAGE_TOKEN
+
"
\n
"
if
len
(
videos
)
>
1
:
raise
ValueError
(
"LLaVA-OneVision does not support multiple videos as input."
)
video_frames
,
frame_time
,
video_time
=
self
.
load_video
(
videos
[
0
],
self
.
nframe
,
self
.
force_sample
)
time_instruciton
=
(
f
"The video lasts for
{
video_time
:.
2
f
}
seconds,"
f
"and
{
len
(
video_frames
[
0
])
}
frames are uniformly sampled from it."
f
"These frames are located at
{
frame_time
}
."
f
"Please answer the following questions related to this video.
\n
"
)
if
self
.
force_sample
:
content
=
visual_content
+
time_instruciton
+
text_content
else
:
content
=
visual_content
+
text_content
image_tensors
=
[]
frames
=
(
self
.
image_processor
.
preprocess
(
video_frames
,
return_tensors
=
"pt"
)[
"pixel_values"
]
.
half
()
.
cuda
()
)
image_tensors
.
append
(
frames
)
conv
=
copy
.
deepcopy
(
self
.
conv_templates
[
self
.
conv_template
])
conv
.
append_message
(
conv
.
roles
[
0
],
content
)
conv
.
append_message
(
conv
.
roles
[
1
],
None
)
prompt_question
=
conv
.
get_prompt
()
input_ids
=
self
.
tokenizer_image_token
(
prompt_question
,
self
.
tokenizer
,
self
.
IMAGE_TOKEN_INDEX
,
return_tensors
=
"pt"
)
input_ids
=
input_ids
.
unsqueeze
(
0
).
cuda
()
image_sizes
=
[
frame
.
size
for
frame
in
video_frames
]
modalities
=
[
"video"
]
*
len
(
video_frames
)
stop_str
=
conv
.
sep
if
conv
.
sep_style
!=
self
.
SeparatorStyle
.
TWO
else
conv
.
sep2
keywords
=
[
stop_str
]
stopping_criteria
=
self
.
KeywordStoppingCriteria
(
keywords
,
self
.
tokenizer
,
input_ids
)
# Pass image sizes along with other parameters
cont
=
self
.
model
.
generate
(
input_ids
,
images
=
image_tensors
,
image_sizes
=
image_sizes
,
# Pass the image sizes here
do_sample
=
False
,
temperature
=
0
,
max_new_tokens
=
512
,
modalities
=
modalities
,
stopping_criteria
=
[
stopping_criteria
],
)
text_outputs
=
self
.
tokenizer
.
batch_decode
(
cont
,
skip_special_tokens
=
True
)[
0
]
return
text_outputs
def
load_video
(
self
,
video_path
,
max_frames_num
,
force_sample
=
False
,
fps
=
1
):
from
decord
import
VideoReader
,
cpu
import
numpy
as
np
if
max_frames_num
==
0
:
return
np
.
zeros
((
1
,
336
,
336
,
3
))
vr
=
VideoReader
(
video_path
,
ctx
=
cpu
(
0
),
num_threads
=
1
)
total_frame_num
=
len
(
vr
)
video_time
=
total_frame_num
/
vr
.
get_avg_fps
()
fps
=
round
(
vr
.
get_avg_fps
()
/
fps
)
frame_idx
=
[
i
for
i
in
range
(
0
,
len
(
vr
),
fps
)]
frame_time
=
[
i
/
fps
for
i
in
frame_idx
]
if
len
(
frame_idx
)
>
max_frames_num
or
force_sample
:
sample_fps
=
max_frames_num
uniform_sampled_frames
=
np
.
linspace
(
0
,
total_frame_num
-
1
,
sample_fps
,
dtype
=
int
)
frame_idx
=
uniform_sampled_frames
.
tolist
()
frame_time
=
[
i
/
vr
.
get_avg_fps
()
for
i
in
frame_idx
]
frame_time
=
","
.
join
([
f
"
{
i
:.
2
f
}
s"
for
i
in
frame_time
])
spare_frames
=
vr
.
get_batch
(
frame_idx
).
asnumpy
()
# import pdb;pdb.set_trace()
return
spare_frames
,
frame_time
,
video_time
def
generate_inner
(
self
,
message
,
dataset
=
None
):
if
DATASET_MODALITY
(
dataset
)
==
'VIDEO'
:
return
self
.
generate_inner_video
(
message
,
dataset
)
else
:
return
self
.
generate_inner_image
(
message
,
dataset
)
class
LLaVA_OneVision_HF
(
BaseModel
):
INSTALL_REQ
=
True
INTERLEAVE
=
True
VIDEO_LLM
=
True
DEFAULT_IMAGE_TOKEN
=
"<image>"
IMAGE_TOKEN_INDEX
=
-
200
def
__init__
(
self
,
model_path
=
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
,
**
kwargs
):
from
transformers
import
AutoProcessor
,
LlavaOnevisionForConditionalGeneration
assert
model_path
is
not
None
,
"Model path must be provided."
self
.
model
=
LlavaOnevisionForConditionalGeneration
.
from_pretrained
(
model_path
,
torch_dtype
=
torch
.
float16
,
low_cpu_mem_usage
=
True
).
to
(
'cuda'
)
self
.
processor
=
AutoProcessor
.
from_pretrained
(
model_path
)
self
.
video_kwargs
=
kwargs
.
get
(
"video_kwargs"
,
{})
self
.
force_sample
=
self
.
video_kwargs
.
get
(
"force_sample"
,
False
)
self
.
nframe
=
kwargs
.
get
(
"nframe"
,
8
)
self
.
fps
=
1
self
.
model_path
=
model_path
def
generate_inner_image
(
self
,
message
,
dataset
=
None
):
content
,
images
=
""
,
[]
image_sizes
=
[]
for
msg
in
message
:
if
msg
[
"type"
]
==
"text"
:
content
+=
msg
[
"value"
]
elif
msg
[
"type"
]
==
"image"
:
img
=
Image
.
open
(
msg
[
"value"
]).
convert
(
"RGB"
)
images
.
append
(
img
)
image_sizes
.
append
(
img
.
size
)
content
+=
self
.
DEFAULT_IMAGE_TOKEN
+
"
\n
"
conversation
=
[
{
"role"
:
"user"
,
"content"
:
[
{
"type"
:
"text"
,
"text"
:
content
.
split
(
"
\n
"
,
1
)[
-
1
]},
{
"type"
:
"image"
},
],
}
]
prompt
=
self
.
processor
.
apply_chat_template
(
conversation
,
add_generation_prompt
=
True
)
inputs
=
self
.
processor
(
images
=
images
,
text
=
prompt
,
return_tensors
=
"pt"
).
to
(
'cuda'
,
torch
.
float16
)
output
=
self
.
model
.
generate
(
**
inputs
,
max_new_tokens
=
512
)
if
self
.
model_path
==
"NCSOFT/VARCO-VISION-14B-HF"
:
return
self
.
processor
.
decode
(
output
[
0
][
inputs
.
input_ids
.
shape
[
1
]:],
skip_special_tokens
=
True
)
return
self
.
processor
.
decode
(
output
[
0
],
skip_special_tokens
=
True
)
def
generate_inner_video
(
self
,
message
,
dataset
=
None
):
content
,
text_content
,
visual_content
,
videos
=
""
,
""
,
""
,
[]
for
msg
in
message
:
if
msg
[
"type"
]
==
"text"
:
text_content
+=
msg
[
"value"
]
elif
msg
[
"type"
]
==
"video"
:
videos
.
append
(
msg
[
"value"
])
visual_content
+=
self
.
DEFAULT_IMAGE_TOKEN
+
"
\n
"
if
len
(
videos
)
>
1
:
raise
ValueError
(
"LLaVA-OneVision does not support multiple videos as input."
)
video_frames
,
frame_time
,
video_time
=
self
.
load_video
(
videos
[
0
],
self
.
nframe
,
fps
=
1
,
force_sample
=
self
.
force_sample
)
time_instruction
=
(
f
"The video lasts for
{
video_time
:.
2
f
}
seconds, "
f
"and
{
len
(
video_frames
)
}
frames are uniformly sampled from it. "
f
"These frames are located at
{
frame_time
}
. "
f
"Please answer the following questions related to this video.
\n
"
)
content
=
visual_content
+
time_instruction
+
text_content
conversation
=
[
{
"role"
:
"user"
,
"content"
:
[{
"type"
:
"text"
,
"text"
:
content
},
{
"type"
:
"video"
}],
}
]
prompt
=
self
.
processor
.
apply_chat_template
(
conversation
,
add_generation_prompt
=
True
)
inputs
=
self
.
processor
(
videos
=
video_frames
,
text
=
prompt
,
return_tensors
=
"pt"
).
to
(
'cuda'
,
torch
.
float16
)
output
=
self
.
model
.
generate
(
**
inputs
,
max_new_tokens
=
512
)
return
self
.
processor
.
decode
(
output
[
0
],
skip_special_tokens
=
True
)
def
load_video
(
self
,
video_path
,
max_frames_num
,
fps
=
1
,
force_sample
=
False
):
from
decord
import
VideoReader
,
cpu
import
numpy
as
np
vr
=
VideoReader
(
video_path
,
ctx
=
cpu
(
0
),
num_threads
=
1
)
total_frame_num
=
len
(
vr
)
avg_fps
=
vr
.
get_avg_fps
()
if
avg_fps
==
0
:
raise
ValueError
(
f
"Video '
{
video_path
}
' has an average FPS of 0, which is invalid."
)
if
fps
<=
0
:
raise
ValueError
(
"FPS argument must be greater than 0."
)
effective_fps
=
round
(
avg_fps
/
fps
)
frame_idx
=
list
(
range
(
0
,
total_frame_num
,
effective_fps
))
frame_time
=
[
i
/
avg_fps
for
i
in
frame_idx
]
if
len
(
frame_idx
)
>
max_frames_num
or
force_sample
:
uniform_sampled_frames
=
np
.
linspace
(
0
,
total_frame_num
-
1
,
max_frames_num
,
dtype
=
int
)
frame_idx
=
uniform_sampled_frames
.
tolist
()
frame_time
=
[
i
/
avg_fps
for
i
in
frame_idx
]
frame_time_str
=
", "
.
join
([
f
"
{
t
:.
2
f
}
s"
for
t
in
frame_time
])
video_frames
=
vr
.
get_batch
(
frame_idx
).
asnumpy
()
video_time
=
total_frame_num
/
avg_fps
return
video_frames
,
frame_time_str
,
video_time
def
generate_inner
(
self
,
message
,
dataset
=
None
):
if
DATASET_MODALITY
(
dataset
)
==
"VIDEO"
:
return
self
.
generate_inner_video
(
message
,
dataset
)
else
:
return
self
.
generate_inner_image
(
message
,
dataset
)
VLMEvalKit/vlmeval/vlm/llava/llava_xtuner.py
0 → 100644
View file @
bc5ebf0f
import os
import os.path as osp
import string
import sys
import warnings

import pandas as pd
import torch
from huggingface_hub import snapshot_download
from PIL import Image
from transformers import (AutoModel, AutoModelForCausalLM, AutoTokenizer,
                          CLIPImageProcessor, CLIPVisionModel,
                          GenerationConfig, StoppingCriteriaList)

from ..base import BaseModel
from ...smp import *
from ...dataset import DATASET_TYPE


class LLaVA_XTuner(BaseModel):

    INSTALL_REQ = True
    INTERLEAVE = False

    def __init__(self,
                 llava_path,
                 llm_path=None,
                 visual_encoder_path='openai/clip-vit-large-patch14-336',
                 visual_select_layer=-2,
                 prompt_template=None,
                 stop_words=[],
                 torch_dtype=torch.float16):
        try:
            from peft import PeftModel
            from xtuner.utils import PROMPT_TEMPLATE, StopWordStoppingCriteria
        except Exception as err:
            logging.critical('Please install xtuner with `pip install -U xtuner` before '
                             'using LLaVA_XTuner')
            raise err

        if not osp.isdir(llava_path):
            cache_path = get_cache_path(llava_path)
            if cache_path is not None:
                llava_path = cache_path
            else:
                llava_path = snapshot_download(repo_id=llava_path)
        assert osp.exists(llava_path) and osp.isdir(llava_path)

        # build llm
        if 'llm' in os.listdir(llava_path):
            assert llm_path is None, (
                "Please don't specify the `llm_path` since passed "
                '`llava_path` contains a LLM!')
            llm_path = osp.join(llava_path, 'llm')
        else:
            assert llm_path is not None, 'Please specify the `llm_path`!'

        llm = AutoModelForCausalLM.from_pretrained(llm_path,
                                                   trust_remote_code=True,
                                                   torch_dtype=torch_dtype,
                                                   device_map='cpu')
        tokenizer = AutoTokenizer.from_pretrained(llm_path,
                                                  trust_remote_code=True,
                                                  encode_special_tokens=True)
        print(f'Load LLM from {llm_path}')

        # build visual_encoder
        if 'visual_encoder' in os.listdir(llava_path):
            assert visual_encoder_path is None, (
                "Please don't specify the `visual_encoder_path` since passed "
                '`llava_path` contains a visual encoder!')
            visual_encoder_path = osp.join(llava_path, 'visual_encoder')
        else:
            assert visual_encoder_path is not None, (
                'Please specify the `visual_encoder_path`!')
        visual_encoder = CLIPVisionModel.from_pretrained(visual_encoder_path,
                                                         torch_dtype=torch_dtype,
                                                         device_map='cpu')
        image_processor = CLIPImageProcessor.from_pretrained(visual_encoder_path)
        print(f'Load visual_encoder from {visual_encoder_path}')

        # load adapter
        if 'llm_adapter' in os.listdir(llava_path):
            adapter_path = osp.join(llava_path, 'llm_adapter')
            llm = PeftModel.from_pretrained(llm,
                                            adapter_path,
                                            trust_remote_code=True,
                                            device_map='cpu')
            print(f'Load LLM adapter from {llava_path}')
        if 'visual_encoder_adapter' in os.listdir(llava_path):
            adapter_path = osp.join(llava_path, 'visual_encoder_adapter')
            visual_encoder = PeftModel.from_pretrained(visual_encoder,
                                                       adapter_path,
                                                       trust_remote_code=True,
                                                       device_map='cpu')
            print(f'Load visual_encoder adapter from {llava_path}')

        # build projector
        projector_path = osp.join(llava_path, 'projector')
        projector = AutoModel.from_pretrained(projector_path,
                                              trust_remote_code=True,
                                              torch_dtype=torch_dtype,
                                              device_map='cpu')
        print(f'Load projector from {llava_path}')

        llm.eval()
        visual_encoder.eval()
        projector.eval()

        self.llm = llm.cuda()
        self.tokenizer = tokenizer
        self.visual_encoder = visual_encoder.cuda()
        self.image_processor = image_processor
        self.projector = projector.cuda()
        self.visual_select_layer = visual_select_layer
        if prompt_template is not None:
            # modified prompt template
            if prompt_template == 'llama3_chat':
                self.prompt_template = dict(
                    SYSTEM=('<|start_header_id|>system<|end_header_id|>\n\n'
                            '{system}<|eot_id|>'),
                    INSTRUCTION=('<|start_header_id|>user<|end_header_id|>\n\n{input}<|eot_id|>'
                                 '<|start_header_id|>assistant<|end_header_id|>\n\n'),
                    SUFFIX='<|eot_id|>',
                    SUFFIX_AS_EOS=True,
                    STOP_WORDS=['<|eot_id|>'])
            else:
                self.prompt_template = PROMPT_TEMPLATE[prompt_template]
            stop_words += self.prompt_template.get('STOP_WORDS', [])
        else:
            self.prompt_template = None

        self.stop_criteria = StoppingCriteriaList()
        for word in stop_words:
            self.stop_criteria.append(
                StopWordStoppingCriteria(self.tokenizer, word))

    def build_gen_config(self, dataset):
        gen_kwargs = dict(max_new_tokens=512,
                          do_sample=True,
                          temperature=1,
                          num_beams=5,
                          eos_token_id=self.tokenizer.eos_token_id,
                          pad_token_id=self.tokenizer.pad_token_id
                          if self.tokenizer.pad_token_id is not None else
                          self.tokenizer.eos_token_id)
        # For single word generation
        if (dataset is not None and DATASET_TYPE(dataset) in ['MCQ', 'Y/N']):
            gen_kwargs.update(
                dict(max_new_tokens=5, do_sample=False, num_beams=1))
        return GenerationConfig(**gen_kwargs)

    def use_custom_prompt(self, dataset):
        assert dataset is not None
        if DATASET_TYPE(dataset) == 'MCQ':
            return True
        return False

    def build_prompt(self, line, dataset=None):
        assert self.use_custom_prompt(dataset)
        assert dataset is None or isinstance(dataset, str)
        tgt_path = self.dump_image(line, dataset)

        question = line['question']
        hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
        if hint is not None:
            question = hint + '\n' + question

        options = {
            cand: line[cand]
            for cand in string.ascii_uppercase
            if cand in line and not pd.isna(line[cand])
        }
        for key, item in options.items():
            question += f'\n{key}. {item}'

        if not cn_string(question):
            prompt = question + '\n' + ("Answer with the option's letter "
                                        'from the given choices directly.')
        else:
            prompt = question + '\n' + '请直接回答选项字母。'

        message = [dict(type='text', value=prompt)]
        message.extend([dict(type='image', value=s) for s in tgt_path])
        return message

    def generate_inner(self, message, dataset=None):
        from xtuner.dataset.utils import expand2square
        from xtuner.model.utils import prepare_inputs_labels_for_multimodal
        from xtuner.utils import DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX
        prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
        prompt = prompt.replace('<image>', '')

        image = Image.open(image_path).convert('RGB')
        image = expand2square(
            image,
            tuple(int(x * 255) for x in self.image_processor.image_mean))
        image = self.image_processor.preprocess(
            image, return_tensors='pt')['pixel_values'][0]
        image = image.cuda().unsqueeze(0)
        visual_outputs = self.visual_encoder(image, output_hidden_states=True)
        pixel_values = self.projector(
            visual_outputs.hidden_states[self.visual_select_layer][:, 1:])

        inputs = DEFAULT_IMAGE_TOKEN + '\n' + prompt

        if self.prompt_template:
            inputs = self.prompt_template['INSTRUCTION'].format(input=inputs)

        chunk_encode = []
        for idx, chunk in enumerate(inputs.split(DEFAULT_IMAGE_TOKEN)):
            if idx == 0:
                cur_encode = self.tokenizer(chunk)
            else:
                cur_encode = self.tokenizer(chunk, add_special_tokens=False)
            chunk_encode.append(cur_encode)
        assert len(chunk_encode) == 2
        ids = []
        for idx, cur_chunk_encode in enumerate(chunk_encode):
            ids.extend(cur_chunk_encode['input_ids'])
            if idx != len(chunk_encode) - 1:
                ids.append(IMAGE_TOKEN_INDEX)
        ids = torch.tensor(ids).cuda().unsqueeze(0)
        mm_inputs = prepare_inputs_labels_for_multimodal(
            llm=self.llm, input_ids=ids, pixel_values=pixel_values)

        gen_config = self.build_gen_config(dataset)

        generate_output = self.llm.generate(
            **mm_inputs,
            generation_config=gen_config,
            streamer=None,
            bos_token_id=self.tokenizer.bos_token_id,
            stopping_criteria=self.stop_criteria)
        predict = self.tokenizer.decode(generate_output[0],
                                        skip_special_tokens=True).strip()
        return predict
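The token-splicing step in `generate_inner` above, which splits the prompt on `DEFAULT_IMAGE_TOKEN` and inserts `IMAGE_TOKEN_INDEX` between the encoded chunks, can be demonstrated without loading any model. Below is a minimal sketch under the assumption of a toy whitespace tokenizer; `toy_tokenize` and `interleave_image_token` are invented names and are not part of xtuner or this repository.

DEFAULT_IMAGE_TOKEN = '<image>'
IMAGE_TOKEN_INDEX = -200


def toy_tokenize(text):
    # Hypothetical stand-in for self.tokenizer: map each whitespace token to a fake id.
    return [abs(hash(tok)) % 1000 for tok in text.split()]


def interleave_image_token(prompt):
    ids = []
    chunks = prompt.split(DEFAULT_IMAGE_TOKEN)
    for idx, chunk in enumerate(chunks):
        ids.extend(toy_tokenize(chunk))
        if idx != len(chunks) - 1:
            # Placeholder id later swapped for projected visual features.
            ids.append(IMAGE_TOKEN_INDEX)
    return ids


if __name__ == '__main__':
    print(interleave_image_token(DEFAULT_IMAGE_TOKEN + '\nDescribe the picture.'))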
VLMEvalKit/vlmeval/vlm/mantis.py
0 → 100644
View file @
bc5ebf0f
import torch
from PIL import Image
from abc import abstractproperty
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE
import warnings


class Mantis(BaseModel):
    """
    Mantis Model
    This implementation is adapted from the Llava model from llava.py and the Idefics model from idefics.py
    """

    INSTALL_REQ = True
    INTERLEAVE = True

    DEFAULT_IMAGE_TOKEN = '<image>'
    IMAGE_TOKEN_INDEX = -200

    def __init__(self, model_path='TIGER-Lab/Mantis-8B-siglip-llama3', **kwargs):
        assert model_path is not None
        try:
            from mantis.models.mllava import LlavaForConditionalGeneration, MLlavaProcessor
            from mantis.models.mfuyu import MFuyuForCausalLM, MFuyuProcessor
            from mantis.models.conversation import conv_mllava_v1 as default_conv, conv_templates
        except Exception as e:
            logging.critical(
                "Mantis is not installed. Please install Mantis to use this model. Please use 'pip install "
                "git+https://github.com/TIGER-AI-Lab/Mantis.git' to install")
            raise e
        try:
            from transformers import AutoModelForVision2Seq, AutoProcessor
        except Exception as e:
            logging.critical(f'{type(e)}: {e}')
            logging.critical("Upgrade transformers to use Mantis's idefics model.\nError: %s" % e)

        # inference implementation for attention, can be "sdpa", "eager", "flash_attention_2".
        # Seems FA2 is not effective during inference:
        # https://discuss.huggingface.co/t/flash-attention-has-no-effect-on-inference/73453/5
        # if is_flash_attn_2_available:
        #     best_fit_attn_implementation = "flash_attention_2"
        # flash_attn has a bug that says: ERROR Error query and key must have the same dtype in generating
        try:
            import flash_attn
            best_fit_attn_implementation = 'flash_attention_2'
        except ImportError:
            best_fit_attn_implementation = 'eager'

        self.model_path = model_path
        attn_implementation = best_fit_attn_implementation
        self._is_idefics = 'idefics' in model_path.lower()
        # Here load the "non-idefics" Mantis model.
        if not self._is_idefics:
            if 'fuyu' in model_path.lower():
                self.processor = MFuyuProcessor.from_pretrained(self.model_path)
                model = MFuyuForCausalLM.from_pretrained(
                    self.model_path,
                    device_map='cuda',
                    attn_implementation=attn_implementation,
                    torch_dtype=torch.float16)
            else:
                self.processor = MLlavaProcessor.from_pretrained(self.model_path)
                model = LlavaForConditionalGeneration.from_pretrained(
                    self.model_path,
                    device_map='cuda',
                    attn_implementation=attn_implementation,
                    torch_dtype=torch.float16)
        else:
            self.processor = AutoProcessor.from_pretrained(self.model_path)
            model = AutoModelForVision2Seq.from_pretrained(
                self.model_path,
                device_map='cuda',
                torch_dtype=torch.float16)
        model = model.eval()
        self.model = model.cuda()
        kwargs_default = dict(do_sample=False, temperature=0, max_new_tokens=1024, top_p=None, num_beams=1)
        kwargs_default.update(kwargs)
        self.kwargs = kwargs_default
        warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
        self.tokenizer = self.processor.tokenizer
        self.default_conv = default_conv
        self.conv_templates = conv_templates

    def use_custom_prompt(self, dataset):
        assert dataset is not None
        if DATASET_TYPE(dataset) == 'MCQ':
            return True
        return False

    def build_prompt(self, line, dataset=None):
        assert self.use_custom_prompt(dataset)
        assert dataset is None or isinstance(dataset, str)
        tgt_path = self.dump_image(line, dataset)

        question = line['question']
        hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
        if hint is not None:
            question = hint + '\n' + question

        options = {
            cand: line[cand]
            for cand in string.ascii_uppercase
            if cand in line and not pd.isna(line[cand])
        }
        for key, item in options.items():
            question += f'\n{key}. {item}'
        prompt = question

        if len(options):
            prompt += (
                '\n请直接回答选项字母。' if cn_string(prompt) else
                "\nAnswer with the option's letter from the given choices directly."
            )
        else:
            prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.'
        message = [dict(type='image', value=s) for s in tgt_path]
        message.append(dict(type='text', value=prompt))
        return message

    def output_process(self, answer):
        if '<s>' in answer:
            answer = answer.replace('<s>', '').strip()
        if '[/INST]' in answer:
            answer = answer.split('[/INST]')[1].strip()
        elif 'ASSISTANT:' in answer:
            answer = answer.split('ASSISTANT:')[1].strip()
        elif 'assistant\n' in answer:
            answer = answer.split('assistant\n')[1].strip()
        elif '<|end_header_id|>\n\n' in answer:
            answer = answer.split('<|end_header_id|>\n\n')[2].strip()

        if '</s>' in answer:
            answer = answer.split('</s>')[0].strip()
        elif '<|im_end|>' in answer:
            answer = answer.split('<|im_end|>')[0].strip()
        elif '<|eot_id|>' in answer:
            answer = answer.split('<|eot_id|>')[0].strip()
        elif '<end_of_utterance>' in answer:
            answer = answer.split('<end_of_utterance>')[0].strip()
        elif '|ENDOFTEXT|' in answer:
            answer = answer.split('|ENDOFTEXT|')[0].strip()
        return answer

    def generate_inner(self, message, dataset=None):
        content, images = '', []
        ide_content, question = [], ''
        for msg in message:
            if msg['type'] == 'text':
                content += msg['value']
                question += msg['value']
            else:
                images.append(Image.open(msg['value']).convert('RGB'))
                content += (self.DEFAULT_IMAGE_TOKEN + '\n')
                ide_content.append({'type': 'image'})
        if self._is_idefics:
            # Follow the idefics implementation:
            ide_content.append({'type': 'text', 'text': question})
            prompt = [{'role': 'user', 'content': ide_content}]
            prompt = self.processor.apply_chat_template(prompt, add_generation_prompt=True)
        else:
            # Follow the Mantis code base to make sure they are consistent:
            # https://github.com/TIGER-AI-Lab/Mantis/blob/main/mantis/models/mllava/utils.py#L33
            # Users don't need to define chat template as it is done here
            if 'llama-3' in self.model.language_model.name_or_path.lower():
                conv = self.conv_templates['llama_3']
                terminators = [
                    self.processor.tokenizer.eos_token_id,
                    self.processor.tokenizer.convert_tokens_to_ids('<|eot_id|>')
                ]
            else:
                conv = self.default_conv
                terminators = [self.processor.tokenizer.eos_token_id]

            # Using EOT because end of *text* is more accurate for what we're doing than end of *sentence*
            if 'eos_token_id' not in self.kwargs:
                self.kwargs['eos_token_id'] = terminators

            conv = conv.copy()
            conv.append_message(conv.roles[0], content)
            conv.append_message(conv.roles[1], '')
            assert conv.messages[-1][0] == conv.roles[1] and conv.messages[-1][1] == '', 'Format check'
            prompt = conv.get_prompt()

        inputs = self.processor(prompt, images, return_tensors='pt', truncation=True)
        # FIXME: Fuyu model would return a list instead of a pytorch tensor. This weird behavior needs fixing.
        if 'image_patches' in inputs.keys():
            inputs['image_patches'] = inputs['image_patches'][0]
        inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
        output = self.model.generate(**inputs, **self.kwargs)
        output = output[0]
        generated_ids = output[inputs['input_ids'].shape[-1]:]
        answer = self.processor.decode(generated_ids, skip_special_token=True)
        answer = self.output_process(answer)
        return answer
VLMEvalKit/vlmeval/vlm/mgm.py
0 → 100644
View file @
bc5ebf0f
import sys
import torch
import os.path as osp
import os
import warnings
from .base import BaseModel
from ..smp import *
from PIL import Image

'''
Please follow the instructions to download ckpt.
https://github.com/dvlab-research/MGM?tab=readme-ov-file#pretrained-weights
'''


class Mini_Gemini(BaseModel):

    INSTALL_REQ = True
    INTERLEAVE = False

    def __init__(self, model_path, root=None, conv_mode='llava_v1', **kwargs):
        if root is None:
            warnings.warn('Please set `root` to Mini_Gemini code directory, \
which is cloned from here: "https://github.com/dvlab-research/MGM?tab=readme-ov-file" ')
            raise ValueError
        warnings.warn('Please follow the instructions of Mini_Gemini to put the ckpt file in the right place, \
which can be found at https://github.com/dvlab-research/MGM?tab=readme-ov-file#structure')
        assert model_path == 'YanweiLi/MGM-7B-HD', 'We only support MGM-7B-HD for now'
        self.model_path = model_path
        sys.path.append(root)
        try:
            from mgm.model.builder import load_pretrained_model
            from mgm.mm_utils import get_model_name_from_path
        except Exception as e:
            logging.critical(
                'Please first install Mini_Gemini and set the root path to use Mini_Gemini, '
                'which is cloned from here: "https://github.com/dvlab-research/MGM?tab=readme-ov-file" ')
            raise e

        VLMEvalKit_path = os.getcwd()
        os.chdir(root)
        warnings.warn('Please set `root` to Mini_Gemini code directory, \
which is cloned from here: "https://github.com/dvlab-research/MGM?tab=readme-ov-file" ')
        model_path = osp.join(root, 'work_dirs', 'MGM', 'MGM-7B-HD')
        try:
            model_name = get_model_name_from_path(model_path)
        except Exception as e:
            logging.critical(
                'Please follow the instructions of Mini_Gemini to put the ckpt file in the right place, '
                'which can be found at https://github.com/dvlab-research/MGM?tab=readme-ov-file#structure')
            raise e
        tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, None, model_name)
        os.chdir(VLMEvalKit_path)
        self.model = model
        self.tokenizer = tokenizer
        self.image_processor = image_processor
        self.conv_mode = conv_mode

        kwargs_default = dict(temperature=float(0), num_beams=1, top_p=None, max_new_tokens=1024, use_cache=True)
        kwargs_default.update(kwargs)
        do_sample = kwargs_default['temperature'] > 0
        kwargs_default.update({'do_sample': do_sample})
        self.kwargs = kwargs_default

    def generate_inner(self, message, dataset=None):
        try:
            from mgm.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, \
                DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
            from mgm.conversation import conv_templates
            from mgm.mm_utils import tokenizer_image_token, process_images
        except Exception as e:
            logging.critical(
                'Please first install Mini_Gemini and set the root path to use Mini_Gemini, '
                'which is cloned from here: "https://github.com/dvlab-research/MGM?tab=readme-ov-file" ')
            raise e
        prompt, image = self.message_to_promptimg(message, dataset=dataset)
        image = Image.open(image)
        prompt = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + prompt
        conv = conv_templates[self.conv_mode].copy()
        conv.append_message(conv.roles[0], prompt)
        conv.append_message(conv.roles[1], None)
        prompt = conv.get_prompt()

        input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt')
        input_ids = input_ids.unsqueeze(0).cuda()

        if hasattr(self.model.config, 'image_size_aux'):
            if not hasattr(self.image_processor, 'image_size_raw'):
                self.image_processor.image_size_raw = self.image_processor.crop_size.copy()
            self.image_processor.crop_size['height'] = self.model.config.image_size_aux
            self.image_processor.crop_size['width'] = self.model.config.image_size_aux
            self.image_processor.size['shortest_edge'] = self.model.config.image_size_aux

        image_tensor = process_images([image], self.image_processor, self.model.config)[0]
        image_grid = getattr(self.model.config, 'image_grid', 1)
        if hasattr(self.model.config, 'image_size_aux'):
            raw_shape = [
                self.image_processor.image_size_raw['height'] * image_grid,
                self.image_processor.image_size_raw['width'] * image_grid
            ]
            image_tensor_aux = image_tensor
            image_tensor = torch.nn.functional.interpolate(
                image_tensor[None],
                size=raw_shape,
                mode='bilinear',
                align_corners=False)[0]
        else:
            image_tensor_aux = []

        if image_grid >= 2:
            raw_image = image_tensor.reshape(
                3, image_grid, self.image_processor.image_size_raw['height'],
                image_grid, self.image_processor.image_size_raw['width'])
            raw_image = raw_image.permute(1, 3, 0, 2, 4)
            raw_image = raw_image.reshape(
                -1, 3, self.image_processor.image_size_raw['height'],
                self.image_processor.image_size_raw['width'])

            if getattr(self.model.config, 'image_global', False):
                global_image = image_tensor
                if len(global_image.shape) == 3:
                    global_image = global_image[None]
                global_image = torch.nn.functional.interpolate(
                    global_image,
                    size=[self.image_processor.image_size_raw['height'],
                          self.image_processor.image_size_raw['width']],
                    mode='bilinear',
                    align_corners=False)
                # [image_crops, image_global]
                raw_image = torch.cat([raw_image, global_image], dim=0)
            image_tensor = raw_image.contiguous()

        images = image_tensor[None].to(dtype=self.model.dtype, device='cuda', non_blocking=True)
        if len(image_tensor_aux) > 0:
            images_aux = image_tensor_aux[None].to(dtype=self.model.dtype, device='cuda', non_blocking=True)
        else:
            images_aux = None

        with torch.inference_mode():
            output_ids = self.model.generate(
                input_ids,
                images=images,
                images_aux=images_aux,
                # no_repeat_ngram_size=3,
                bos_token_id=self.tokenizer.bos_token_id,  # Begin of sequence token
                eos_token_id=self.tokenizer.eos_token_id,  # End of sequence token
                pad_token_id=self.tokenizer.pad_token_id,  # Pad token
                **self.kwargs)

        outputs = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
        return outputs
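The reshape/permute sequence in `generate_inner` above converts one high-resolution image of shape (3, grid*H, grid*W) into an image_grid x image_grid batch of crops. Below is a minimal sketch of just that layout, assuming a 2x2 grid with toy 4x4 crops; it only illustrates the tensor manipulation and is not part of this commit.

import torch

grid, H, W = 2, 4, 4
image = torch.arange(3 * grid * H * grid * W, dtype=torch.float32).reshape(3, grid * H, grid * W)

crops = image.reshape(3, grid, H, grid, W)      # split both spatial axes into (grid, crop_size)
crops = crops.permute(1, 3, 0, 2, 4)            # -> (grid_rows, grid_cols, 3, H, W)
crops = crops.reshape(-1, 3, H, W)              # -> (grid*grid, 3, H, W)

assert crops.shape == (grid * grid, 3, H, W)
# The first crop is the top-left quarter of the original image.
assert torch.equal(crops[0, 0, 0], image[0, 0, :W])

In the method above, the optional global view (`image_global`) is then downsampled to the same crop size and concatenated onto this crop batch before generation.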
VLMEvalKit/vlmeval/vlm/minicpm_v.py
0 → 100644
View file @
bc5ebf0f
import math
import torch
import random
import numpy as np
from PIL import Image
from transformers import AutoModel, AutoTokenizer

from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE, DATASET_MODALITY


class MiniCPM_V(BaseModel):

    INSTALL_REQ = False
    INTERLEAVE = False

    def __init__(self, model_path='openbmb/MiniCPM-V', **kwargs):
        assert model_path is not None
        self.model_path = model_path
        print(f'load from {self.model_path}')
        self.model = AutoModel.from_pretrained(self.model_path, trust_remote_code=True)
        self.model = self.model.to(dtype=torch.bfloat16)
        self.model.eval().cuda()
        self.kwargs = kwargs
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True)
        torch.cuda.empty_cache()
        self.num_beams = 1 if self.model_path == 'openbmb/MiniCPM-V' else 3

    def use_custom_prompt(self, dataset):
        assert dataset is not None
        if listinstr(['MMDU', 'MME-RealWorld', 'MME-RealWorld-CN'], dataset):
            # For Multi-Turn we don't have custom prompt
            return False
        return False

    def build_prompt(self, line, dataset=None):
        assert dataset is None or isinstance(dataset, str)
        assert self.use_custom_prompt(dataset)
        tgt_path = self.dump_image(line, dataset)

        question = line['question']
        options = {
            cand: line[cand]
            for cand in string.ascii_uppercase
            if cand in line and not pd.isna(line[cand])
        }
        options_prompt = 'Options:\n'
        for key, item in options.items():
            options_prompt += f'{key}. {item}\n'
        hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
        prompt = ''
        if hint is not None:
            prompt += f'Hint: {hint}\n'
        prompt += f'{question}\n'
        if len(options):
            prompt += options_prompt
            prompt = 'Study the image carefully and pick the option associated with the correct answer. \
Focus solely on selecting the option and avoid including any other content.\n' + prompt

        message = [dict(type='text', value=prompt)]
        message.extend([dict(type='image', value=p) for p in tgt_path])
        return message

    def generate_inner(self, message, dataset=None):
        prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
        image = Image.open(image_path).convert('RGB')
        msgs = [{'role': 'user', 'content': prompt}]
        if DATASET_TYPE(dataset) == 'MCQ':
            max_new_tokens = 20
        elif DATASET_TYPE(dataset) == 'Y/N':
            max_new_tokens = 100
        else:
            max_new_tokens = 1024

        default_kwargs = dict(
            max_new_tokens=max_new_tokens,
            sampling=False,
            num_beams=self.num_beams)

        default_kwargs.update(self.kwargs)
        res, _, _ = self.model.chat(
            image=image,
            msgs=msgs,
            context=None,
            tokenizer=self.tokenizer,
            **default_kwargs)
        return res


class MiniCPM_Llama3_V(BaseModel):

    INSTALL_REQ = False
    INTERLEAVE = True

    def __init__(self, model_path='openbmb/MiniCPM-Llama3-V-2_5', **kwargs):
        assert model_path is not None
        self.model_path = model_path
        print(f'load from {self.model_path}')
        self.model = AutoModel.from_pretrained(self.model_path, trust_remote_code=True)
        self.model = self.model.to(dtype=torch.float16)
        self.model.eval().cuda()
        self.kwargs = kwargs
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True)
        torch.cuda.empty_cache()
        self.num_beams = 1 if self.model_path == 'openbmb/MiniCPM-V' else 3
        self.options_system_prompt = ('Carefully read the following question and select the letter corresponding '
                                      'to the correct answer. Highlight the applicable choices without giving '
                                      'explanations.')
        self.wo_options_system_prompt = 'Carefully read the following question Answer the question directly.'
        self.detail_system_prompt = 'Answer this question in detail.'
        self.vqa_prompt = 'Answer the question using a single word or phrase.'

    def use_custom_prompt(self, dataset):
        if listinstr(['MCQ', 'VQA'], DATASET_TYPE(dataset)):
            return True
        elif dataset is not None and listinstr(['HallusionBench'], dataset):
            return True
        return False

    def build_prompt(self, line, dataset=None):
        if isinstance(line, int):
            line = self.data.iloc[line]

        tgt_path = self.dump_image(line, dataset)
        system_prompt = ''

        question = line['question']
        if DATASET_TYPE(dataset) == 'MCQ':
            options = {
                cand: line[cand]
                for cand in string.ascii_uppercase
                if cand in line and not pd.isna(line[cand])
            }
            options_prompt = 'Options:\n'
            for key, item in options.items():
                options_prompt += f'{key}. {item}\n'
            hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
            prompt = ''
            if hint is not None:
                prompt += f'Hint: {hint}\n'
            prompt += f'Question: {question}\n'
            if len(options):
                prompt += options_prompt
                system_prompt = self.options_system_prompt + '\nPlease just indicate your choice.'
            else:
                system_prompt = self.wo_options_system_prompt
            if 'MMMU' in dataset:  # Corner Case
                prompt = system_prompt + '\n' + prompt
                system_prompt = ''
        elif dataset is not None and listinstr(['HallusionBench'], dataset):
            question = line['question'] + ' Yes or No?'
            prompt = question
        elif dataset is not None and listinstr(['MME'], dataset):
            question = line['question'] + ' Yes or No?'
            prompt = question
        elif dataset is not None and listinstr(['OCRBench'], dataset):
            system_prompt = self.vqa_prompt
            question = line['question']
            prompt = question
        elif DATASET_TYPE(dataset) == 'VQA':
            if listinstr(['LLaVABench', 'MMLongBench_DOC'], dataset):
                system_prompt = ''
                prompt = question
            elif listinstr(['MMVet'], dataset):
system_prompt
=
self
.
detail_system_prompt
prompt
=
question
else
:
system_prompt
=
self
.
vqa_prompt
prompt
=
question
msgs
=
[]
if
system_prompt
:
msgs
.
append
(
dict
(
type
=
'text'
,
value
=
system_prompt
))
if
isinstance
(
tgt_path
,
list
):
msgs
.
extend
([
dict
(
type
=
'image'
,
value
=
p
)
for
p
in
tgt_path
])
else
:
msgs
=
[
dict
(
type
=
'image'
,
value
=
tgt_path
)]
msgs
.
append
(
dict
(
type
=
'text'
,
value
=
prompt
))
return
msgs
def
generate_inner
(
self
,
message
,
dataset
=
None
):
if
DATASET_TYPE
(
dataset
)
==
'MCQ'
:
max_new_tokens
=
200
elif
DATASET_TYPE
(
dataset
)
==
'Y/N'
:
max_new_tokens
=
3
else
:
max_new_tokens
=
1024
default_kwargs
=
dict
(
max_new_tokens
=
max_new_tokens
,
sampling
=
False
,
num_beams
=
self
.
num_beams
,
)
default_kwargs
.
update
(
self
.
kwargs
)
content
=
[]
for
x
in
message
:
if
x
[
'type'
]
==
'text'
:
content
.
append
(
x
[
'value'
])
elif
x
[
'type'
]
==
'image'
:
image
=
Image
.
open
(
x
[
'value'
]).
convert
(
'RGB'
)
content
.
append
(
image
)
msgs
=
[{
'role'
:
'user'
,
'content'
:
content
}]
res
=
self
.
model
.
chat
(
msgs
=
msgs
,
context
=
None
,
image
=
None
,
tokenizer
=
self
.
tokenizer
,
**
default_kwargs
)
if
isinstance
(
res
,
tuple
)
and
len
(
res
)
>
0
:
res
=
res
[
0
]
return
res
def
chat_inner
(
self
,
message
,
dataset
=
None
):
max_new_tokens
=
1024
default_kwargs
=
dict
(
max_new_tokens
=
max_new_tokens
,
sampling
=
False
,
num_beams
=
self
.
num_beams
,
)
default_kwargs
.
update
(
self
.
kwargs
)
msgs
=
[]
for
msg
in
message
:
content
=
[]
if
len
(
msg
[
'content'
])
==
1
and
msg
[
'content'
][
0
][
'type'
]
==
'text'
:
msg_new
=
{
'role'
:
msg
[
'role'
],
'content'
:
msg
[
'content'
][
0
][
'value'
]}
msgs
.
append
(
msg_new
)
continue
for
x
in
msg
[
'content'
]:
if
x
[
'type'
]
==
'text'
:
content
.
append
(
x
[
'value'
])
elif
x
[
'type'
]
==
'image'
:
image
=
Image
.
open
(
x
[
'value'
]).
convert
(
'RGB'
)
content
.
append
(
image
)
msg_new
=
{
'role'
:
msg
[
'role'
],
'content'
:
content
}
msgs
.
append
(
msg_new
)
res
=
self
.
model
.
chat
(
msgs
=
msgs
,
context
=
None
,
image
=
None
,
tokenizer
=
self
.
tokenizer
,
**
default_kwargs
)
if
isinstance
(
res
,
tuple
)
and
len
(
res
)
>
0
:
res
=
res
[
0
]
return
res
class
MiniCPM_V_2_6
(
BaseModel
):
INSTALL_REQ
=
False
INTERLEAVE
=
True
def
__init__
(
self
,
model_path
=
'openbmb/MiniCPM-V'
,
**
kwargs
):
random
.
seed
(
0
)
np
.
random
.
seed
(
0
)
torch
.
manual_seed
(
0
)
torch
.
cuda
.
manual_seed_all
(
0
)
assert
model_path
is
not
None
self
.
model_path
=
model_path
print
(
f
'load from path
{
self
.
model_path
}
'
)
self
.
model
=
AutoModel
.
from_pretrained
(
self
.
model_path
,
trust_remote_code
=
True
)
self
.
model
=
self
.
model
.
to
(
dtype
=
torch
.
bfloat16
)
self
.
model
.
eval
().
cuda
()
self
.
kwargs
=
kwargs
self
.
tokenizer
=
AutoTokenizer
.
from_pretrained
(
self
.
model_path
,
trust_remote_code
=
True
)
torch
.
cuda
.
empty_cache
()
self
.
num_beams
=
1
if
self
.
model_path
==
'openbmb/MiniCPM-V'
else
3
self
.
options_suffix_prompt
=
'''
\n
Answer with the option's letter from the given choices directly.'''
self
.
wo_options_system_prompt
=
'Carefully read the following question Answer the question directly.'
self
.
detail_system_prompt
=
'Answer this question in detail.'
self
.
vqa_prompt
=
'Answer the question using a single word or phrase.'
self
.
multi_choice_cot_prompt
=
(
'''Carefully read the following multichoice question, solve it step '''
'''by step and finally pick the option associated with the correct '''
'''answer in the format of "Answer: selected option
\n\n
'''
)
self
.
short_ans_cot_prompt
=
(
'''Read the following question carefully, solve it step by step, and '''
'''then output the final answer in the format of "Answer: single number '''
'''or single word or phrase".
\n\n
'''
)
def
use_custom_prompt
(
self
,
dataset
=
None
):
if
dataset
is
None
:
return
False
if
DATASET_TYPE
(
dataset
)
in
[
'MCQ'
,
'VQA'
,
'Y/N'
]:
return
True
return
False
def
use_cot
(
self
,
dataset
=
None
):
if
dataset
is
None
:
return
False
if
listinstr
([
'MMMU'
,
'HallusionBench'
,
'OCRBench'
,
'ChartQA'
],
dataset
):
return
True
elif
listinstr
([
'MathVista'
,
'MMVet'
,
'MMBench'
,
'MMStar'
,
'AI2D'
,
'RealWorldQA'
,
'POPE'
,
'ScienceQA'
,
'TextVQA'
,
'DocVQA'
],
dataset
):
return
False
else
:
return
False
def
use_upsize
(
self
,
dataset
=
None
):
if
dataset
is
None
:
return
False
if
listinstr
([
'MMVet'
,
'MMBench'
,
'MMStar'
,
'AI2D'
,
'OCRBench'
],
dataset
):
return
True
else
:
return
False
def
build_prompt
(
self
,
line
,
dataset
=
None
):
if
isinstance
(
line
,
int
):
line
=
self
.
data
.
iloc
[
line
]
tgt_path
=
self
.
dump_image
(
line
,
dataset
)
system_prompt
,
prompt
=
''
,
''
question
=
line
[
'question'
]
if
not
self
.
use_cot
(
dataset
):
if
DATASET_TYPE
(
dataset
)
==
'MCQ'
:
options
=
{
cand
:
line
[
cand
]
for
cand
in
string
.
ascii_uppercase
if
cand
in
line
and
not
pd
.
isna
(
line
[
cand
])
}
options_prompt
=
'Options:
\n
'
for
key
,
item
in
options
.
items
():
options_prompt
+=
f
'
{
key
}
.
{
item
}
\n
'
hint
=
line
[
'hint'
]
if
(
'hint'
in
line
and
not
pd
.
isna
(
line
[
'hint'
]))
else
None
if
hint
is
not
None
:
prompt
+=
f
'Hint:
{
hint
}
\n
'
prompt
+=
f
'Question:
{
question
}
\n
'
if
len
(
options
):
prompt
+=
options_prompt
prompt
+=
self
.
options_suffix_prompt
else
:
system_prompt
=
self
.
wo_options_system_prompt
if
'MMMU'
in
dataset
:
if
len
(
system_prompt
)
>
0
:
prompt
=
system_prompt
+
'
\n
'
+
prompt
system_prompt
=
''
elif
dataset
is
not
None
and
listinstr
([
'HallusionBench'
],
dataset
):
question
+=
' Yes or No?'
prompt
=
question
elif
dataset
is
not
None
and
listinstr
([
'OCRBench'
],
dataset
):
system_prompt
=
self
.
vqa_prompt
prompt
=
question
elif
DATASET_TYPE
(
dataset
)
==
'VQA'
:
if
listinstr
([
'LLaVABench'
],
dataset
):
system_prompt
=
''
elif
listinstr
([
'MMVet'
],
dataset
):
system_prompt
=
self
.
detail_system_prompt
else
:
system_prompt
=
self
.
vqa_prompt
prompt
=
question
else
:
prompt
=
question
else
:
has_options
=
True
if
DATASET_TYPE
(
dataset
)
==
'MCQ'
:
options
=
{
cand
:
line
[
cand
]
for
cand
in
string
.
ascii_uppercase
if
cand
in
line
and
not
pd
.
isna
(
line
[
cand
])
}
options_prompt
=
''
for
key
,
item
in
options
.
items
():
options_prompt
+=
f
'
{
key
}
.
{
item
}
\n
'
hint
=
line
[
'hint'
]
if
(
'hint'
in
line
and
not
pd
.
isna
(
line
[
'hint'
]))
else
None
if
hint
is
not
None
:
prompt
+=
f
'Hint:
{
hint
}
\n
'
prompt
+=
f
'
{
question
}
\n
'
if
len
(
options
):
prompt
+=
options_prompt
else
:
has_options
=
False
if
'MMMU'
in
dataset
:
if
len
(
system_prompt
)
>
0
:
prompt
=
system_prompt
+
'
\n
'
+
prompt
system_prompt
=
''
else
:
prompt
=
question
if
DATASET_TYPE
(
dataset
)
in
[
'MCQ'
,
'Y/N'
,
'VQA'
]:
if
DATASET_TYPE
(
dataset
)
==
'MCQ'
:
if
has_options
:
prompt
=
self
.
multi_choice_cot_prompt
+
prompt
else
:
prompt
=
self
.
short_ans_cot_prompt
+
prompt
elif
DATASET_TYPE
(
dataset
)
==
'Y/N'
:
prompt
=
self
.
short_ans_cot_prompt
+
prompt
else
:
prompt
=
self
.
short_ans_cot_prompt
+
prompt
msgs
=
[]
if
system_prompt
:
msgs
.
append
(
dict
(
type
=
'text'
,
value
=
system_prompt
))
if
isinstance
(
tgt_path
,
list
):
msgs
.
extend
([
dict
(
type
=
'image'
,
value
=
p
)
for
p
in
tgt_path
])
else
:
msgs
=
[
dict
(
type
=
'image'
,
value
=
tgt_path
)]
msgs
.
append
(
dict
(
type
=
'text'
,
value
=
prompt
))
return
msgs
def
generate_inner
(
self
,
message
,
dataset
=
None
):
if
DATASET_MODALITY
(
dataset
)
==
'VIDEO'
:
max_slice_nums
=
1
use_image_id
=
False
max_inp_length
=
2048
*
10
else
:
max_slice_nums
=
None
use_image_id
=
True
max_inp_length
=
8192
max_new_tokens
=
2048
default_kwargs
=
dict
(
max_new_tokens
=
max_new_tokens
,
sampling
=
False
,
num_beams
=
self
.
num_beams
,
)
default_kwargs
.
update
(
self
.
kwargs
)
content
=
[]
for
x
in
message
:
if
x
[
'type'
]
==
'text'
:
content
.
append
(
x
[
'value'
])
elif
x
[
'type'
]
==
'image'
:
image
=
Image
.
open
(
x
[
'value'
]).
convert
(
'RGB'
)
if
not
self
.
use_upsize
(
dataset
):
content
.
append
(
image
)
else
:
img_width
,
img_height
=
image
.
width
,
image
.
height
if
(
img_width
*
img_height
)
>=
(
1344
*
1344
):
content
.
append
(
image
)
else
:
ratio
=
math
.
sqrt
((
1344
*
1344
)
/
(
img_width
*
img_height
))
max_img_width
=
int
(
img_width
*
ratio
)
new_img_width
=
random
.
randint
(
img_width
,
max_img_width
)
new_img_height
=
int
(
new_img_width
/
img_width
*
img_height
)
resized_image
=
image
.
resize
((
new_img_width
,
new_img_height
))
content
.
append
(
resized_image
)
msgs
=
[{
'role'
:
'user'
,
'content'
:
content
}]
res
=
self
.
model
.
chat
(
image
=
None
,
msgs
=
msgs
,
context
=
None
,
tokenizer
=
self
.
tokenizer
,
max_inp_length
=
max_inp_length
,
use_image_id
=
use_image_id
,
max_slice_nums
=
max_slice_nums
,
**
default_kwargs
)
if
isinstance
(
res
,
tuple
)
and
len
(
res
)
>
0
:
res
=
res
[
0
]
return
res
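For reference, the `message` these wrappers consume is a flat list of typed parts, as assembled in `build_prompt`. A hypothetical call (the checkpoint name and image path below are placeholders, not values from this file):

# Hedged usage sketch of the message format expected by generate_inner.
model = MiniCPM_V_2_6(model_path='openbmb/MiniCPM-V-2_6')
message = [
    dict(type='image', value='demo.jpg'),
    dict(type='text', value='Describe the chart in one sentence.'),
]
print(model.generate_inner(message, dataset=None))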
VLMEvalKit/vlmeval/vlm/minigpt4.py
0 → 100644
View file @
bc5ebf0f
import torch
import sys
import os.path as osp
import warnings
from transformers import StoppingCriteriaList
from .base import BaseModel


class MiniGPT4(BaseModel):

    INSTALL_REQ = True
    INTERLEAVE = False

    def __init__(self,
                 mode='v2',
                 root='/mnt/petrelfs/share_data/duanhaodong/MiniGPT-4/',
                 temperature=1,
                 max_out_len=512):

        if root is None:
            warnings.warn(
                'Please set root to the directory of MiniGPT-4, which is cloned from here: '
                'https://github.com/Vision-CAIR/MiniGPT-4. '
            )

        if mode == 'v2':
            cfg = 'minigptv2_eval.yaml'
        elif mode == 'v1_7b':
            cfg = 'minigpt4_7b_eval.yaml'
        elif mode == 'v1_13b':
            cfg = 'minigpt4_13b_eval.yaml'
        else:
            raise NotImplementedError

        self.mode = mode
        self.temperature = temperature
        self.max_out_len = max_out_len
        self.root = root
        this_dir = osp.dirname(__file__)

        self.cfg = osp.join(this_dir, 'misc', cfg)
        sys.path.append(self.root)

        from omegaconf import OmegaConf
        from minigpt4.common.registry import registry
        from minigpt4.conversation.conversation import StoppingCriteriaSub, CONV_VISION_Vicuna0, CONV_VISION_minigptv2

        device = torch.cuda.current_device()
        self.device = device

        cfg_path = self.cfg
        cfg = OmegaConf.load(cfg_path)

        model_cfg = cfg.model
        model_cfg.device_8bit = device
        model_cls = registry.get_model_class(model_cfg.arch)
        model = model_cls.from_config(model_cfg)
        model = model.to(device)
        model.eval()
        vis_processor_cfg = cfg.datasets.cc_sbu_align.vis_processor.train
        vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg)
        self.model = model
        self.vis_processor = vis_processor

        self.CONV_VISION = CONV_VISION_minigptv2 if self.mode == 'v2' else CONV_VISION_Vicuna0
        stop_words_ids = [[835], [2277, 29937]]
        stop_words_ids = [torch.tensor(ids).to(device) for ids in stop_words_ids]
        self.stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])

    def generate_inner(self, message, dataset=None):
        from minigpt4.conversation.conversation import Chat
        prompt, image_path = self.message_to_promptimg(message, dataset=dataset)

        if self.mode == 'v2':
            chat = Chat(self.model, self.vis_processor, device=self.device)
        else:
            chat = Chat(self.model, self.vis_processor, device=self.device, stopping_criteria=self.stopping_criteria)

        chat_state = self.CONV_VISION.copy()
        img_list = []
        _ = chat.upload_img(image_path, chat_state, img_list)
        chat.encode_img(img_list)
        chat.ask(prompt, chat_state)
        with torch.inference_mode():
            msg = chat.answer(conv=chat_state, img_list=img_list)[0]
        return msg
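The wrapper relies on MiniGPT-4's `StoppingCriteriaSub` to cut generation at the '###' separator. A rough standalone equivalent using only `transformers` primitives is sketched below; the stop-token ids are the ones hard-coded in the constructor above, but the class itself is a simplified stand-in, not the MiniGPT-4 implementation:

import torch
from transformers import StoppingCriteria, StoppingCriteriaList

class StopOnSequences(StoppingCriteria):
    """Stop generation once any of the given id sequences ends the output."""

    def __init__(self, stops):
        super().__init__()
        self.stops = stops

    def __call__(self, input_ids, scores, **kwargs):
        for stop in self.stops:
            if input_ids.shape[1] >= len(stop) and torch.equal(input_ids[0, -len(stop):], stop):
                return True
        return False

stop_words_ids = [torch.tensor([835]), torch.tensor([2277, 29937])]  # ids used by the wrapper above
stopping_criteria = StoppingCriteriaList([StopOnSequences(stop_words_ids)])
# stopping_criteria can then be passed to model.generate(..., stopping_criteria=stopping_criteria)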
VLMEvalKit/vlmeval/vlm/minimonkey.py
0 → 100644
View file @
bc5ebf0f
import torch
from transformers import AutoTokenizer, AutoConfig, AutoModel, CLIPImageProcessor
import warnings
from PIL import Image
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE
import pandas as pd
import string
import torch.distributed as dist
import torchvision.transforms as T
import transformers
from torchvision.transforms.functional import InterpolationMode
import re

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)


def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform


def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    best_ratio_diff = float('inf')
    best_ratio = (1, 1)
    area = width * height
    for ratio in target_ratios:
        target_aspect_ratio = ratio[0] / ratio[1]
        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
        if ratio_diff < best_ratio_diff:
            best_ratio_diff = ratio_diff
            best_ratio = ratio
        elif ratio_diff == best_ratio_diff:
            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                best_ratio = ratio
    return best_ratio


def dynamic_preprocess(image, min_num=5, max_num=6, image_size=448, use_thumbnail=False):
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    # calculate the existing image aspect ratio
    target_ratios = set(
        (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1)
        if i * j <= max_num and i * j >= min_num)
    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

    # find the closest aspect ratio to the target
    target_aspect_ratio = find_closest_aspect_ratio(
        aspect_ratio, target_ratios, orig_width, orig_height, image_size)

    # calculate the target width and height
    target_width = image_size * target_aspect_ratio[0]
    target_height = image_size * target_aspect_ratio[1]
    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

    # resize the image
    resized_img = image.resize((target_width, target_height))
    processed_images = []
    for i in range(blocks):
        box = (
            (i % (target_width // image_size)) * image_size,
            (i // (target_width // image_size)) * image_size,
            ((i % (target_width // image_size)) + 1) * image_size,
            ((i // (target_width // image_size)) + 1) * image_size
        )
        # split the image
        split_img = resized_img.crop(box)
        processed_images.append(split_img)
    assert len(processed_images) == blocks
    if use_thumbnail and len(processed_images) != 1:
        thumbnail_img = image.resize((image_size, image_size))
        processed_images.append(thumbnail_img)
    return processed_images, target_aspect_ratio


def dynamic_preprocess2(image, min_num=1, max_num=6, image_size=448, use_thumbnail=False, prior_aspect_ratio=None):
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    # calculate the existing image aspect ratio
    target_ratios = set(
        (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1)
        if i * j <= max_num and i * j >= min_num)
    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

    new_target_ratios = []
    if prior_aspect_ratio is not None:
        for i in target_ratios:
            if prior_aspect_ratio[0] % i[0] != 0 or prior_aspect_ratio[1] % i[1] != 0:
                new_target_ratios.append(i)
            else:
                continue

    # find the closest aspect ratio to the target
    target_aspect_ratio = find_closest_aspect_ratio(
        aspect_ratio, new_target_ratios, orig_width, orig_height, image_size)

    # calculate the target width and height
    target_width = image_size * target_aspect_ratio[0]
    target_height = image_size * target_aspect_ratio[1]
    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

    # resize the image
    resized_img = image.resize((target_width, target_height))
    processed_images = []
    for i in range(blocks):
        box = (
            (i % (target_width // image_size)) * image_size,
            (i // (target_width // image_size)) * image_size,
            ((i % (target_width // image_size)) + 1) * image_size,
            ((i // (target_width // image_size)) + 1) * image_size
        )
        # split the image
        split_img = resized_img.crop(box)
        processed_images.append(split_img)
    assert len(processed_images) == blocks
    if use_thumbnail and len(processed_images) != 1:
        thumbnail_img = image.resize((image_size, image_size))
        processed_images.append(thumbnail_img)
    return processed_images


def load_image(image_file, input_size=448, min_num=1, max_num=6):
    image = Image.open(image_file).convert('RGB')
    transform = build_transform(input_size=input_size)
    images, target_aspect_ratio = dynamic_preprocess(
        image, image_size=input_size, use_thumbnail=True, min_num=min_num, max_num=max_num)
    pixel_values = [transform(image) for image in images]
    pixel_values = torch.stack(pixel_values)
    return pixel_values, target_aspect_ratio


def load_image2(image_file, input_size=448, target_aspect_ratio=(1, 1), min_num=1, max_num=6):
    image = Image.open(image_file).convert('RGB')
    transform = build_transform(input_size=input_size)
    images = dynamic_preprocess2(
        image, image_size=input_size, prior_aspect_ratio=target_aspect_ratio,
        use_thumbnail=True, min_num=min_num, max_num=max_num)
    pixel_values = [transform(image) for image in images]
    pixel_values = torch.stack(pixel_values)
    return pixel_values


# This function is used to split InternVL2-Llama3-76B
def split_model(model_name):
    import math
    device_map = {}
    num_gpus = torch.cuda.device_count()
    rank, world_size = get_rank_and_world_size()
    num_gpus = num_gpus // world_size

    num_layers = {'InternVL2-8B': 32, 'InternVL2-26B': 48,
                  'InternVL2-40B': 60, 'InternVL2-Llama3-76B': 80}[model_name]
    # Since the first GPU will be used for ViT, treat it as 0.8 GPU.
    num_layers_per_gpu = math.ceil(num_layers / (num_gpus - 0.2))
    num_layers_per_gpu = [num_layers_per_gpu] * num_gpus
    num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.8)
    layer_cnt = 0
    for i, num_layer in enumerate(num_layers_per_gpu):
        for j in range(num_layer):
            device_map[f'language_model.model.layers.{layer_cnt}'] = rank + world_size * i
            layer_cnt += 1
    device_map['vision_model'] = rank
    device_map['mlp1'] = rank
    device_map['language_model.model.tok_embeddings'] = rank
    device_map['language_model.model.embed_tokens'] = rank
    device_map['language_model.output'] = rank
    device_map['language_model.model.norm'] = rank
    device_map['language_model.lm_head'] = rank
    device_map[f'language_model.model.layers.{num_layers - 1}'] = rank
    return device_map


# To revert changes
class MiniMonkey(BaseModel):

    INSTALL_REQ = False
    INTERLEAVE = True

    def __init__(self, model_path='mx262/MiniMokney', load_in_8bit=False, **kwargs):
        assert model_path is not None
        assert version_cmp(transformers.__version__, '4.36.2', 'ge')
        self.model_path = model_path
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=False)

        # Regular expression to match the pattern 'Image' followed by a number, e.g. Image1
        self.pattern = r'Image(\d+)'
        # Replacement pattern to insert a hyphen between 'Image' and the number, e.g. Image-1
        self.replacement = r'Image-\1'

        # Convert InternVL2 response to dataset format
        # e.g. Image1 -> Image-1
        # Regular expression to match the pattern 'Image-' followed by a number
        self.reverse_pattern = r'Image-(\d+)'
        # Replacement pattern to remove the hyphen (Image-1 -> Image1)
        self.reverse_replacement = r'Image\1'

        if listinstr(['InternVL2-Llama3-76B'], model_path):
            device_map = split_model(model_path.split('/')[-1])
            self.model = AutoModel.from_pretrained(
                model_path,
                torch_dtype=torch.bfloat16,
                load_in_8bit=load_in_8bit,
                trust_remote_code=True,
                low_cpu_mem_usage=True,
                device_map=device_map).eval()
        else:
            device = torch.cuda.current_device()
            self.device = device
            self.model = AutoModel.from_pretrained(
                model_path, torch_dtype=torch.bfloat16,
                trust_remote_code=True,
                load_in_8bit=load_in_8bit).eval()
            if not load_in_8bit:
                self.model = self.model.to(device)
        self.image_size = self.model.config.vision_config.image_size

        self.kwargs = kwargs
        warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')

    def use_custom_prompt(self, dataset):
        if dataset is None:
            return False
        if listinstr(['MMDU'], dataset):
            # For Multi-Turn we don't have custom prompt
            return False
        else:
            return True

    def build_multi_choice_prompt(self, line, dataset=None):
        question = line['question']
        hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
        if hint is not None:
            question = hint + '\n' + question

        options = {
            cand: line[cand]
            for cand in string.ascii_uppercase
            if cand in line and not pd.isna(line[cand])
        }
        for key, item in options.items():
            question += f'\n{key}. {item}'
        prompt = question

        if len(options):
            prompt += '\n请直接回答选项字母。' if cn_string(
                prompt) else "\nAnswer with the option's letter from the given choices directly."
        else:
            prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.'

        return prompt

    def build_video_prompt(self, prompt, dataset=None, max_nframe=64):
        for start in range(0, max_nframe, 8):
            images_to_remove = ''.join([f'<image-{i}>' for i in range(start + 1, start + 9)])
            prompt = prompt.replace(images_to_remove, '')
        for i in range(max_nframe):
            prompt = prompt.replace(f'<image-{i + 1}>', f'Frame{i + 1}')
        if listinstr(['MMBench-Video'], dataset):
            prompt = prompt.replace('\nAnswer:', '')
            prompt += '\nAnswer the question using a single word or phrase.'
        elif listinstr(['Video-MME'], dataset):
            prompt = prompt.replace('\nAnswer:', '')
            prompt += "\nAnswer with the option's letter from the given choices directly."
        return prompt

    def build_prompt(self, line, dataset=None):
        assert self.use_custom_prompt(dataset)
        assert dataset is None or isinstance(dataset, str)
        tgt_path = self.dump_image(line, dataset)

        kwargs_default = dict(do_sample=False, max_new_tokens=512, top_p=None, num_beams=1)
        self.kwargs = kwargs_default

        if dataset is not None and listinstr(['MME'], dataset):
            question = line['question']
            prompt = question + ' Answer the question using a single word or phrase.'
        elif dataset is not None and listinstr(['HallusionBench'], dataset):
            question = line['question']
            prompt = question + ' Please answer yes or no. Answer the question using a single word or phrase.'
        elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ':
            prompt = self.build_multi_choice_prompt(line, dataset)
        elif dataset is not None and DATASET_TYPE(dataset) == 'VQA':
            if listinstr(['MathVista', 'MathVision'], dataset):
                prompt = line['question']
            elif listinstr(['LLaVABench'], dataset):
                question = line['question']
                prompt = question + '\nAnswer this question in detail.'
            elif listinstr(['MMVet'], dataset):
                prompt = line['question']
            else:
                question = line['question']
                prompt = question + '\nAnswer the question using a single word or phrase.'
        else:
            prompt = line['question']

        message = [dict(type='text', value=prompt)]
        message.extend([dict(type='image', value=s) for s in tgt_path])
        return message

    def set_max_num(self, dataset):
        if dataset is None:
            self.max_num = 12
            self.max_num2 = 7
            self.min_num = 4
            self.min_num2 = 3
            return
        if dataset is not None and listinstr(['ChartQA_TEST'], dataset):
            self.max_num = 12
            self.max_num2 = 3
        elif dataset is not None and listinstr(['DocVQA_VAL', 'DocVQA_TEST', 'TextVQA_VAL'], dataset):
            self.max_num = 23
            self.max_num2 = 15
            self.min_num = 14
            self.min_num2 = 5
        elif dataset is not None and listinstr(['InfoVQA_VAL', 'InfoVQA_TEST', 'SEEDBench_IMG'], dataset):
            self.max_num = 23
            self.max_num2 = 5
            self.min_num = 15
            self.min_num2 = 3
        elif dataset is not None and listinstr(['OCRBench', 'POPE'], dataset):
            self.max_num = 24
            self.max_num2 = 8
            self.min_num = 9
            self.min_num2 = 5
        elif dataset is not None and listinstr(['HallusionBench'], dataset):
            self.max_num = 11
            self.max_num2 = 6
            self.min_num = 4
            self.min_num2 = 2
        elif dataset is not None and listinstr(['MME'], dataset):
            self.max_num = 11
            self.max_num2 = 6
            self.min_num = 5
            self.min_num2 = 2
        elif dataset is not None and listinstr(['AI2D_TEST'], dataset):
            self.max_num = 12
            self.max_num2 = 6
            self.min_num = 5
            self.min_num2 = 2
        elif dataset is not None and listinstr(['CCBench'], dataset):
            self.max_num = 24
            self.max_num2 = 8
            self.min_num = 9
            self.min_num2 = 4
        elif dataset is not None and listinstr(['MMMU_DEV_VAL'], dataset):
            self.max_num = 12
            self.max_num2 = 7
            self.min_num = 5
            self.min_num2 = 3
        else:
            self.max_num = 12
            self.max_num2 = 7
            self.min_num = 4
            self.min_num2 = 3

    def generate_v2(self, message, dataset=None):
        image_num = len([x for x in message if x['type'] == 'image'])
        if image_num == 1:
            prompt = '<image>\n' + '\n'.join([x['value'] for x in message if x['type'] == 'text'])
        else:
            prompt, image_idx = '', 1
            for x in message:
                if x['type'] == 'text':
                    prompt += x['value']
                elif x['type'] == 'image':
                    prompt += f'<image-{image_idx}>'
                    image_idx += 1
            prompt = ' '.join([f'<image-{i + 1}>: <image>' for i in range(image_num)]) + '\n' + prompt

        if dataset is not None and listinstr(['Video'], dataset):
            prompt = self.build_video_prompt(prompt, dataset)

        if image_num > 1:
            image_path = [x['value'] for x in message if x['type'] == 'image']
            num_patches_list = []
            pixel_values_list = []
            for image_idx, file_name in enumerate(image_path):
                curr_pixel_values, target_aspect_ratio = load_image(
                    file_name, min_num=self.min_num, max_num=self.max_num)
                curr_pixel_values = curr_pixel_values.cuda().to(torch.bfloat16)
                curr_pixel_values2 = load_image2(
                    file_name, target_aspect_ratio=target_aspect_ratio,
                    min_num=self.min_num2, max_num=self.max_num2)
                curr_pixel_values2 = curr_pixel_values2.cuda().to(torch.bfloat16)
                curr_pixel_values = torch.cat(
                    (curr_pixel_values[:-1], curr_pixel_values2[:-1], curr_pixel_values[-1:]), 0)
                num_patches_list.append(curr_pixel_values.size(0))
                pixel_values_list.append(curr_pixel_values)
            pixel_values = torch.cat(pixel_values_list, dim=0)
        elif image_num == 1:
            image_path = [x['value'] for x in message if x['type'] == 'image'][0]
            pixel_values, target_aspect_ratio = load_image(
                image_path, min_num=self.min_num, max_num=self.max_num)
            pixel_values = pixel_values.cuda().to(torch.bfloat16)
            pixel_values2 = load_image2(
                image_path, target_aspect_ratio=target_aspect_ratio,
                min_num=self.min_num2, max_num=self.max_num2)
            pixel_values2 = pixel_values2.cuda().to(torch.bfloat16)
            pixel_values = torch.cat(
                (pixel_values[:-1], pixel_values2[:-1], pixel_values[-1:]), 0)
            num_patches_list = [pixel_values.size(0)]
        else:
            pixel_values = None
            num_patches_list = []

        with torch.no_grad():
            response = self.model.chat(
                self.tokenizer,
                pixel_values=pixel_values,
                target_aspect_ratio=(1, 1),
                num_patches_list=num_patches_list,
                question=prompt,
                generation_config=self.kwargs,
                verbose=False)

        return response

    def generate_inner(self, message, dataset=None):
        self.set_max_num(dataset)
        return self.generate_v2(message, dataset)

    def build_history(self, message):
        # Global Variables
        image_path = []
        image_cnt = 0

        def concat_tilist(tilist):
            nonlocal image_cnt  # Declare image_cnt as nonlocal to modify it
            prompt = ''
            for item in tilist:
                # Substitute the pattern in the text
                if item['type'] == 'text':
                    prompt += re.sub(self.pattern, self.replacement, item['value'])
                elif item['type'] == 'image':
                    image_cnt += 1
                    prompt += '<image>\n'
                    image_path.append(item['value'])
            return prompt

        # Only previous messages
        assert len(message) % 2 == 0
        history = []
        for i in range(len(message) // 2):
            m1, m2 = message[2 * i], message[2 * i + 1]
            assert m1['role'] == 'user' and m2['role'] == 'assistant'
            history.append((concat_tilist(m1['content']), concat_tilist(m2['content'])))

        return history, image_path, image_cnt

    def chat_inner_v2(self, message, dataset=None):
        image_cnt = 0
        if len(message) > 1:
            history, image_path, image_cnt = self.build_history(message[:-1])
        else:
            history, image_path, image_cnt = None, [], 1
        current_msg = message[-1]
        question = ''

        # If message is just text in the conversation
        if len(current_msg['content']) == 1 and current_msg['content'][0]['type'] == 'text':
            question = current_msg['content'][0]['value']
            question = re.sub(self.pattern, self.replacement, question)  # Fix pattern as per InternVL
        else:
            for msg in current_msg['content']:
                if msg['type'] == 'text':
                    question += re.sub(self.pattern, self.replacement, msg['value'])
                elif msg['type'] == 'image':
                    image_cnt += 1
                    question += '<image>\n'
                    image_path.append(msg['value'])

        if image_cnt > 1:
            num_patches_list = []
            pixel_values_list = []
            for image_idx, file_name in enumerate(image_path):
                curr_pixel_values, target_aspect_ratio = load_image(
                    file_name, min_num=self.min_num, max_num=self.max_num)
                curr_pixel_values = curr_pixel_values.cuda().to(torch.bfloat16)
                curr_pixel_values2 = load_image2(
                    file_name, target_aspect_ratio=target_aspect_ratio,
                    min_num=self.min_num2, max_num=self.max_num2)
                curr_pixel_values2 = curr_pixel_values2.cuda().to(torch.bfloat16)
                curr_pixel_values = torch.cat(
                    (curr_pixel_values[:-1], curr_pixel_values2[:-1], curr_pixel_values[-1:]), 0)
                num_patches_list.append(curr_pixel_values.size(0))
                pixel_values_list.append(curr_pixel_values)
            pixel_values = torch.cat(pixel_values_list, dim=0)
        elif image_cnt == 1:
            pixel_values, target_aspect_ratio = load_image(
                image_path, min_num=self.min_num, max_num=self.max_num)
            pixel_values = pixel_values.cuda().to(torch.bfloat16)
            pixel_values2 = load_image2(
                image_path, target_aspect_ratio=target_aspect_ratio,
                min_num=self.min_num2, max_num=self.max_num2)
            pixel_values2 = pixel_values2.cuda().to(torch.bfloat16)
            pixel_values = torch.cat(
                (pixel_values[:-1], pixel_values2[:-1], pixel_values[-1:]), 0)
            num_patches_list = [pixel_values.size(0)]
        else:
            pixel_values = None
            num_patches_list = []

        response, history = self.model.chat(
            self.tokenizer,
            pixel_values=pixel_values,
            target_aspect_ratio=target_aspect_ratio,
            num_patches_list=num_patches_list,
            question=question,
            generation_config=self.kwargs,
            history=history,
            return_history=True)

        response = re.sub(self.reverse_pattern, self.reverse_replacement, response)

        return response

    def chat_inner(self, message, dataset=None):
        self.set_max_num(dataset)
        kwargs_default = dict(do_sample=False, max_new_tokens=512, top_p=None, num_beams=1)
        self.kwargs = kwargs_default
        return self.chat_inner_v2(message, dataset)
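The two preprocessing passes above are what give MiniMonkey its multi-scale input: `load_image` tiles at the larger patch budget, `load_image2` re-tiles at a coarser budget constrained by the first pass's aspect ratio, and the two stacks are merged with a single thumbnail kept at the end. A hedged usage sketch mirroring `generate_v2` (the image path and patch budgets are placeholders):

import torch

# Hypothetical two-pass tiling, as performed per image in generate_v2; 'page.png' is a placeholder.
pixel_values, target_aspect_ratio = load_image('page.png', min_num=4, max_num=12)
pixel_values2 = load_image2('page.png', target_aspect_ratio=target_aspect_ratio, min_num=3, max_num=7)
# Keep only one thumbnail (the last tensor of the first pass), exactly as generate_v2 does.
merged = torch.cat((pixel_values[:-1], pixel_values2[:-1], pixel_values[-1:]), 0)
print(merged.shape)  # (num_patches, 3, 448, 448)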
VLMEvalKit/vlmeval/vlm/misc/blip2_instruct_vicuna13b.yaml
0 → 100644
View file @
bc5ebf0f
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

model:
  arch: instruct_vicuna13b
  load_finetuned: False
  load_pretrained: True

  pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna13b_trimmed.pth"
  finetuned: ""

  # vit encoder
  image_size: 224
  drop_path_rate: 0
  use_grad_checkpoint: False
  vit_precision: "fp16"
  freeze_vit: True

  # Q-Former
  num_query_token: 32

  # path to Vicuna checkpoint
  llm_model: "Please set the path to your vicuna-13b-v1.1"

  # generation configs
  prompt: ""

preprocess:
  vis_processor:
    train:
      name: "blip2_image_train"
      image_size: 224
    eval:
      name: "blip_image_eval"
      image_size: 224
  text_processor:
    train:
      name: "blip_caption"
    eval:
      name: "blip_caption"
VLMEvalKit/vlmeval/vlm/misc/blip2_instruct_vicuna7b.yaml
0 → 100644
View file @
bc5ebf0f
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

model:
  arch: instruct_vicuna7b
  load_finetuned: False
  load_pretrained: True

  pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna7b_trimmed.pth"
  finetuned: ""

  # vit encoder
  image_size: 224
  drop_path_rate: 0
  use_grad_checkpoint: False
  vit_precision: "fp16"
  freeze_vit: True

  # Q-Former
  num_query_token: 32

  # path to Vicuna checkpoint
  llm_model: "Please set the path to your vicuna-7b-v1.1"

  # generation configs
  prompt: ""

preprocess:
  vis_processor:
    train:
      name: "blip2_image_train"
      image_size: 224
    eval:
      name: "blip_image_eval"
      image_size: 224
  text_processor:
    train:
      name: "blip_caption"
    eval:
      name: "blip_caption"
VLMEvalKit/vlmeval/vlm/misc/minigpt4_13b_eval.yaml
0 → 100644
View file @
bc5ebf0f
model:
  arch: minigpt4
  model_type: pretrain_vicuna_7b
  max_txt_len: 160
  end_sym: "###"
  low_resource: True
  prompt_template: '###Human: {} ###Assistant: '
  ckpt: "please set this value to the path of pretrained checkpoint"

  # vit encoder
  image_size: 224
  drop_path_rate: 0
  use_grad_checkpoint: False
  vit_precision: "fp16"
  freeze_vit: True
  freeze_qformer: True

  # Q-Former
  num_query_token: 32

  # generation configs
  prompt: ""

  llama_model: "please set this value to the path of vicuna-13b-v0"

datasets:
  cc_sbu_align:
    vis_processor:
      train:
        name: "blip2_image_eval"
        image_size: 224
    text_processor:
      train:
        name: "blip_caption"

run:
  task: image_text_pretrain
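These YAML files are consumed by the `MiniGPT4` wrapper above through OmegaConf. A small sketch of how such a config resolves (the path below is a placeholder; the wrapper itself builds it with osp.join(this_dir, 'misc', cfg)):

from omegaconf import OmegaConf

cfg = OmegaConf.load('minigpt4_13b_eval.yaml')               # placeholder path
print(cfg.model.arch)                                        # 'minigpt4'
print(cfg.datasets.cc_sbu_align.vis_processor.train.name)    # 'blip2_image_eval'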