Project: ModelZoo / Ovis_pytorch
Commit 81028572, authored Sep 28, 2024 by luopl
Commit message: init
Pipeline #1722: canceled with stages
Changes: 222 · Pipelines: 1

Showing 20 changed files with 2559 additions and 0 deletions (+2559, −0)
Changed files shown on this page:

    VLMEvalKit/vlmeval/vlm/phi3_vision.py  (+160, −0)
    VLMEvalKit/vlmeval/vlm/pixtral.py  (+67, −0)
    VLMEvalKit/vlmeval/vlm/qh_360vl.py  (+61, −0)
    VLMEvalKit/vlmeval/vlm/qwen2_vl/__init__.py  (+2, −0)
    VLMEvalKit/vlmeval/vlm/qwen2_vl/model.py  (+142, −0)
    VLMEvalKit/vlmeval/vlm/qwen2_vl/prompt.py  (+155, −0)
    VLMEvalKit/vlmeval/vlm/qwen_vl.py  (+126, −0)
    VLMEvalKit/vlmeval/vlm/rbdash.py  (+274, −0)
    VLMEvalKit/vlmeval/vlm/slime.py  (+78, −0)
    VLMEvalKit/vlmeval/vlm/transcore_m.py  (+162, −0)
    VLMEvalKit/vlmeval/vlm/video_llm/__init__.py  (+8, −0)
    VLMEvalKit/vlmeval/vlm/video_llm/chat_uni_vi.py  (+190, −0)
    VLMEvalKit/vlmeval/vlm/video_llm/configs/llama_vid/processor/clip-patch14-224/config.json  (+171, −0)
    VLMEvalKit/vlmeval/vlm/video_llm/configs/llama_vid/processor/clip-patch14-224/preprocessor_config.json  (+19, −0)
    VLMEvalKit/vlmeval/vlm/video_llm/configs/videochat2_hd.json  (+56, −0)
    VLMEvalKit/vlmeval/vlm/video_llm/llama_vid.py  (+123, −0)
    VLMEvalKit/vlmeval/vlm/video_llm/pllava.py  (+100, −0)
    VLMEvalKit/vlmeval/vlm/video_llm/video_chatgpt.py  (+67, −0)
    VLMEvalKit/vlmeval/vlm/video_llm/video_llava.py  (+160, −0)
    VLMEvalKit/vlmeval/vlm/video_llm/videochat2.py  (+438, −0)
VLMEvalKit/vlmeval/vlm/phi3_vision.py (new file, mode 100644)

from PIL import Image
import torch

from .base import BaseModel
from ..smp import *


class Phi3Vision(BaseModel):

    INSTALL_REQ = False
    INTERLEAVE = False

    def __init__(self, model_path='microsoft/Phi-3-vision-128k-instruct', **kwargs):
        try:
            from transformers import AutoProcessor, AutoModelForCausalLM
        except:
            warnings.warn('Please install the latest version transformers.')
            sys.exit(-1)
        model = AutoModelForCausalLM.from_pretrained(
            model_path, device_map='cuda', trust_remote_code=True, torch_dtype='auto').eval()
        processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
        self.model = model
        self.processor = processor
        self.kwargs = kwargs

    def generate_inner(self, message, dataset=None):
        prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
        image = Image.open(image_path).convert('RGB')
        messages = [
            {'role': 'user', 'content': f'<|image_1|>\n{prompt}'}
        ]
        prompt = self.processor.tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True)
        inputs = self.processor(prompt, [image], return_tensors='pt').to('cuda')

        generation_args = {
            'max_new_tokens': 500,
            'temperature': 0.0,
            'do_sample': False,
        }
        generation_args.update(self.kwargs)

        generate_ids = self.model.generate(
            **inputs,
            eos_token_id=self.processor.tokenizer.eos_token_id,
            **generation_args
        )
        generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
        response = self.processor.batch_decode(
            generate_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False
        )[0]
        return response

    def chat_inner(self, message, dataset=None):
        messages = []
        image_cnt = 1
        image_list = []
        for msg in message:
            content = ''
            # If message is just text in the conversation
            if len(msg['content']) == 1 and msg['content'][0]['type'] == 'text':
                msg_new = {'role': msg['role'], 'content': msg['content'][0]['value']}
                messages.append(msg_new)
                continue
            # If both image & text is present
            for x in msg['content']:
                if x['type'] == 'text':
                    content += x['value']
                elif x['type'] == 'image':
                    image = Image.open(x['value']).convert('RGB')
                    content += f'<|image_{image_cnt}|>\n'
                    image_list.append(image)
                    image_cnt += 1
            msg_new = {'role': msg['role'], 'content': content}
            messages.append(msg_new)

        prompt = self.processor.tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True)
        inputs = self.processor(prompt, image_list, return_tensors='pt').to('cuda')

        generation_args = {
            'max_new_tokens': 500,
            'temperature': 0.0,
            'do_sample': False,
        }
        generation_args.update(self.kwargs)

        generate_ids = self.model.generate(
            **inputs,
            eos_token_id=self.processor.tokenizer.eos_token_id,
            **generation_args
        )
        generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
        response = self.processor.batch_decode(
            generate_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False
        )[0]
        return response


class Phi3_5Vision(BaseModel):

    INSTALL_REQ = False
    INTERLEAVE = False

    def __init__(self, model_path='microsoft/Phi-3.5-vision-instruct', **kwargs):
        try:
            from transformers import AutoProcessor, AutoModelForCausalLM
        except:
            warnings.warn('Please install the latest version transformers.')
            sys.exit(-1)
        model = AutoModelForCausalLM.from_pretrained(
            model_path, device_map='cuda', trust_remote_code=True,
            torch_dtype='auto', _attn_implementation='flash_attention_2').eval()
        # for best performance, use num_crops=4 for multi-frame, num_crops=16 for single-frame.
        processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True, num_crops=4)
        self.model = model
        self.processor = processor
        self.kwargs = kwargs

    def generate_inner(self, message, dataset=None):
        prompt = '\n'.join([msg['value'] for msg in message if msg['type'] == 'text'])
        images = [Image.open(msg['value']).convert('RGB') for msg in message if msg['type'] == 'image']
        num_images = len(images)
        placeholder = ''
        for i in range(1, num_images + 1):
            placeholder += f'<|image_{i}|>\n'
        messages = [
            {'role': 'user', 'content': placeholder + prompt}
        ]
        prompt = self.processor.tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True)
        inputs = self.processor(prompt, images, return_tensors='pt').to('cuda')

        generation_args = {
            'max_new_tokens': 1000,
            'temperature': 0.0,
            'do_sample': False,
        }
        generation_args.update(self.kwargs)

        generate_ids = self.model.generate(
            **inputs,
            eos_token_id=self.processor.tokenizer.eos_token_id,
            **generation_args
        )
        # remove input tokens
        generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
        response = self.processor.batch_decode(
            generate_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False
        )[0]
        return response

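The snippet below is not part of the commit; it is a minimal, self-contained sketch of the message format these wrappers consume and of how Phi3_5Vision.generate_inner assembles the multi-image placeholder string. The file path used here is hypothetical.

# Illustration only (not in the commit): a VLMEvalKit-style message is a list
# of {'type': 'text' | 'image', 'value': ...} items; the path below is hypothetical.
message = [
    {'type': 'image', 'value': '/tmp/cat.jpg'},
    {'type': 'text', 'value': 'What animal is shown?'},
]

# Mirrors the placeholder construction in Phi3_5Vision.generate_inner above:
# one '<|image_i|>' tag per image, prepended to the joined text prompt.
prompt = '\n'.join(m['value'] for m in message if m['type'] == 'text')
num_images = sum(m['type'] == 'image' for m in message)
placeholder = ''.join(f'<|image_{i}|>\n' for i in range(1, num_images + 1))
print(placeholder + prompt)  # prints "<|image_1|>" followed by the question
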
VLMEvalKit/vlmeval/vlm/pixtral.py (new file, mode 100644)

import torch
from PIL import Image
from .base import BaseModel
from ..smp import *
import warnings
from huggingface_hub import snapshot_download


class Pixtral(BaseModel):

    INSTALL_REQ = False
    INTERLEAVE = True

    def __init__(self, model_path='mistralai/Pixtral-12B-2409', **kwargs):
        self.model_path = model_path
        try:
            from mistral_inference.transformer import Transformer
            from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
        except ImportError as err:
            warnings.warn('Please install `mistral-inference` and `mistral_common`')
            raise err

        if get_cache_path(model_path) is None:
            snapshot_download(repo_id=model_path)
        cache_path = get_cache_path(self.model_path)

        self.tokenizer = MistralTokenizer.from_file(f'{cache_path}/tekken.json')
        model = Transformer.from_folder(cache_path, device='cpu')
        model.cuda()
        self.model = model
        self.max_tokens = 512

    def generate_inner(self, message, dataset=None):
        try:
            from mistral_inference.generate import generate
            from mistral_common.protocol.instruct.messages import UserMessage, TextChunk, ImageURLChunk
            from mistral_common.protocol.instruct.request import ChatCompletionRequest
        except ImportError as err:
            warnings.warn('Please install `mistral-inference` and `mistral_common`')
            raise err

        msg_new = []
        for msg in message:
            tp, val = msg['type'], msg['value']
            if tp == 'text':
                msg_new.append(TextChunk(text=val))
            elif tp == 'image':
                b64 = encode_image_file_to_base64(val)
                image_url = f'data:image/jpeg;base64,{b64}'
                msg_new.append(ImageURLChunk(image_url=image_url))

        completion_request = ChatCompletionRequest(messages=[UserMessage(content=msg_new)])
        encoded = self.tokenizer.encode_chat_completion(completion_request)
        images = encoded.images
        tokens = encoded.tokens

        out_tokens, _ = generate(
            [tokens],
            self.model,
            images=[images],
            max_tokens=self.max_tokens,
            temperature=0,
            eos_id=self.tokenizer.instruct_tokenizer.tokenizer.eos_id
        )
        result = self.tokenizer.decode(out_tokens[0])
        return result

VLMEvalKit/vlmeval/vlm/qh_360vl.py (new file, mode 100644)

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import warnings
import os.path as osp
from PIL import Image
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE


class QH_360VL(BaseModel):

    INSTALL_REQ = False
    INTERLEAVE = False

    def __init__(self, model_path='qihoo360/360VL-70B', **kwargs):
        assert model_path is not None
        self.model_path = model_path
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        self.model = AutoModelForCausalLM.from_pretrained(model_path,
                                                          torch_dtype=torch.float16,
                                                          low_cpu_mem_usage=True,
                                                          device_map='auto',
                                                          trust_remote_code=True).eval()
        vision_tower = self.model.get_vision_tower()
        vision_tower.load_model()
        vision_tower.to(device='cuda', dtype=torch.float16)
        self.image_processor = vision_tower.image_processor
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.kwargs = kwargs
        warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
        torch.cuda.empty_cache()

    def generate(self, message, dataset=None):
        prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
        print(prompt)
        image = Image.open(image_path).convert('RGB')
        terminators = [
            self.tokenizer.convert_tokens_to_ids('<|eot_id|>',)
        ]
        inputs = self.model.build_conversation_input_ids(self.tokenizer,
                                                         query=prompt,
                                                         image=image,
                                                         image_processor=self.image_processor)
        input_ids = inputs['input_ids'].to(device='cuda', non_blocking=True)
        images = inputs['image'].to(dtype=torch.float16, device='cuda', non_blocking=True)

        output_ids = self.model.generate(
            input_ids=input_ids,
            images=images,
            do_sample=False,
            num_beams=1,
            max_new_tokens=512,
            eos_token_id=terminators,
            use_cache=True)

        input_token_len = input_ids.shape[1]
        outputs = self.tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
        response = outputs.strip()
        return response

VLMEvalKit/vlmeval/vlm/qwen2_vl/__init__.py (new file, mode 100644)

from .model import Qwen2VLChat
from .prompt import Qwen2VLPromptMixin

VLMEvalKit/vlmeval/vlm/qwen2_vl/model.py (new file, mode 100644)

from __future__ import annotations

import os
import warnings

import torch

from ..base import BaseModel
from .prompt import Qwen2VLPromptMixin


def ensure_image_url(image: str) -> str:
    prefixes = ['http://', 'https://', 'file://', 'data:image;']
    if any(image.startswith(prefix) for prefix in prefixes):
        return image
    if os.path.exists(image):
        return 'file://' + image
    raise ValueError(f'Invalid image: {image}')


def ensure_video_url(video: str) -> str:
    prefixes = ['http://', 'https://', 'file://', 'data:video;']
    if any(video.startswith(prefix) for prefix in prefixes):
        return video
    if os.path.exists(video):
        return 'file://' + video
    raise ValueError(f'Invalid video: {video}')


class Qwen2VLChat(Qwen2VLPromptMixin, BaseModel):
    INSTALL_REQ = False
    INTERLEAVE = True
    VIDEO_LLM = True

    def __init__(
        self,
        model_path: str,
        min_pixels: int | None = None,
        max_pixels: int | None = None,
        max_new_tokens=2048,
        top_p=0.001,
        top_k=1,
        temperature=0.01,
        repetition_penalty=1.0,
        use_custom_prompt: bool = True,
        system_prompt: str | None = None,
        verbose: bool = True,
    ):
        super().__init__(use_custom_prompt=use_custom_prompt)
        self.min_pixels = min_pixels
        self.max_pixels = max_pixels
        self.generate_kwargs = dict(
            max_new_tokens=max_new_tokens,
            top_p=top_p,
            top_k=top_k,
            temperature=temperature,
            repetition_penalty=repetition_penalty,
        )
        self.system_prompt = system_prompt
        self.verbose = verbose
        self.fps = 2.0
        from transformers import Qwen2VLForConditionalGeneration, Qwen2VLProcessor

        assert model_path is not None
        self.model_path = model_path
        self.processor = Qwen2VLProcessor.from_pretrained(model_path)
        if '72b' not in self.model_path.lower():
            self.model = Qwen2VLForConditionalGeneration.from_pretrained(
                model_path, torch_dtype='auto', device_map='cpu', attn_implementation='flash_attention_2'
            )
            self.model.cuda().eval()
        else:
            self.model = Qwen2VLForConditionalGeneration.from_pretrained(
                model_path, torch_dtype='auto', device_map='auto', attn_implementation='flash_attention_2'
            )
            self.model.cuda().eval()
        torch.cuda.empty_cache()

    def _prepare_content(self, inputs: list[dict[str, str]], dataset: str | None = None) -> list[dict[str, str]]:
        """
        inputs list[dict[str, str]], each dict has keys: ['type', 'value']
        """
        content = []
        for s in inputs:
            if s['type'] == 'image':
                item = {'type': 'image', 'image': ensure_image_url(s['value'])}
                if dataset == 'OCRBench':
                    item['min_pixels'] = 10 * 10 * 28 * 28
                    warnings.warn(f"OCRBench dataset uses custom min_pixels={item['min_pixels']}")
                    if self.max_pixels is not None:
                        item['max_pixels'] = self.max_pixels
                else:
                    if self.min_pixels is not None:
                        item['min_pixels'] = self.min_pixels
                    if self.max_pixels is not None:
                        item['max_pixels'] = self.max_pixels
            elif s['type'] == 'video':
                item = {'type': 'video', 'video': ensure_video_url(s['value'])}
                if self.fps is not None:
                    item['fps'] = self.fps
            elif s['type'] == 'text':
                item = {'type': 'text', 'text': s['value']}
            else:
                raise ValueError(f"Invalid message type: {s['type']}, {s}")
            content.append(item)
        return content

    def generate_inner(self, message, dataset=None):
        try:
            from qwen_vl_utils import process_vision_info
        except ImportError:
            warnings.warn("qwen_vl_utils not found, please install it via 'pip install qwen-vl-utils'")
            raise

        messages = []
        if self.system_prompt is not None:
            messages.append({'role': 'system', 'content': self.system_prompt})
        messages.append({'role': 'user', 'content': self._prepare_content(message, dataset=dataset)})
        if self.verbose:
            print(f'\033[31m{messages}\033[0m')

        text = self.processor.apply_chat_template([messages], tokenize=False, add_generation_prompt=True)
        images, videos = process_vision_info([messages])
        inputs = self.processor(text=text, images=images, videos=videos, padding=True, return_tensors='pt')
        inputs = inputs.to('cuda')

        generated_ids = self.model.generate(
            **inputs,
            **self.generate_kwargs,
        )
        generated_ids = [
            output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, generated_ids)
        ]
        out = self.processor.tokenizer.batch_decode(
            generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )
        response = out[0]
        if self.verbose:
            print(f'\033[32m{response}\033[0m')
        return response

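As a quick check of the URL normalisation used above, the following standalone sketch (not part of the commit) copies ensure_image_url verbatim so it can run without the package; the example URL and temporary file are hypothetical.

# Standalone copy of ensure_image_url from model.py above, for illustration only.
import os
import tempfile

def ensure_image_url(image: str) -> str:
    prefixes = ['http://', 'https://', 'file://', 'data:image;']
    if any(image.startswith(prefix) for prefix in prefixes):
        return image
    if os.path.exists(image):
        return 'file://' + image
    raise ValueError(f'Invalid image: {image}')

print(ensure_image_url('https://example.com/dog.png'))   # recognised prefix: returned unchanged
with tempfile.NamedTemporaryFile(suffix='.jpg') as f:
    print(ensure_image_url(f.name))                       # existing local file: gains a file:// prefix
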
VLMEvalKit/vlmeval/vlm/qwen2_vl/prompt.py (new file, mode 100644)

from __future__ import annotations


class Qwen2VLPromptMixin:
    """
    Mixin class for Qwen2VLChat to build custom prompt for different datasets.

    Requires the following methods to be implemented in the subclass:
        - dump_image(line, dataset: str) -> str | list[str]

    Implements the following methods:
        - use_custom_prompt(dataset: str) -> bool
        - build_prompt(line, dataset: str) -> list[dict[str, str]]
    """

    def __init__(self, *args, use_custom_prompt: bool = True, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self._use_custom_prompt = use_custom_prompt

    def set_dump_image(self, dump_image_func):
        self.dump_image_func = dump_image_func

    def dump_image(self, line, dataset):
        return self.dump_image_func(line)

    def use_custom_prompt(self, dataset: str) -> bool:
        from vlmeval.dataset import DATASET_TYPE
        dataset_type = DATASET_TYPE(dataset, default=None)

        if not self._use_custom_prompt:
            return False
        if dataset in {'MMMU_DEV_VAL', 'MMMU_TEST'}:
            return True
        if dataset_type == 'MCQ':
            return True
        if dataset_type == 'Y/N' and dataset in {'HallusionBench', 'POPE'}:  # MME has its own prompt
            return True
        if dataset_type == 'VQA' and dataset not in {'MMVet'}:  # MMVet VQA has its own prompt
            return True
        return False

    def build_prompt(self, line, dataset: str) -> list[dict[str, str]]:
        from vlmeval.dataset import DATASET_TYPE

        if dataset in {'MMMU_DEV_VAL', 'MMMU_TEST'}:
            return self._build_mmmu_prompt(line, dataset)
        dataset_type = DATASET_TYPE(dataset, default=None)
        if dataset_type == 'MCQ':
            return self._build_mcq_prompt(line, dataset)
        if dataset_type == 'Y/N':
            return self._build_yorn_prompt(line, dataset)
        if dataset_type == 'VQA':
            return self._build_vqa_prompt(line, dataset)
        raise ValueError(f'Unsupported dataset: {dataset}')

    def _build_mmmu_prompt(self, line, dataset: str) -> list[dict[str, str]]:
        """Change the prompt for the MMMU dataset: keep all images at the beginning."""
        import string
        import pandas as pd

        tgt_path = self.dump_image(line, dataset)
        question = line['question']
        options = {cand: line[cand] for cand in string.ascii_uppercase if cand in line and not pd.isna(line[cand])}
        options_prompt = 'Options:\n'
        for key, item in options.items():
            options_prompt += f'{key}. {item}\n'
        hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
        prompt = ''
        if hint is not None:
            prompt += f'Hint: {hint}\n'
        prompt += f'Question: {question}\n'
        if len(options):
            prompt += options_prompt
            prompt += 'Please select the correct answer from the options above.\n'
        prompt = prompt.rstrip()
        msgs = []
        if isinstance(tgt_path, list):
            msgs.extend([dict(type='image', value=p) for p in tgt_path])
        else:
            msgs = [dict(type='image', value=tgt_path)]
        msgs.append(dict(type='text', value=prompt))
        return msgs

    def _build_mcq_prompt(self, line, dataset: str) -> list[dict[str, str]]:
        """Change the prompt for MCQ datasets: use the Chinese prompt if the question contains Chinese characters."""
        MCQ_CN_PROMPT = '请直接回答选项字母。'
        MCQ_EN_PROMPT = 'Please select the correct answer from the options above.'
        import string
        import pandas as pd

        def cn_string(s):
            import re
            if re.search('[\u4e00-\u9fff]', s):
                return True
            return False

        tgt_path = self.dump_image(line, dataset)
        question = line['question']
        options = {cand: line[cand] for cand in string.ascii_uppercase if cand in line and not pd.isna(line[cand])}
        options_prompt = 'Options:\n'
        for key, item in options.items():
            options_prompt += f'{key}. {item}\n'
        hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
        prompt = ''
        if hint is not None:
            prompt += f'Hint: {hint}\n'
        prompt += f'Question: {question}\n'
        if len(options):
            prompt += options_prompt
            prompt += MCQ_CN_PROMPT if cn_string(prompt) else MCQ_EN_PROMPT
        prompt = prompt.rstrip()
        msgs = []
        if isinstance(tgt_path, list):
            msgs.extend([dict(type='image', value=p) for p in tgt_path])
        else:
            msgs = [dict(type='image', value=tgt_path)]
        msgs.append(dict(type='text', value=prompt))
        return msgs

    def _build_yorn_prompt(self, line, dataset: str) -> list[dict[str, str]]:
        """Change the prompt for Y/N datasets."""
        YORN_PROMPT = ' Please answer yes or no.'
        tgt_path = self.dump_image(line, dataset)
        question = line['question']
        msgs = []
        if isinstance(tgt_path, list):
            msgs.extend([dict(type='image', value=p) for p in tgt_path])
        else:
            msgs = [dict(type='image', value=tgt_path)]
        msgs.append(dict(type='text', value=question))
        assert msgs[-1]['type'] == 'text'
        msgs[-1]['value'] += YORN_PROMPT
        return msgs

    def _build_vqa_prompt(self, line, dataset: str) -> list[dict[str, str]]:
        """Change the prompt for VQA datasets."""
        VQA_PROMPT = '\nPlease try to answer the question with short words or phrases if possible.'
        tgt_path = self.dump_image(line, dataset)
        question = line['question']
        msgs = []
        if isinstance(tgt_path, list):
            msgs.extend([dict(type='image', value=p) for p in tgt_path])
        else:
            msgs = [dict(type='image', value=tgt_path)]
        msgs.append(dict(type='text', value=question))
        assert msgs[-1]['type'] == 'text'
        msgs[-1]['value'] += VQA_PROMPT
        return msgs

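The sketch below is illustration only, not part of the commit; it restates the cn_string check and the two suffix constants from _build_mcq_prompt so the language switch can be run standalone.

# Illustration only: the Chinese instruction is appended when the assembled
# prompt contains CJK characters, otherwise the English one (as in
# _build_mcq_prompt above).
import re

MCQ_CN_PROMPT = '请直接回答选项字母。'
MCQ_EN_PROMPT = 'Please select the correct answer from the options above.'

def cn_string(s):
    return re.search('[\u4e00-\u9fff]', s) is not None

for question in ['下列哪个选项是正确的？', 'Which option is correct?']:
    suffix = MCQ_CN_PROMPT if cn_string(question) else MCQ_EN_PROMPT
    print(question, '->', suffix)
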
VLMEvalKit/vlmeval/vlm/qwen_vl.py (new file, mode 100644)

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import warnings
import copy as cp
from .base import BaseModel
from ..smp import isimg, listinstr
from ..dataset import DATASET_TYPE


class QwenVL(BaseModel):

    INSTALL_REQ = False
    INTERLEAVE = True

    def __init__(self, model_path='Qwen/Qwen-VL', **kwargs):
        assert model_path is not None
        self.model_path = model_path
        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        tokenizer.padding_side = 'left'
        tokenizer.pad_token_id = tokenizer.eod_id
        self.tokenizer = tokenizer
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path, device_map='cuda', trust_remote_code=True).eval()
        default_kwargs = dict(
            do_sample=False,
            num_beams=1,
            max_new_tokens=512,
            min_new_tokens=1,
            num_return_sequences=1,
            use_cache=True,
            output_hidden_states=True,
            pad_token_id=tokenizer.eod_id,
            eos_token_id=tokenizer.eod_id)
        default_kwargs.update(kwargs)
        self.kwargs = default_kwargs
        warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
        torch.cuda.empty_cache()

    def adjust_kwargs(self, dataset):
        kwargs = cp.deepcopy(self.kwargs)
        if DATASET_TYPE(dataset) in ['MCQ', 'Y/N']:
            kwargs['max_new_tokens'] = 32
        elif DATASET_TYPE(dataset) == 'Caption' and 'COCO' in dataset:
            kwargs['max_new_tokens'] = 32
        elif DATASET_TYPE(dataset) == 'VQA':
            if listinstr(['OCRVQA', 'ChartQA', 'DocVQA'], dataset):
                kwargs['max_new_tokens'] = 100
            elif listinstr(['TextVQA'], dataset):
                kwargs['max_new_tokens'] = 10
        return kwargs

    def generate_inner(self, message, dataset=None):
        if dataset is not None:
            kwargs = self.adjust_kwargs(dataset)
        else:
            kwargs = self.kwargs
        prompt = ''
        for s in message:
            if s['type'] == 'image':
                prompt += f'<img>{s["value"]}</img>'
            elif s['type'] == 'text':
                prompt += s['value']
        if dataset is not None and DATASET_TYPE(dataset) == 'VQA':
            prompt += ' Answer:'
        encoded = self.tokenizer([prompt], return_tensors='pt', padding='longest')
        input_ids = encoded.input_ids.to('cuda')
        attention_mask = encoded.attention_mask.to('cuda')

        pred = self.model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            **kwargs)
        answer = self.tokenizer.decode(pred[0][input_ids.size(1):].cpu(), skip_special_tokens=True).strip()
        return answer


class QwenVLChat(BaseModel):

    INSTALL_REQ = False
    INTERLEAVE = True

    def __init__(self, model_path='Qwen/Qwen-VL-Chat', **kwargs):
        assert model_path is not None
        self.model_path = model_path
        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
        self.model = AutoModelForCausalLM.from_pretrained(model_path, device_map='cuda', trust_remote_code=True).eval()
        torch.cuda.empty_cache()
        self.kwargs = kwargs
        warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')

    def build_history(self, message):

        def concat_tilist(tilist):
            image_cnt = 1
            prompt = ''
            for item in tilist:
                if item['type'] == 'text':
                    prompt += item['value']
                elif item['type'] == 'image':
                    prompt += f"Picture {image_cnt}: <img>{item['value']}</img>\n"
                    image_cnt += 1
            return prompt

        assert len(message) % 2 == 0
        hist = []
        for i in range(len(message) // 2):
            m1, m2 = message[2 * i], message[2 * i + 1]
            assert m1['role'] == 'user' and m2['role'] == 'assistant'
            hist.append((concat_tilist(m1['content']), concat_tilist(m2['content'])))
        return hist

    def generate_inner(self, message, dataset=None):
        vl_list = [{'image': s['value']} if s['type'] == 'image' else {'text': s['value']} for s in message]
        query = self.tokenizer.from_list_format(vl_list)
        response, _ = self.model.chat(self.tokenizer, query=query, history=None, **self.kwargs)
        return response

    def chat_inner(self, message, dataset=None):
        assert len(message) % 2 == 1 and message[-1]['role'] == 'user'
        history = self.build_history(message[:-1])
        vl_list = [
            {'image': s['value']} if s['type'] == 'image' else {'text': s['value']}
            for s in message[-1]['content']
        ]
        query = self.tokenizer.from_list_format(vl_list)
        response, _ = self.model.chat(self.tokenizer, query=query, history=history, **self.kwargs)
        return response

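The following standalone sketch (not part of the commit) mirrors concat_tilist from QwenVLChat.build_history to show how interleaved text/image content is flattened into Qwen-VL's <img>...</img> prompt format; the file names are hypothetical.

# Illustration only: flattening mirrors concat_tilist in QwenVLChat.build_history above.
content = [
    {'type': 'text', 'value': 'Compare these two pictures. '},
    {'type': 'image', 'value': 'a.jpg'},  # hypothetical file names
    {'type': 'image', 'value': 'b.jpg'},
]
prompt, image_cnt = '', 1
for item in content:
    if item['type'] == 'text':
        prompt += item['value']
    elif item['type'] == 'image':
        prompt += f"Picture {image_cnt}: <img>{item['value']}</img>\n"
        image_cnt += 1
print(prompt)
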
VLMEvalKit/vlmeval/vlm/rbdash.py (new file, mode 100644)

import sys
import torch
import os.path as osp
import os
import warnings
from .base import BaseModel
from ..dataset import DATASET_TYPE
from ..smp import *
from PIL import Image

'''
Please follow the instructions to download ckpt.
https://github.com/RBDash-Team/RBDash?tab=readme-ov-file#pretrained-weights
'''


class RBDash(BaseModel):
    INSTALL_REQ = True
    INTERLEAVE = False

    def __init__(self, model_path, root=None, conv_mode='qwen', **kwargs):
        from huggingface_hub import snapshot_download
        if root is None:
            warnings.warn('Please set `root` to RBDash code directory, \
which is cloned from here: "https://github.com/RBDash-Team/RBDash?tab=readme-ov-file" ')
            sys.exit(-1)
        warnings.warn('Please follow the instructions of RBDash to put the ckpt file in the right place, \
which can be found at https://github.com/RBDash-Team/RBDash?tab=readme-ov-file#structure')
        assert model_path == 'RBDash-Team/RBDash-v1.2-72b', 'We only support RBDash-v1.2-72b for now'
        sys.path.append(root)
        try:
            from rbdash.model.builder import load_pretrained_model
            from rbdash.mm_utils import get_model_name_from_path
        except:
            raise ImportError(
                'Please first install RBdash and set the root path to use RBdash, '
                'which is cloned from here: "https://github.com/RBDash-Team/RBDash?tab=readme-ov-file" '
            )
        VLMEvalKit_path = os.getcwd()
        os.chdir(root)
        warnings.warn('Please set `root` to RBdash code directory, \
which is cloned from here: "https://github.com/RBDash-Team/RBDash?tab=readme-ov-file" ')
        try:
            model_name = get_model_name_from_path(model_path)
        except:
            raise ImportError(
                'Please follow the instructions of RBdash to put the ckpt file in the right place, '
                'which can be found at https://github.com/RBDash-Team/RBDash?tab=readme-ov-file#structure'
            )
        download_model_path = snapshot_download(model_path)
        internvit_local_dir = './model_zoo/OpenGVLab/InternViT-6B-448px-V1-5'
        os.makedirs(internvit_local_dir, exist_ok=True)
        snapshot_download('OpenGVLab/InternViT-6B-448px-V1-5', local_dir=internvit_local_dir)
        convnext_local_dir = './model_zoo/OpenAI/openclip-convnext-large-d-320-laion2B-s29B-b131K-ft-soup'
        os.makedirs(convnext_local_dir, exist_ok=True)
        snapshot_download('laion/CLIP-convnext_large_d_320.laion2B-s29B-b131K-ft-soup', local_dir=convnext_local_dir)
        preprocessor_url = 'https://huggingface.co/openai/clip-vit-large-patch14-336/blob/main/preprocessor_config.json'
        download_file_path = osp.join(convnext_local_dir, 'preprocessor_config.json')
        if not osp.exists(download_file_path):
            print(f'download preprocessor to {download_file_path}')
            download_file(preprocessor_url, download_file_path)
        tokenizer, model, image_processor, image_processor_aux, context_len = load_pretrained_model(
            download_model_path, None, model_name, device_map='auto'
        )
        os.chdir(VLMEvalKit_path)
        self.model = model
        self.tokenizer = tokenizer
        self.image_processor = image_processor
        self.image_processor_aux = image_processor_aux
        self.conv_mode = conv_mode

        if tokenizer.unk_token is None:
            tokenizer.unk_token = '<|endoftext|>'
        tokenizer.pad_token = tokenizer.unk_token

        kwargs_default = dict(temperature=float(0.2), num_beams=1, top_p=None, max_new_tokens=128, use_cache=True)
        kwargs_default.update(kwargs)
        self.kwargs = kwargs_default

    def generate_inner(self, message, dataset=None):
        try:
            from rbdash.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, \
                DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
            from rbdash.conversation import conv_templates
            from rbdash.mm_utils import tokenizer_image_token, process_images
        except:
            raise ImportError(
                'Please first install RBdash and set the root path to use RBdash, '
                'which is cloned from here: "https://github.com/RBDash-Team/RBDash?tab=readme-ov-file" '
            )
        prompt, image = self.message_to_promptimg(message, dataset=dataset)
        image = Image.open(image).convert('RGB')
        if self.model.config.mm_use_im_start_end:
            prompt = (DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + prompt)
        else:
            prompt = DEFAULT_IMAGE_TOKEN + '\n' + prompt
        conv = conv_templates[self.conv_mode].copy()
        conv.append_message(conv.roles[0], prompt)
        conv.append_message(conv.roles[1], None)
        prompt = conv.get_prompt()
        input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt')
        input_ids = input_ids.unsqueeze(0).cuda()
        if hasattr(self.model.config, 'image_size_aux'):
            if not hasattr(self.image_processor, 'image_size_raw'):
                self.image_processor.image_size_raw = self.image_processor.crop_size.copy()
            self.image_processor.crop_size['height'] = self.model.config.image_size_aux
            self.image_processor.crop_size['width'] = self.model.config.image_size_aux
            self.image_processor.size['shortest_edge'] = self.model.config.image_size_aux
            self.image_processor_aux.crop_size['height'] = self.model.config.image_size_aux
            self.image_processor_aux.crop_size['width'] = self.model.config.image_size_aux
            self.image_processor_aux.size['shortest_edge'] = self.model.config.image_size_aux
        image_tensor = process_images([image], self.image_processor, self.model.config)[0]
        image_grid = getattr(self.model.config, 'image_grid', 1)
        if hasattr(self.model.config, 'image_size_aux'):
            raw_shape = [
                self.image_processor.image_size_raw['height'] * image_grid,
                self.image_processor.image_size_raw['width'] * image_grid
            ]
            if self.image_processor is not self.image_processor_aux:
                image_tensor_aux = process_images([image], self.image_processor_aux, self.model.config)[0]
            else:
                image_tensor_aux = image_tensor
            image_tensor = torch.nn.functional.interpolate(
                image_tensor[None],
                size=raw_shape,
                mode='bilinear',
                align_corners=False
            )[0]
        else:
            image_tensor_aux = []
        if image_grid >= 2:
            raw_image = image_tensor.reshape(
                3, image_grid, self.image_processor.image_size_raw['height'],
                image_grid, self.image_processor.image_size_raw['width']
            )
            raw_image = raw_image.permute(1, 3, 0, 2, 4)
            raw_image = raw_image.reshape(
                -1, 3, self.image_processor.image_size_raw['height'],
                self.image_processor.image_size_raw['width']
            )
            if getattr(self.model.config, 'image_global', False):
                global_image = image_tensor
                if len(global_image.shape) == 3:
                    global_image = global_image[None]
                global_image = torch.nn.functional.interpolate(
                    global_image,
                    size=[
                        self.image_processor.image_size_raw['height'],
                        self.image_processor.image_size_raw['width']
                    ],
                    mode='bilinear',
                    align_corners=False
                )
                raw_image = torch.cat([raw_image, global_image], dim=0)
            image_tensor = raw_image.contiguous()
        images = image_tensor[None].to(dtype=self.model.dtype, device='cuda', non_blocking=True)
        if len(image_tensor_aux) > 0:
            images_aux = image_tensor_aux[None].to(dtype=self.model.dtype, device='cuda', non_blocking=True)
        else:
            images_aux = None
        with torch.inference_mode():
            output_ids = self.model.generate(
                input_ids,
                max_new_tokens=512,
                images=images,
                images_aux=images_aux,
                do_sample=True if self.kwargs['temperature'] > 0 else False,
                temperature=self.kwargs['temperature'],
                top_p=self.kwargs['top_p'],
                num_beams=self.kwargs['num_beams']
            )
        outputs = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
        return outputs

    def use_custom_prompt(self, dataset):
        assert dataset is not None
        if listinstr(['MMDU', 'MME-RealWorld', 'MME-RealWorld-CN'], dataset):
            # For Multi-Turn we don't have custom prompt
            return False
        if 'mme' in dataset.lower():
            return True
        elif 'hallusionbench' in dataset.lower():
            return True
        elif 'mmmu' in dataset.lower():
            return True
        elif 'mmbench' in dataset.lower():
            return True
        return False

    def build_mme(self, line):
        question = line['question']
        prompt = question + 'Answer the question using a single word or phrase.'
        return prompt

    def build_hallusionbench(self, line):
        question = line['question']
        prompt = question + '\nAnswer the question using a single word or phrase.'
        return prompt

    def build_mmbench(self, line):
        question = line['question']
        options = {
            cand: line[cand]
            for cand in string.ascii_uppercase
            if cand in line and not pd.isna(line[cand])
        }
        options_prompt = ''
        for key, item in options.items():
            options_prompt += f'{key}. {item}\n'
        prompt = f'{question}\n'
        if len(options):
            prompt += options_prompt
            prompt += "Answer with the option's letter from the given choices directly."
        else:
            prompt += 'Answer the question using a single word or phrase.'
        return prompt

    def build_mmmu(self, line):
        question = line['question']
        options = {
            cand: line[cand]
            for cand in string.ascii_uppercase
            if cand in line and not pd.isna(line[cand])
        }
        options_prompt = ''
        for key, item in options.items():
            options_prompt += f'({key}) {item}\n'
        hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
        prompt = ''
        if hint is not None:
            prompt += f'Hint: {hint}\n'
        prompt += f'{question}\n'
        if len(options):
            prompt += options_prompt
            prompt += "\nAnswer with the option's letter from the given choices directly."
        else:
            prompt += 'Answer the question using a single word or phrase.'
        return prompt

    def build_prompt(self, line, dataset=None):
        assert dataset is None or isinstance(dataset, str)
        assert self.use_custom_prompt(dataset)
        tgt_path = self.dump_image(line, dataset)
        if 'mme' in dataset.lower():
            prompt = self.build_mme(line)
        elif 'hallusionbench' in dataset.lower():
            prompt = self.build_hallusionbench(line)
        elif 'mmmu' in dataset.lower():
            prompt = self.build_mmmu(line)
        elif 'mmbench' in dataset.lower():
            prompt = self.build_mmbench(line)
        ret = [dict(type='text', value=prompt)]
        ret.extend([dict(type='image', value=s) for s in tgt_path])
        return ret

VLMEvalKit/vlmeval/vlm/slime.py (new file, mode 100644)

import torch
from PIL import Image
from abc import abstractproperty
import sys
import os.path as osp
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE
import copy


class SliME(BaseModel):
    INSTALL_REQ = True
    INTERLEAVE = True

    DEFAULT_IMAGE_TOKEN = '<image>'
    IMAGE_TOKEN_INDEX = -200

    def __init__(self, model_path='yifanzhang114/SliME-Llama3-8B', **kwargs):
        assert model_path is not None
        try:
            from llava.model.builder import load_pretrained_model
            from llava.conversation import conv_templates
            from llava.mm_utils import get_model_name_from_path, tokenizer_image_token
        except:
            warnings.warn('Please install requirements on https://github.com/yfzhang114/SliME before using SliME')
        model_name = get_model_name_from_path(model_path)
        tokenizer, model, image_processor, _ = load_pretrained_model(model_path, None, model_name, device_map=None)
        model.cuda().eval()
        model.tie_weights()

        if 'llama3' in model_path.lower():
            conv_mode = 'llama3'
        elif 'vicuna' in model_path.lower():
            conv_mode = 'v1'
        self.conv_template = conv_mode
        self.conv_templates = conv_templates
        self.tokenizer = tokenizer
        self.model = model
        self.image_processor = image_processor
        self.tokenizer_image_token = tokenizer_image_token

    def generate_inner(self, message, dataset=None):
        content, images = '', []
        for msg in message:
            if msg['type'] == 'text':
                content += msg['value']
            else:
                images.append(Image.open(msg['value']).convert('RGB'))
                content += (self.DEFAULT_IMAGE_TOKEN + '\n')

        preprocess = self.image_processor.preprocess
        image_tokenizer = self.tokenizer_image_token
        image_tensor = [
            preprocess(f, return_tensors='pt')['pixel_values'][0].half().cuda() for f in images
        ]
        image_tensor = torch.stack(image_tensor)

        conv = copy.deepcopy(self.conv_templates[self.conv_template])
        conv.messages = list(conv.messages)
        conv.append_message(conv.roles[0], content)
        conv.append_message(conv.roles[1], None)
        prompt_question = conv.get_prompt()

        input_ids = image_tokenizer(prompt_question, self.tokenizer, self.IMAGE_TOKEN_INDEX, return_tensors='pt')
        input_ids = input_ids.unsqueeze(0).cuda()

        cont = self.model.generate(
            input_ids,
            images=image_tensor,
            do_sample=False,
            temperature=0,
            max_new_tokens=512,
        )
        text_outputs = self.tokenizer.batch_decode(cont, skip_special_tokens=True)[0]
        return text_outputs

VLMEvalKit/vlmeval/vlm/transcore_m.py (new file, mode 100644)

import sys
import torch
from abc import abstractproperty
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE
from transformers import AutoTokenizer, BitsAndBytesConfig


class TransCoreM(BaseModel):

    INSTALL_REQ = True
    INTERLEAVE = False

    def load_pretrained_model(self, model_path, load_8bit=False, load_4bit=False, revision='main'):
        from transcorem.model import TransCoreMQWenForCausalLM
        from transcorem.constants import DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
        import transcorem.config_param as config_param

        kwargs = {'revision': revision}
        if load_8bit:
            kwargs['load_in_8bit'] = True
        elif load_4bit:
            kwargs['load_in_4bit'] = True
            kwargs['quantization_config'] = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.float16,
                bnb_4bit_use_double_quant=True,
                bnb_4bit_quant_type='nf4'
            )
        else:
            kwargs['torch_dtype'] = torch.float16

        config_param.model_path = model_path
        tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, revision=revision, trust_remote_code=True)
        model = TransCoreMQWenForCausalLM.from_pretrained(
            model_path, low_cpu_mem_usage=True, trust_remote_code=True, **kwargs)

        image_processor = None
        mm_use_im_start_end = getattr(model.config, 'mm_use_im_start_end', False)
        mm_use_im_patch_token = getattr(model.config, 'mm_use_im_patch_token', True)
        if mm_use_im_patch_token:
            tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
        if mm_use_im_start_end:
            tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
        model.resize_token_embeddings(len(tokenizer))

        vision_tower = model.get_vision_tower()
        if not vision_tower.is_loaded:
            vision_tower.load_model()
        vision_tower.to(device='cpu', dtype=torch.float16)
        image_processor = vision_tower.image_processor

        if hasattr(model.config, 'max_sequence_length'):
            context_len = model.config.max_sequence_length
        else:
            context_len = 2048
        return tokenizer, model, image_processor, context_len

    def __init__(self, root=None, revision='main', **kwargs):
        self.root = root
        self.revision = revision
        sys.path.append(root)

        model_path = 'PCIResearch/TransCore-M'
        assert osp.exists(model_path) or splitlen(model_path) == 2
        self.tokenizer, self.model, self.image_processor, self.context_len = self.load_pretrained_model(
            model_path=model_path, revision=revision)
        self.model = self.model.cuda()
        print('==============conv_mode: transcorem_v1')
        self.conv_mode = 'transcorem_v1'

        kwargs_default = dict(do_sample=False, temperature=0.0, max_new_tokens=512, top_p=None, num_beams=1)
        kwargs_default.update(kwargs)
        self.kwargs = kwargs_default
        warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')

    def use_custom_prompt(self, dataset):
        assert dataset is not None
        if DATASET_TYPE(dataset) == 'MCQ':
            return True
        return False

    def build_prompt(self, line, dataset=None):
        assert dataset is None or isinstance(dataset, str)
        assert self.use_custom_prompt(dataset)
        tgt_path = self.dump_image(line, dataset)

        question = line['question']
        hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
        if hint is not None:
            question = hint + '\n' + question

        options = {
            cand: line[cand]
            for cand in string.ascii_uppercase
            if cand in line and not pd.isna(line[cand])
        }
        for key, item in options.items():
            question += f'\n{key}. {item}'
        prompt = question

        if len(options):
            prompt += (
                '\n请直接回答选项字母。' if cn_string(prompt) else
                "\nAnswer with the option's letter from the given choices directly."
            )
        else:
            prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.'
        message = [dict(type='text', value=prompt)]
        message.extend([dict(type='image', value=f) for f in tgt_path])
        return message

    def generate_inner(self, message, dataset=None):
        from transcorem.mm_utils import highres_process_images, tokenizer_image_token, KeywordsStoppingCriteria
        from transcorem.constants import (
            IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN)
        from transcorem.conversation import conv_templates, SeparatorStyle

        prompt, image_path = self.message_to_promptimg(message, dataset=dataset)

        image = Image.open(image_path).convert('RGB')
        args = abstractproperty()
        args.image_aspect_ratio = 'pad'
        image_patches = highres_process_images(image, self.image_processor, args, base_reso=336)
        image_patches = [patch.unsqueeze(0).to('cuda', dtype=torch.float16) for patch in image_patches]
        if self.model.config.mm_use_im_start_end:
            inp = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + prompt
        else:
            inp = DEFAULT_IMAGE_TOKEN + '\n' + prompt

        conv = conv_templates[self.conv_mode].copy()
        conv.append_message(conv.roles[0], inp)
        conv.append_message(conv.roles[1], None)
        prompt_conv = conv.get_prompt()

        input_ids = tokenizer_image_token(
            prompt_conv, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
        stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
        keywords = [stop_str]
        stopping_criteria = KeywordsStoppingCriteria(keywords, self.tokenizer, input_ids)
        with torch.inference_mode():
            output_ids = self.model.generate(
                input_ids,
                images=image_patches,
                use_cache=True,
                stopping_criteria=[stopping_criteria],
                **self.kwargs)

        input_token_len = input_ids.shape[1]
        n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
        if n_diff_input_output > 0:
            print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
        outputs = self.tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
        outputs = outputs.strip()
        if outputs.endswith(stop_str):
            outputs = outputs[:-len(stop_str)]
        outputs = outputs.strip()
        return outputs

VLMEvalKit/vlmeval/vlm/video_llm/__init__.py (new file, mode 100644)

from .video_llava import VideoLLaVA, VideoLLaVA_HF
from .videochat2 import VideoChat2_HD
from .chat_uni_vi import Chatunivi
from .video_chatgpt import VideoChatGPT
from .llama_vid import LLaMAVID
from .pllava import PLLaVA

__all__ = ['VideoLLaVA', 'VideoLLaVA_HF', 'Chatunivi', 'VideoChatGPT', 'LLaMAVID', 'VideoChat2_HD', 'PLLaVA']

VLMEvalKit/vlmeval/vlm/video_llm/chat_uni_vi.py (new file, mode 100644)

import torch
import warnings
import copy as cp
import numpy as np
import sys
import os
from ..base import BaseModel
from ...smp import isimg, listinstr
from ...dataset import DATASET_TYPE
from decord import VideoReader, cpu
from PIL import Image


def _get_rawvideo_dec(
    video_path,
    image_processor,
    max_frames=64,
    image_resolution=224,
    video_framerate=1,
    s=None,
    e=None,
):
    # speed up video decode via decord.
    video_mask = np.zeros(max_frames, dtype=np.int64)
    max_video_length = 0

    # T x 3 x H x W
    video = np.zeros((max_frames, 3, image_resolution, image_resolution), dtype=np.float64)

    if s is None:
        start_time, end_time = None, None
    else:
        start_time = int(s)
        end_time = int(e)
        start_time = start_time if start_time >= 0.0 else 0.0
        end_time = end_time if end_time >= 0.0 else 0.0
        if start_time > end_time:
            start_time, end_time = end_time, start_time
        elif start_time == end_time:
            end_time = start_time + 1

    if os.path.exists(video_path):
        vreader = VideoReader(video_path, ctx=cpu(0))
    else:
        print(video_path)
        raise FileNotFoundError

    fps = vreader.get_avg_fps()
    f_start = 0 if start_time is None else int(start_time * fps)
    f_end = int(min(1000000000 if end_time is None else end_time * fps, len(vreader) - 1))
    num_frames = f_end - f_start + 1
    if num_frames > 0:
        # T x 3 x H x W
        sample_fps = int(video_framerate)
        t_stride = int(round(float(fps) / sample_fps))

        all_pos = list(range(f_start, f_end + 1, t_stride))
        if len(all_pos) > max_frames:
            sample_pos = [all_pos[_] for _ in np.linspace(0, len(all_pos) - 1, num=max_frames, dtype=int)]
        else:
            sample_pos = all_pos

        patch_images = [Image.fromarray(f) for f in vreader.get_batch(sample_pos).asnumpy()]

        patch_images = torch.stack(
            [image_processor.preprocess(img, return_tensors='pt')['pixel_values'][0] for img in patch_images]
        )
        slice_len = patch_images.shape[0]

        max_video_length = max_video_length if max_video_length > slice_len else slice_len
        if slice_len < 1:
            pass
        else:
            video[:slice_len, ...] = patch_images

        return patch_images, slice_len
    else:
        print('video path: {} error.'.format(video_path))

    video_mask[:max_video_length] = [1] * max_video_length

    return torch.from_numpy(video), video_mask


class Chatunivi(BaseModel):

    INSTALL_REQ = True
    INTERLEAVE = False
    VIDEO_LLM = True

    def __init__(self, model_path='Chat-UniVi/Chat-UniVi', **kwargs):
        assert model_path is not None
        try:
            from ChatUniVi.model.builder import load_pretrained_model
        except:
            warnings.warn('Please install Chat-UniVi from https://github.com/PKU-YuanGroup/Chat-UniVi.git.')
            sys.exit(-1)

        model_name = 'ChatUniVi'
        tokenizer, model, processor, context_len = load_pretrained_model(model_path, None, model_name)
        self.tokenizer = tokenizer
        self.model = model
        vision_tower = model.get_vision_tower()
        if not vision_tower.is_loaded:
            vision_tower.load_model()
        image_processor = vision_tower.image_processor
        self.processor = image_processor
        self.context_len = context_len
        self.kwargs = kwargs
        self.nframe = 64
        self.resolution = 224
        if 'v1.5' in model_path:
            self.resolution = 336

    def get_model_output(self, model, video_processor, tokenizer, video, qs):
        from ChatUniVi.conversation import conv_templates, SeparatorStyle
        from ChatUniVi.constants import (
            DEFAULT_IMAGE_PATCH_TOKEN,
            DEFAULT_IMAGE_TOKEN,
            IMAGE_TOKEN_INDEX,
            DEFAULT_IM_START_TOKEN,
            DEFAULT_IM_END_TOKEN,
            MAX_IMAGE_LENGTH,
        )
        from ChatUniVi.mm_utils import (
            tokenizer_image_token,
            KeywordsStoppingCriteria,
        )

        mm_use_im_start_end = getattr(model.config, 'mm_use_im_start_end', False)
        mm_use_im_patch_token = getattr(model.config, 'mm_use_im_patch_token', True)
        if mm_use_im_patch_token:
            tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
        if mm_use_im_start_end:
            tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
        model.resize_token_embeddings(len(tokenizer))

        if model.config.config['use_cluster']:
            for n, m in model.named_modules():
                m = m.to(dtype=torch.bfloat16)

        video_frames, slice_len = _get_rawvideo_dec(
            video, video_processor, max_frames=MAX_IMAGE_LENGTH, image_resolution=self.resolution)

        if model.config.mm_use_im_start_end:
            qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN * slice_len + DEFAULT_IM_END_TOKEN + '\n' + qs
        else:
            qs = DEFAULT_IMAGE_TOKEN * slice_len + '\n' + qs

        conv = conv_templates['v1'].copy()
        conv.append_message(conv.roles[0], qs)
        conv.append_message(conv.roles[1], None)
        prompt = conv.get_prompt()

        input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()

        stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
        keywords = [stop_str]
        stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)

        with torch.inference_mode():
            output_ids = model.generate(
                input_ids,
                images=video_frames.half().cuda(),
                do_sample=True,
                temperature=0.2,
                top_p=None,
                num_beams=1,
                output_scores=True,
                return_dict_in_generate=True,
                max_new_tokens=1024,
                use_cache=True,
                stopping_criteria=[stopping_criteria])
        output_ids = output_ids.sequences
        input_token_len = input_ids.shape[1]
        n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
        if n_diff_input_output > 0:
            print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
        outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
        outputs = outputs.strip()
        if outputs.endswith(stop_str):
            outputs = outputs[:-len(stop_str)]
        outputs = outputs.strip()
        return outputs

    def generate_inner(self, message, dataset=None):
        question, video = self.message_to_promptvideo(message)
        response = self.get_model_output(self.model, self.processor, self.tokenizer, video, question)
        return response

VLMEvalKit/vlmeval/vlm/video_llm/configs/llama_vid/processor/clip-patch14-224/config.json
0 → 100644
View file @
81028572
{
"_name_or_path"
:
"clip-vit-large-patch14/"
,
"architectures"
:
[
"CLIPModel"
],
"initializer_factor"
:
1.0
,
"logit_scale_init_value"
:
2.6592
,
"model_type"
:
"clip"
,
"projection_dim"
:
768
,
"text_config"
:
{
"_name_or_path"
:
""
,
"add_cross_attention"
:
false
,
"architectures"
:
null
,
"attention_dropout"
:
0.0
,
"bad_words_ids"
:
null
,
    "bos_token_id": 0,
    "chunk_size_feed_forward": 0,
    "cross_attention_hidden_size": null,
    "decoder_start_token_id": null,
    "diversity_penalty": 0.0,
    "do_sample": false,
    "dropout": 0.0,
    "early_stopping": false,
    "encoder_no_repeat_ngram_size": 0,
    "eos_token_id": 2,
    "finetuning_task": null,
    "forced_bos_token_id": null,
    "forced_eos_token_id": null,
    "hidden_act": "quick_gelu",
    "hidden_size": 768,
    "id2label": {
      "0": "LABEL_0",
      "1": "LABEL_1"
    },
    "initializer_factor": 1.0,
    "initializer_range": 0.02,
    "intermediate_size": 3072,
    "is_decoder": false,
    "is_encoder_decoder": false,
    "label2id": {
      "LABEL_0": 0,
      "LABEL_1": 1
    },
    "layer_norm_eps": 1e-05,
    "length_penalty": 1.0,
    "max_length": 20,
    "max_position_embeddings": 77,
    "min_length": 0,
    "model_type": "clip_text_model",
    "no_repeat_ngram_size": 0,
    "num_attention_heads": 12,
    "num_beam_groups": 1,
    "num_beams": 1,
    "num_hidden_layers": 12,
    "num_return_sequences": 1,
    "output_attentions": false,
    "output_hidden_states": false,
    "output_scores": false,
    "pad_token_id": 1,
    "prefix": null,
    "problem_type": null,
    "projection_dim": 768,
    "pruned_heads": {},
    "remove_invalid_values": false,
    "repetition_penalty": 1.0,
    "return_dict": true,
    "return_dict_in_generate": false,
    "sep_token_id": null,
    "task_specific_params": null,
    "temperature": 1.0,
    "tie_encoder_decoder": false,
    "tie_word_embeddings": true,
    "tokenizer_class": null,
    "top_k": 50,
    "top_p": 1.0,
    "torch_dtype": null,
    "torchscript": false,
    "transformers_version": "4.16.0.dev0",
    "use_bfloat16": false,
    "vocab_size": 49408
  },
  "text_config_dict": {
    "hidden_size": 768,
    "intermediate_size": 3072,
    "num_attention_heads": 12,
    "num_hidden_layers": 12,
    "projection_dim": 768
  },
  "torch_dtype": "float32",
  "transformers_version": null,
  "vision_config": {
    "_name_or_path": "",
    "add_cross_attention": false,
    "architectures": null,
    "attention_dropout": 0.0,
    "bad_words_ids": null,
    "bos_token_id": null,
    "chunk_size_feed_forward": 0,
    "cross_attention_hidden_size": null,
    "decoder_start_token_id": null,
    "diversity_penalty": 0.0,
    "do_sample": false,
    "dropout": 0.0,
    "early_stopping": false,
    "encoder_no_repeat_ngram_size": 0,
    "eos_token_id": null,
    "finetuning_task": null,
    "forced_bos_token_id": null,
    "forced_eos_token_id": null,
    "hidden_act": "quick_gelu",
    "hidden_size": 1024,
    "id2label": {
      "0": "LABEL_0",
      "1": "LABEL_1"
    },
    "image_size": 224,
    "initializer_factor": 1.0,
    "initializer_range": 0.02,
    "intermediate_size": 4096,
    "is_decoder": false,
    "is_encoder_decoder": false,
    "label2id": {
      "LABEL_0": 0,
      "LABEL_1": 1
    },
    "layer_norm_eps": 1e-05,
    "length_penalty": 1.0,
    "max_length": 20,
    "min_length": 0,
    "model_type": "clip_vision_model",
    "no_repeat_ngram_size": 0,
    "num_attention_heads": 16,
    "num_beam_groups": 1,
    "num_beams": 1,
    "num_hidden_layers": 24,
    "num_return_sequences": 1,
    "output_attentions": false,
    "output_hidden_states": false,
    "output_scores": false,
    "pad_token_id": null,
    "patch_size": 14,
    "prefix": null,
    "problem_type": null,
    "projection_dim": 768,
    "pruned_heads": {},
    "remove_invalid_values": false,
    "repetition_penalty": 1.0,
    "return_dict": true,
    "return_dict_in_generate": false,
    "sep_token_id": null,
    "task_specific_params": null,
    "temperature": 1.0,
    "tie_encoder_decoder": false,
    "tie_word_embeddings": true,
    "tokenizer_class": null,
    "top_k": 50,
    "top_p": 1.0,
    "torch_dtype": null,
    "torchscript": false,
    "transformers_version": "4.16.0.dev0",
    "use_bfloat16": false
  },
  "vision_config_dict": {
    "hidden_size": 1024,
    "intermediate_size": 4096,
    "num_attention_heads": 16,
    "num_hidden_layers": 24,
    "patch_size": 14,
    "projection_dim": 768
  }
}
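For reference, the vision tower described by this config matches a CLIP ViT-L/14 at 224 px input. A minimal sketch (assuming transformers is installed and this directory is available locally; not taken from the committed files) of inspecting the parsed config:

from transformers import CLIPConfig

# hypothetical local path: the processor directory shipped in this commit
cfg = CLIPConfig.from_pretrained('./vlmeval/vlm/video_llm/configs/llama_vid/processor/clip-patch14-224')
print(cfg.vision_config.hidden_size, cfg.vision_config.patch_size, cfg.vision_config.image_size)  # 1024 14 224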
VLMEvalKit/vlmeval/vlm/video_llm/configs/llama_vid/processor/clip-patch14-224/preprocessor_config.json
0 → 100644
View file @
81028572
{
  "crop_size": 224,
  "do_center_crop": true,
  "do_normalize": true,
  "do_resize": true,
  "feature_extractor_type": "CLIPFeatureExtractor",
  "image_mean": [
    0.48145466,
    0.4578275,
    0.40821073
  ],
  "image_std": [
    0.26862954,
    0.26130258,
    0.27577711
  ],
  "resample": 3,
  "size": 224
}
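These are the standard CLIP normalization statistics. A short sketch (assuming transformers and Pillow are installed; the blank image is only there to show the output shape) of what this preprocessor produces:

from PIL import Image
from transformers import CLIPImageProcessor

processor = CLIPImageProcessor.from_pretrained('./vlmeval/vlm/video_llm/configs/llama_vid/processor/clip-patch14-224')
pixel_values = processor(images=Image.new('RGB', (640, 360)), return_tensors='pt')['pixel_values']
print(pixel_values.shape)  # torch.Size([1, 3, 224, 224])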
VLMEvalKit/vlmeval/vlm/video_llm/configs/videochat2_hd.json
0 → 100644
View file @
81028572
{
  "model": {
    "model_cls": "VideoChat2_it_hd_mistral",
    "vit_blip_model_path": "OpenGVLab/videochat2",
    "mistral_model_path": "mistralai/Mistral-7B-Instruct-v0.2",
    "videochat2_model_path": "OpenGVLab/VideoChat2_stage2_Mistral_7B",
    "freeze_vit": false,
    "freeze_qformer": false,
    "max_txt_len": 512,
    "low_resource": false,
    "vision_encoder": {
      "name": "vit_l14",
      "img_size": 224,
      "patch_size": 16,
      "d_model": 1024,
      "encoder_embed_dim": 1024,
      "encoder_depth": 24,
      "encoder_num_heads": 16,
      "drop_path_rate": 0.0,
      "num_frames": 8,
      "tubelet_size": 1,
      "use_checkpoint": true,
      "checkpoint_num": 18,
      "pretrained": "",
      "return_index": -2,
      "vit_add_ln": true,
      "ckpt_num_frame": 4
    },
    "num_query_token": 32,
    "qformer_hidden_dropout_prob": 0.1,
    "qformer_attention_probs_dropout_prob": 0.1,
    "qformer_drop_path_rate": 0.2,
    "extra_num_query_token": 64,
    "qformer_text_input": true,
    "system": "",
    "start_token": "<Video>",
    "end_token": "</Video>",
    "add_second_msg": true,
    "img_start_token": "<Image>",
    "img_end_token": "</Image>",
    "random_shuffle": true,
    "return_question_instruction": false,
    "use_flash_attention": true,
    "use_lora": false,
    "lora_r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.1,
    "dynamic_config": {
      "local_size": 224,
      "hd_num": 6,
      "padding": false,
      "add_global": true
    }
  },
  "device": "cuda"
}
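The VideoChat2_HD wrapper added later in this commit loads this file through the Ask-Anything helper Config.from_file and then overwrites the three *_model_path entries with locally downloaded checkpoints. A hedged sketch of reading the same file directly (it assumes the Ask-Anything video_chat2 folder is on sys.path, as the wrapper itself arranges, and that Config exposes attribute access as the wrapper's own usage suggests):

import sys
sys.path.append('./Ask-Anything/video_chat2')  # hypothetical clone location, mirroring the wrapper's default `root`
from utils.config import Config

cfg = Config.from_file('./vlmeval/vlm/video_llm/configs/videochat2_hd.json')
print(cfg.model.model_cls)              # 'VideoChat2_it_hd_mistral'
print(cfg.model.videochat2_model_path)  # 'OpenGVLab/VideoChat2_stage2_Mistral_7B'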
VLMEvalKit/vlmeval/vlm/video_llm/llama_vid.py
0 → 100644
View file @
81028572
import torch
import warnings
import copy as cp
import numpy as np
import sys
import os
from ..base import BaseModel
from ...smp import isimg, listinstr, load, dump, download_file
from ...dataset import DATASET_TYPE
from decord import VideoReader, cpu
from huggingface_hub import snapshot_download


def load_video(video_path):
    vr = VideoReader(video_path, ctx=cpu(0))
    total_frame_num = len(vr)
    fps = round(vr.get_avg_fps())
    frame_idx = [i for i in range(0, total_frame_num, fps)]
    spare_frames = vr.get_batch(frame_idx).asnumpy()
    return spare_frames


def change_file(file_path, mm_vision_tower):
    org_data = load(file_path)
    org_data['image_processor'] = './vlmeval/vlm/video_llm/configs/llama_vid/processor/clip-patch14-224'
    org_data['mm_vision_tower'] = mm_vision_tower
    dump(org_data, file_path)


class LLaMAVID(BaseModel):

    INSTALL_REQ = True
    INTERLEAVE = False
    VIDEO_LLM = True

    def __init__(self, model_path='YanweiLi/llama-vid-7b-full-224-video-fps-1', **kwargs):
        assert model_path is not None
        try:
            from llamavid.model.builder import load_pretrained_model
            from llava.mm_utils import get_model_name_from_path
        except:
            warnings.warn('Please install LLaMA-VID from https://github.com/dvlab-research/LLaMA-VID.')
            sys.exit(-1)

        model_base = None
        model_name = get_model_name_from_path(model_path)
        eva_vit_g_url = 'https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/eva_vit_g.pth'
        true_model_path = snapshot_download(model_path)
        eva_vit_path = os.path.join(true_model_path, 'eva_vit_g.pth')
        if not os.path.exists(eva_vit_path):
            download_file(eva_vit_g_url, eva_vit_path)
        config_path = os.path.join(true_model_path, 'config.json')
        change_file(config_path, eva_vit_path)

        tokenizer, model, image_processor, context_len = load_pretrained_model(
            true_model_path, model_base, model_name, None, device_map='cpu', device='cpu')
        model.cuda()
        self.tokenizer = tokenizer
        self.model = model
        self.processor = image_processor
        self.context_len = context_len
        self.kwargs = kwargs
        self.nframe = 8

    def get_model_output(self, model, video_processor, tokenizer, video, qs):
        from llamavid.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
        from llamavid.constants import DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
        from llamavid.conversation import conv_templates, SeparatorStyle
        from llava.mm_utils import tokenizer_image_token, KeywordsStoppingCriteria

        original_qs = cp.deepcopy(qs)
        if model.config.mm_use_im_start_end:
            qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
        else:
            qs = DEFAULT_IMAGE_TOKEN + '\n' + qs

        conv_mode = 'vicuna_v1'
        conv = conv_templates[conv_mode].copy()
        conv.append_message(conv.roles[0], qs)
        conv.append_message(conv.roles[1], None)
        prompt = conv.get_prompt()

        # Check if the video exists
        if os.path.exists(video):
            video = load_video(video)
            video = video_processor.preprocess(video, return_tensors='pt')['pixel_values'].half().cuda()
            video = [video]

        input_ids = tokenizer_image_token(
            prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
        stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
        keywords = [stop_str]
        stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)

        cur_prompt = original_qs
        with torch.inference_mode():
            model.update_prompt([[cur_prompt]])
            output_ids = model.generate(
                input_ids,
                images=video,
                do_sample=True,
                temperature=0.2,
                max_new_tokens=1024,
                use_cache=True,
                stopping_criteria=[stopping_criteria],
            )

        input_token_len = input_ids.shape[1]
        n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
        if n_diff_input_output > 0:
            print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
        outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
        outputs = outputs.strip()
        if outputs.endswith(stop_str):
            outputs = outputs[:-len(stop_str)]
        outputs = outputs.strip()
        return outputs

    def generate_inner(self, message, dataset=None):
        question, video = self.message_to_promptvideo(message)
        response = self.get_model_output(self.model, self.processor, self.tokenizer, video, question)
        return response
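A hedged usage sketch for the wrapper above (it assumes VLMEvalKit and the LLaMA-VID repo are installed, a CUDA GPU is available, and that messages follow the 'type'/'value' dict format the other video wrappers in this commit index into; the video path is hypothetical):

from vlmeval.vlm.video_llm.llama_vid import LLaMAVID

model = LLaMAVID()  # downloads the HF snapshot plus eva_vit_g.pth and patches its config.json
message = [
    {'type': 'video', 'value': './demo/sample_video.mp4'},   # hypothetical local file
    {'type': 'text', 'value': 'Describe what happens in this video.'},
]
print(model.generate_inner(message))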
VLMEvalKit/vlmeval/vlm/video_llm/pllava.py
0 → 100644
View file @
81028572
import torch
import warnings
import copy as cp
import numpy as np
import sys
from PIL import Image
import torchvision
from ..base import BaseModel
from ...smp import isimg, listinstr, get_rank_and_world_size
from ...dataset import DATASET_TYPE
from huggingface_hub import snapshot_download


class PLLaVA(BaseModel):

    INSTALL_REQ = True
    INTERLEAVE = False
    VIDEO_LLM = True

    def __init__(self, model_path='ermu2001/pllava-13b', dir_root=None, **kwargs):
        sys.path.append(dir_root)
        try:
            from tasks.eval.model_utils import load_pllava
        except:
            warnings.warn('Please first install requirements and set the root path to use PLLaVA. '
                          'Follow the instructions at https://github.com/magic-research/PLLaVA.')
            sys.exit(-1)

        rank, world_size = get_rank_and_world_size()
        self.nframe = 16
        self.use_lora = True
        self.lora_alpha = 4
        self.pooling_shape = (16, 12, 12)
        self.RESOLUTION = 672
        self.model_path = model_path
        # Note: larger variants (30B+) can exhaust GPU/host memory and may even bring nodes down.
        weight_dir = snapshot_download(model_path)
        self.model, self.processor = load_pllava(
            model_path, num_frames=self.nframe, use_lora=self.use_lora,
            weight_dir=weight_dir, lora_alpha=self.lora_alpha, pooling_shape=self.pooling_shape)
        # position embedding
        self.model = self.model.to(torch.device(rank))
        self.model = self.model.eval()

    def load_video(self, video_path, num_segments=8, resolution=336):
        from decord import VideoReader, cpu
        transforms = torchvision.transforms.Resize(size=resolution)
        vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
        num_frames = len(vr)
        frame_indices = self.get_index(num_frames, num_segments)
        images_group = list()
        for frame_index in frame_indices:
            img = Image.fromarray(vr[frame_index].asnumpy())
            images_group.append(transforms(img))
        return images_group

    def get_index(self, num_frames, num_segments):
        seg_size = float(num_frames - 1) / num_segments
        start = int(seg_size / 2)
        offsets = np.array([
            start + int(np.round(seg_size * idx)) for idx in range(num_segments)
        ])
        return offsets

    def generate_inner(self, message, dataset=None):
        from tasks.eval.model_utils import pllava_answer
        from tasks.eval.eval_utils import conv_templates
        question, video = self.message_to_promptvideo(message)

        img_list = self.load_video(video, num_segments=self.nframe, resolution=self.RESOLUTION)
        if self.model_path == 'ermu2001/pllava-34b':
            # the 34B model uses slightly different conversation modes
            if dataset in ['Video-MME', 'MVBench', 'MVBench_MP4']:
                # MCQ dataset
                conv_mode = 'eval_mvbench_llavanext'
            else:
                # VQA dataset
                conv_mode = 'eval_videoqa_llavanext'
        else:
            if dataset in ['Video-MME', 'MVBench', 'MVBench_MP4']:
                # MCQ dataset
                conv_mode = 'eval_mvbench'
            else:
                # VQA dataset
                conv_mode = 'eval_videoqabench'
        conv = conv_templates[conv_mode].copy()
        if dataset in ['MVBench', 'MVBench_MP4']:
            conv.user_query(message[1]['value'], message[0]['value'], message[-2]['value'], is_mm=True)
            conv.assistant_response(message[-1]['value'])
        else:
            conv.user_query(question, is_mm=True)

        llm_response, conv = pllava_answer(
            conv=conv, model=self.model, processor=self.processor,
            do_sample=False, img_list=img_list, max_new_tokens=512, print_res=False)
        if dataset in ['MVBench', 'MVBench_MP4']:
            llm_response = '(' + ''.join(llm_response.split(message[-1]['value'])[1:])
        return llm_response
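The frame sampler above picks the middle of each of num_segments equal slices of the clip. A standalone re-implementation of the same arithmetic (so it runs without the PLLaVA dependencies) as a quick check:

import numpy as np

def get_index(num_frames, num_segments):
    # mirrors PLLaVA.get_index: one index per segment, offset to the segment centre
    seg_size = float(num_frames - 1) / num_segments
    start = int(seg_size / 2)
    return np.array([start + int(np.round(seg_size * idx)) for idx in range(num_segments)])

print(get_index(161, 16))  # [  5  15  25 ... 145 155] -- 16 evenly spaced frame indices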
VLMEvalKit/vlmeval/vlm/video_llm/video_chatgpt.py
0 → 100644
View file @
81028572
import torch
import os
import warnings
import copy as cp
import numpy as np
import sys
from ..base import BaseModel
from ...smp import isimg, listinstr
from ...dataset import DATASET_TYPE
from huggingface_hub import snapshot_download


class VideoChatGPT(BaseModel):

    INSTALL_REQ = True
    INTERLEAVE = False
    VIDEO_LLM = True

    def __init__(self, model_path='MBZUAI/Video-ChatGPT-7B', dir_root=None, **kwargs):
        assert model_path is not None
        sys.path.append(dir_root)
        try:
            from video_chatgpt.eval.model_utils import initialize_model
        except:
            warnings.warn('Please first install requirements and set the root path to use Video-ChatGPT. '
                          'Follow the instructions at https://github.com/mbzuai-oryx/Video-ChatGPT.')
            sys.exit(-1)

        base_model_path = snapshot_download('mmaaz60/LLaVA-7B-Lightening-v1-1')
        projection_path = snapshot_download(model_path)
        projection_name = 'video_chatgpt-7B.bin'
        projection_path = os.path.join(projection_path, projection_name)
        model, vision_tower, tokenizer, image_processor, video_token_len = initialize_model(
            base_model_path, projection_path)
        self.tokenizer = tokenizer
        self.model = model
        self.processor = image_processor
        self.context_len = video_token_len
        self.kwargs = kwargs
        self.vision_tower = vision_tower
        self.nframe = 8

    def get_model_output(self, model, video_processor, tokenizer, video, qs):
        from video_chatgpt.eval.model_utils import load_video
        from video_chatgpt.inference import video_chatgpt_infer

        conv_mode = 'video-chatgpt_v1'
        video_frames = load_video(video)

        # Run inference on the video and questions
        output = video_chatgpt_infer(
            video_frames, qs, conv_mode, model,
            self.vision_tower, tokenizer, video_processor, self.context_len,
        )
        return output

    def generate_inner(self, message, dataset=None):
        question, video = self.message_to_promptvideo(message)
        response = self.get_model_output(self.model, self.processor, self.tokenizer, video, question)
        return response
VLMEvalKit/vlmeval/vlm/video_llm/video_llava.py
0 → 100644
View file @
81028572
import torch
import warnings
import copy as cp
import numpy as np
import sys
from ..base import BaseModel
from ...smp import isimg, listinstr
from ...dataset import DATASET_TYPE


def read_video_pyav(container, indices):
    frames = []
    container.seek(0)
    start_index = indices[0]
    end_index = indices[-1]
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            break
        if i >= start_index and i in indices:
            frames.append(frame)
    return np.stack([x.to_ndarray(format='rgb24') for x in frames])


class VideoLLaVA_HF(BaseModel):

    INSTALL_REQ = False
    INTERLEAVE = False
    VIDEO_LLM = True

    def __init__(self, model_path='LanguageBind/Video-LLaVA-7B-hf', **kwargs):
        try:
            from transformers import VideoLlavaProcessor, VideoLlavaForConditionalGeneration
        except:
            warnings.warn('Please install the latest version of transformers. '
                          'You can install it with `pip install transformers==4.42.0` '
                          'or `pip install --upgrade git+https://github.com/huggingface/transformers.git`.')
            sys.exit(-1)

        assert model_path is not None
        self.model_path = model_path
        self.model = VideoLlavaForConditionalGeneration.from_pretrained(model_path)
        self.model.eval().cuda()
        self.processor = VideoLlavaProcessor.from_pretrained(model_path)
        self.kwargs = kwargs
        self.nframe = 8
        torch.cuda.empty_cache()

    def generate_inner(self, message, dataset=None):
        import av
        question, video = self.message_to_promptvideo(message)
        container = av.open(video)

        # sample 8 frames uniformly from the video
        total_frames = container.streams.video[0].frames
        indices = np.arange(0, total_frames, total_frames / self.nframe).astype(int)
        clip = read_video_pyav(container, indices)

        prompt = f'USER: <video>\n{question} ASSISTANT:'
        inputs = self.processor(text=prompt, videos=clip, return_tensors='pt').to(self.model.device)

        # Generation args -- deprecated
        generation_args = {
            'max_new_tokens': 1024,
            'temperature': 0.2,
            'do_sample': True,
        }
        generation_args.update(self.kwargs)
        generate_ids = self.model.generate(**inputs, **generation_args)
        generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
        response = self.processor.batch_decode(
            generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        return response


class VideoLLaVA(BaseModel):

    INSTALL_REQ = True
    INTERLEAVE = False
    VIDEO_LLM = True

    def __init__(self, model_path='LanguageBind/Video-LLaVA-7B', **kwargs):
        assert model_path is not None
        try:
            from videollava.conversation import conv_templates, SeparatorStyle
            from videollava.constants import DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX
            from videollava.constants import DEFAULT_VID_START_TOKEN, DEFAULT_VID_END_TOKEN
            from videollava.mm_utils import get_model_name_from_path, tokenizer_image_token, KeywordsStoppingCriteria
            from videollava.model.builder import load_pretrained_model
            from videollava.model.language_model.llava_llama import LlavaLlamaForCausalLM
            from videollava.train.train import smart_tokenizer_and_embedding_resize
        except:
            warnings.warn('Please install Video-LLaVA from https://github.com/FangXinyu-0913/Video-LLaVA.')
            sys.exit(-1)

        model_base = None
        model_name = model_path.split('/')[-1]
        tokenizer, model, processor, context_len = load_pretrained_model(model_path, model_base, model_name)
        self.tokenizer = tokenizer
        self.model = model
        self.processor = processor
        self.context_len = context_len
        self.kwargs = kwargs
        self.nframe = 8

    def get_model_output(self, model, video_processor, tokenizer, video, qs):
        from videollava.conversation import conv_templates, SeparatorStyle
        from videollava.constants import DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX
        from videollava.constants import DEFAULT_VID_START_TOKEN, DEFAULT_VID_END_TOKEN
        from videollava.mm_utils import tokenizer_image_token, KeywordsStoppingCriteria

        if model.config.mm_use_im_start_end:
            qs = DEFAULT_VID_START_TOKEN + ''.join([DEFAULT_IMAGE_TOKEN] * 8) + DEFAULT_VID_END_TOKEN + '\n' + qs
        else:
            qs = ''.join([DEFAULT_IMAGE_TOKEN] * 8) + '\n' + qs

        conv_mode = 'llava_v1'
        device = torch.device('cuda')
        conv = conv_templates[conv_mode].copy()
        conv.append_message(conv.roles[0], qs)
        conv.append_message(conv.roles[1], None)
        prompt = conv.get_prompt()

        video_tensor = video_processor.preprocess(video, return_tensors='pt')['pixel_values'][0].half().to(device)
        input_ids = tokenizer_image_token(
            prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(device)
        stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
        keywords = [stop_str]
        stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)

        with torch.inference_mode():
            output_ids = model.generate(
                input_ids,
                images=[video_tensor],
                do_sample=True,
                temperature=0.2,
                max_new_tokens=1024,
                use_cache=True,
                stopping_criteria=[stopping_criteria])

        input_token_len = input_ids.shape[1]
        n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
        if n_diff_input_output > 0:
            print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
        outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
        outputs = outputs.strip()
        if outputs.endswith(stop_str):
            outputs = outputs[:-len(stop_str)]
        outputs = outputs.strip()
        return outputs

    def generate_inner(self, message, dataset=None):
        question, video = self.message_to_promptvideo(message)
        response = self.get_model_output(self.model, self.processor['video'], self.tokenizer, video, question)
        return response
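Of the two classes above, VideoLLaVA_HF is the lighter entry point (plain transformers plus PyAV, no external repo). A hedged usage sketch (the message format and the video path are assumptions, as in the other wrappers in this commit):

from vlmeval.vlm.video_llm.video_llava import VideoLLaVA_HF

model = VideoLLaVA_HF(model_path='LanguageBind/Video-LLaVA-7B-hf', max_new_tokens=128)
message = [
    {'type': 'video', 'value': './demo/sample_video.mp4'},  # hypothetical local file
    {'type': 'text', 'value': 'What is the person in the video doing?'},
]
print(model.generate_inner(message))  # extra kwargs such as max_new_tokens are forwarded to generate()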
VLMEvalKit/vlmeval/vlm/video_llm/videochat2.py
0 → 100644
View file @
81028572
import torch
import warnings
import copy as cp
import numpy as np
import sys
import os.path as osp
import os
import requests
import shutil
import huggingface_hub
from transformers import StoppingCriteria, StoppingCriteriaList
from huggingface_hub import snapshot_download
from PIL import Image
from torchvision.transforms import PILToTensor
from torchvision import transforms
from peft import get_peft_model, LoraConfig, TaskType
from ..base import BaseModel
from ...smp import *
from ...dataset import DATASET_TYPE


def get_prompt(conv):
    ret = conv.system + conv.sep
    for role, message in conv.messages:
        if message:
            ret += role + ' ' + message + ' ' + conv.sep
        else:
            ret += role
    return ret


def get_prompt2(conv):
    ret = conv.system + conv.sep
    count = 0
    for role, message in conv.messages:
        count += 1
        if count == len(conv.messages):
            ret += role + ' ' + message
        else:
            if message:
                ret += role + ' ' + message + ' ' + conv.sep
            else:
                ret += role
    return ret


class StoppingCriteriaSub(StoppingCriteria):

    def __init__(self, stops=[], encounters=1):
        super().__init__()
        self.stops = stops

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
        for stop in self.stops:
            if torch.all((stop == input_ids[0][-len(stop):])).item():
                return True
        return False


class VideoChat2_HD(BaseModel):

    INSTALL_REQ = True
    INTERLEAVE = False
    VIDEO_LLM = True

    def __init__(self, model_path='OpenGVLab/VideoChat2_HD_stage4_Mistral_7B',
                 root='./Ask-Anything', config_file='./configs/videochat2_hd.json', **kwargs):
        self.config_file = config_file
        self.root = root
        self.model_path = model_path

        if root is None:
            warnings.warn('Please set `root` to the Ask-Anything directory, '
                          'which is cloned from here: https://github.com/OpenGVLab/Ask-Anything')
            sys.exit(-1)

        sys.path.append(osp.join(root, 'video_chat2'))
        try:
            from utils.config import Config
            from utils.easydict import EasyDict
            from models import VideoChat2_it_hd_mistral
            from dataset.hd_utils import HD_transform_padding, HD_transform_no_padding
        except:
            raise ImportError(
                'Please first install VideoChat2 and set the root path to use VideoChat2, '
                'which is cloned from here: https://github.com/OpenGVLab/Ask-Anything '
            )

        cfg = Config.from_file(self.config_file)

        def download_file(url, pth):
            destination_folder = pth
            # make sure the destination folder exists
            if not os.path.exists(destination_folder):
                os.makedirs(destination_folder)
            # get the file name
            filename = os.path.basename(url)
            destination_path = os.path.join(destination_folder, filename)
            if os.path.exists(destination_path):
                print(f'File downloaded! No repeat download needed. Saved in {destination_path}')
                return
            # download the file
            response = requests.get(url, stream=True)
            if response.status_code == 200:
                with open(destination_path, 'wb') as file:
                    response.raw.decode_content = True
                    shutil.copyfileobj(response.raw, file)
                print(f'File downloaded and saved to {destination_path}')
            else:
                print(f'Download failed, status code: {response.status_code}')

        hf_token = os.environ.get('HUGGINGFACE_TOKEN')
        huggingface_hub.login(hf_token)
        videochat2_model_path = snapshot_download(repo_id=cfg.model.videochat2_model_path, repo_type='model')
        cfg.model.videochat2_model_path = osp.join(videochat2_model_path, 'videochat2_mistral_7b_stage2.pth')
        mistral_model_path = snapshot_download(repo_id=cfg.model.mistral_model_path, repo_type='model')
        cfg.model.mistral_model_path = mistral_model_path
        vit_blip_model_path = snapshot_download(repo_id=cfg.model.vit_blip_model_path, repo_type='model')
        cfg.model.vit_blip_model_path = osp.join(vit_blip_model_path, 'umt_l16_qformer.pth')

        model = VideoChat2_it_hd_mistral(config=cfg.model)

        peft_config = LoraConfig(
            task_type=TaskType.CAUSAL_LM, inference_mode=False,
            r=16, lora_alpha=32, lora_dropout=0.,
            target_modules=[
                'q_proj', 'k_proj', 'v_proj', 'o_proj',
                'gate_proj', 'up_proj', 'down_proj', 'lm_head'
            ]
        )
        model.mistral_model = get_peft_model(model.mistral_model, peft_config)
        stage4_model_path = snapshot_download(repo_id=model_path, repo_type='model')
        state_dict = torch.load(osp.join(stage4_model_path, 'videochat2_hd_mistral_7b_stage4.pth'), 'cuda')

        if 'model' in state_dict.keys():
            model.load_state_dict(state_dict['model'], strict=False)
        else:
            model.load_state_dict(state_dict, strict=False)

        model = model.to(torch.device('cuda'))
        model = model.eval()
        self.model = model

        # position embedding
        self.nframe = 16
        self.resolution = 224
        self.hd_num = 6
        new_pos_emb = self.get_sinusoid_encoding_table(
            n_position=(self.resolution // 16) ** 2 * self.nframe, cur_frame=self.nframe)
        self.model.vision_encoder.encoder.pos_embed = new_pos_emb
        self.hd_transform = HD_transform_no_padding

        mean = (0.485, 0.456, 0.406)
        std = (0.229, 0.224, 0.225)
        self.transform = transforms.Compose([
            transforms.Lambda(lambda x: x.float().div(255.0)),
            transforms.Normalize(mean, std)
        ])

    def get_sinusoid_encoding_table(self, n_position=784, d_hid=1024,
                                    cur_frame=8, ckpt_num_frame=4, pre_n_position=784):
        ''' Sinusoid position encoding table '''
        # TODO: make it with torch instead of numpy
        def get_position_angle_vec(position):
            return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)]

        # generate checkpoint position embedding
        sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(pre_n_position)])
        sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
        sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1
        sinusoid_table = torch.tensor(sinusoid_table, dtype=torch.float, requires_grad=False).unsqueeze(0)

        print(f'n_position: {n_position}')
        print(f'pre_n_position: {pre_n_position}')

        if n_position != pre_n_position:
            T = ckpt_num_frame  # checkpoint frame
            P = 14  # checkpoint size
            C = d_hid
            new_P = int((n_position // cur_frame) ** 0.5)  # testing size
            if new_P != 14:
                print(f'Pretraining uses 14x14, but current version is {new_P}x{new_P}')
                print('Interpolate the position embedding')
                sinusoid_table = sinusoid_table.reshape(-1, T, P, P, C)
                sinusoid_table = sinusoid_table.reshape(-1, P, P, C).permute(0, 3, 1, 2)
                sinusoid_table = torch.nn.functional.interpolate(
                    sinusoid_table, size=(new_P, new_P), mode='bicubic', align_corners=False)
                # BT, C, H, W -> BT, H, W, C -> B, T, H, W, C
                sinusoid_table = sinusoid_table.permute(0, 2, 3, 1).reshape(-1, T, new_P, new_P, C)
                sinusoid_table = sinusoid_table.flatten(1, 3)  # B, THW, C

        if cur_frame != ckpt_num_frame:
            print(f'Pretraining uses 4 frames, but current frame is {cur_frame}')
            print('Interpolate the position embedding')
            T = ckpt_num_frame  # checkpoint frame
            new_T = cur_frame  # testing frame
            # interpolate
            P = int((n_position // cur_frame) ** 0.5)  # testing size
            C = d_hid
            sinusoid_table = sinusoid_table.reshape(-1, T, P, P, C)
            sinusoid_table = sinusoid_table.permute(0, 2, 3, 4, 1).reshape(-1, C, T)  # BHW, C, T
            sinusoid_table = torch.nn.functional.interpolate(sinusoid_table, size=new_T, mode='linear')
            sinusoid_table = sinusoid_table.reshape(1, P, P, C, new_T).permute(0, 4, 1, 2, 3)  # B, T, H, W, C
            sinusoid_table = sinusoid_table.flatten(1, 3)  # B, THW, C

        return sinusoid_table

    def get_index(self, bound, fps, max_frame, first_idx=0):
        if bound:
            start, end = bound[0], bound[1]
        else:
            start, end = -100000, 100000
        start_idx = max(first_idx, round(start * fps))
        end_idx = min(round(end * fps), max_frame)
        seg_size = float(end_idx - start_idx) / self.nframe
        frame_indices = np.array([
            int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
            for idx in range(self.nframe)
        ])
        return frame_indices

    def read_video(self, video_path, bound=None):
        from decord import VideoReader, cpu
        vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
        max_frame = len(vr) - 1
        fps = float(vr.get_avg_fps())

        frame_indices = self.get_index(bound, fps, max_frame, first_idx=0)
        frames = vr.get_batch(frame_indices)
        frames = frames.permute(0, 3, 1, 2)
        frames = self.hd_transform(frames.float(), image_size=self.resolution, hd_num=self.hd_num)
        torch_imgs = self.transform(frames)
        return torch_imgs

    def ask(self, text, conv):
        conv.messages.append([conv.roles[0], text])

    def get_context_emb(self, conv, model, img_list, answer_prompt=None, print_res=False):
        if answer_prompt:
            prompt = get_prompt2(conv)
        else:
            prompt = get_prompt(conv)
        if print_res:
            print(prompt)
        if '<VideoHere>' in prompt:
            prompt_segs = prompt.split('<VideoHere>')
        else:
            prompt_segs = prompt.split('<ImageHere>')
        assert len(prompt_segs) == len(img_list) + 1, 'Unmatched numbers of image placeholders and images.'
        with torch.no_grad():
            seg_tokens = [
                model.mistral_tokenizer(
                    seg, return_tensors='pt', add_special_tokens=(i == 0)).to('cuda').input_ids
                # only add bos to the first seg
                for i, seg in enumerate(prompt_segs)
            ]
            seg_embs = [model.mistral_model.base_model.model.model.embed_tokens(seg_t) for seg_t in seg_tokens]
            # seg_embs = [model.mistral_model.model.embed_tokens(seg_t) for seg_t in seg_tokens]
        mixed_embs = [emb for pair in zip(seg_embs[:-1], img_list) for emb in pair] + [seg_embs[-1]]
        mixed_embs = torch.cat(mixed_embs, dim=1)
        return mixed_embs

    def answer(self, conv, model, img_list, do_sample=True, max_new_tokens=500, num_beams=1, min_length=1,
               top_p=0.9, repetition_penalty=1.0, length_penalty=1, temperature=1.0,
               answer_prompt=None, print_res=False):
        stop_words_ids = [
            torch.tensor([2]).to('cuda'),
            torch.tensor([29871, 2]).to('cuda')]  # '</s>' can be encoded in two different ways.
        stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])
        conv.messages.append([conv.roles[1], answer_prompt])
        embs = self.get_context_emb(conv, model, img_list, answer_prompt=answer_prompt, print_res=print_res)
        with torch.no_grad():
            outputs = model.mistral_model.generate(
                inputs_embeds=embs,
                max_new_tokens=max_new_tokens,
                stopping_criteria=stopping_criteria,
                num_beams=num_beams,
                do_sample=do_sample,
                min_length=min_length,
                top_p=top_p,
                repetition_penalty=repetition_penalty,
                length_penalty=length_penalty,
                temperature=temperature,
            )
        output_token = outputs[0]
        if output_token[0] == 0:
            # the model might output an unknown token <unk> at the beginning; remove it
            output_token = output_token[1:]
        if output_token[0] == 1:
            # some users find that there is a start token <s> at the beginning; remove it
            output_token = output_token[1:]
        output_text = model.mistral_tokenizer.decode(output_token, add_special_tokens=False)
        output_text = output_text.split('</s>')[0]  # remove the stop sign </s>
        # output_text = output_text.split('[/INST]')[-1].strip()
        conv.messages[-1][1] = output_text + '</s>'
        return output_text, output_token.cpu().numpy()

    def infer_data(self, data_sample,
                   system=' ',
                   question_prompt='',   # appended to the end of the question
                   answer_prompt=None,   # prepended to the beginning of the answer
                   system_q=False,       # whether to add the question to the system prompt for the QFormer
                   print_res=True,
                   system_llm=False):
        assert system_q is False, 'do not support system_q now'
        video = data_sample['video']
        T_, C, H, W = video.shape
        video = video.reshape(1, T_, C, H, W).to('cuda')

        video_list = []
        with torch.no_grad():
            if system_q:
                raise NotImplementedError
            else:
                video_emb, _, _ = self.model.encode_img(video, system)
            video_list.append(video_emb[0])

        question = data_sample['question']
        from utils.easydict import EasyDict
        chat = EasyDict({
            'system': system,
            'roles': ('[INST]', '[/INST]'),
            'messages': [],
            'sep': ''
        })

        if data_sample['subtitle'] != '':
            subtitle = f"This video's subtitles are listed below: {data_sample['subtitle']}"
            chat.messages.append([chat.roles[0], f'{subtitle}\n<Video><VideoHere></Video> [/INST]'])
        else:
            chat.messages.append([chat.roles[0], '<Video><VideoHere></Video> [/INST]'])

        if system_llm:
            prompt = system + question + question_prompt
        else:
            prompt = question + question_prompt

        self.ask(prompt, chat)
        llm_message = self.answer(
            conv=chat, model=self.model, do_sample=False,
            img_list=video_list, max_new_tokens=100,
            answer_prompt=answer_prompt, print_res=print_res)[0]
        return llm_message.strip()

    def qa_template(self, data):
        question = data.split('Answer:')[0].split('\n')[0] + '\n'
        question += 'Options:\n'
        choices = data.split('Answer:')[0].split('\n')[1:]
        choices = [item for item in choices if item != '']  # drop empty lines
        for idx, c in enumerate(choices):
            cur_choice, cur_text = c[0], c[3:]
            question += f'({cur_choice}) {cur_text}\n'
        question = question.rstrip()
        return question

    def split_subtitle(self, data):
        if 'This video\'s subtitles are listed below' in data:
            # locate the start and end of the subtitle block
            start_marker = 'This video\'s subtitles are listed below:'
            end_marker = 'Select the best answer to the following multiple-choice question based on the video.'
            start_index = data.find(start_marker) + len(start_marker)
            end_index = data.find(end_marker)
            # extract the subtitle section
            subtitle = data[start_index:end_index].strip()
            return subtitle
        else:
            return ''

    def generate_inner(self, message, dataset=None):
        if dataset == 'Video-MME':
            _, video = self.message_to_promptvideo(message)
            torch_imgs = self.read_video(video)
            subtitle = self.split_subtitle(message[-2]['value'])
            question = self.qa_template(message[-1]['value'])
            example = {
                'subtitle': subtitle,
                'video': torch_imgs,
                'question': question
            }
            pred_option = self.infer_data(
                example, ' ',
                question_prompt='\nOnly give the best option.',
                answer_prompt='Best option:(',
                system_q=False,
                print_res=False,
                system_llm=True
            )
            return_message = '(' + pred_option.split('\n')[0]
            return return_message
        elif dataset == 'MVBench' or dataset == 'MVBench_MP4':
            _, video = self.message_to_promptvideo(message)
            torch_imgs = self.read_video(video)
            example = {
                'subtitle': '',
                'video': torch_imgs,
                'question': message[1]['value']
            }
            pred_option = self.infer_data(
                example, message[0]['value'],
                question_prompt='\nOnly give the best option.',
                answer_prompt='Best option:(',
                system_q=False,
                print_res=False,
                system_llm=True
            )
            return_message = '(' + pred_option.split('\n')[0]
            return return_message
        else:
            question, video = self.message_to_promptvideo(message)
            torch_imgs = self.read_video(video)
            example = {
                'subtitle': '',
                'video': torch_imgs,
                'question': f'Question: {question}\nAnswer:'
            }
            pred_result = self.infer_data(
                example, ' ',
                system_q=False,
                print_res=False,
                system_llm=False
            )
            return pred_result
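A worked check of the position-table sizes produced by get_sinusoid_encoding_table with the values VideoChat2_HD passes in above (pure arithmetic, no model required):

resolution, patch, nframe, ckpt_frames, d_hid = 224, 16, 16, 4, 1024
n_position = (resolution // patch) ** 2 * nframe   # 14 * 14 * 16 = 3136 positions requested
pre_n_position = 784                               # 14 * 14 * 4 positions in the checkpoint table
new_P = int((n_position // nframe) ** 0.5)         # 14, so the spatial interpolation branch is skipped
print(n_position, pre_n_position, new_P)           # 3136 784 14
# only the temporal branch runs (4 -> 16 frames), so the returned table has shape (1, 3136, 1024)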