from PIL import Image
import torch
from .base import BaseModel
from ..smp import *
class Phi3Vision(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
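# Harness flags: INSTALL_REQ marks models that need extra packages beyond the base
# requirements, and INTERLEAVE marks models that accept interleaved multi-image inputs
# (rather than a single image per query).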
def __init__(self, model_path='microsoft/Phi-3-vision-128k-instruct', **kwargs):
try:
from transformers import AutoProcessor, AutoModelForCausalLM
except ImportError:
warnings.warn('Please install the latest version of transformers.')
sys.exit(-1)
model = AutoModelForCausalLM.from_pretrained(
model_path, device_map='cuda', trust_remote_code=True, torch_dtype='auto').eval()
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
self.model = model
self.processor = processor
self.kwargs = kwargs
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
image = Image.open(image_path).convert('RGB')
messages = [
{'role': 'user', 'content': f'<|image_1|>\n{prompt}'}
]
prompt = self.processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = self.processor(prompt, [image], return_tensors='pt').to('cuda')
generation_args = {
'max_new_tokens': 500,
'temperature': 0.0,
'do_sample': False,
}
generation_args.update(self.kwargs)
generate_ids = self.model.generate(
**inputs,
eos_token_id=self.processor.tokenizer.eos_token_id,
**generation_args
)
generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
response = self.processor.batch_decode(
generate_ids,
skip_special_tokens=True,
clean_up_tokenization_spaces=False
)[0]
return response
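# Illustrative shape of the `message` argument consumed above (a flat list of typed parts):
#   [{'type': 'image', 'value': '/path/to/image.jpg'},
#    {'type': 'text',  'value': 'Describe the image.'}]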
def chat_inner(self, message, dataset=None):
messages = []
image_cnt = 1
image_list = []
for msg in message:
content = ''
# If message is just text in the conversation
if len(msg['content']) == 1 and msg['content'][0]['type'] == 'text':
msg_new = {'role': msg['role'], 'content': msg['content'][0]['value']}
messages.append(msg_new)
continue
# If both image & text are present
for x in msg['content']:
if x['type'] == 'text':
content += x['value']
elif x['type'] == 'image':
image = Image.open(x['value']).convert('RGB')
content += f'<|image_{image_cnt}|>\n'
image_list.append(image)
image_cnt += 1
msg_new = {'role': msg['role'], 'content': content}
messages.append(msg_new)
prompt = self.processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = self.processor(prompt, image_list, return_tensors='pt').to('cuda')
generation_args = {
'max_new_tokens': 500,
'temperature': 0.0,
'do_sample': False,
}
generation_args.update(self.kwargs)
generate_ids = self.model.generate(
**inputs,
eos_token_id=self.processor.tokenizer.eos_token_id,
**generation_args
)
generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
response = self.processor.batch_decode(
generate_ids,
skip_special_tokens=True,
clean_up_tokenization_spaces=False
)[0]
return response
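# Illustrative multi-turn input for chat_inner: a list of role/content turns, e.g.
#   [{'role': 'user', 'content': [{'type': 'image', 'value': 'cat.jpg'},
#                                 {'type': 'text', 'value': 'What animal is this?'}]},
#    {'role': 'assistant', 'content': [{'type': 'text', 'value': 'A cat.'}]},
#    {'role': 'user', 'content': [{'type': 'text', 'value': 'What color is it?'}]}]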
class Phi3_5Vision(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
def __init__(self, model_path='microsoft/Phi-3.5-vision-instruct', **kwargs):
try:
from transformers import AutoProcessor, AutoModelForCausalLM
except ImportError:
warnings.warn('Please install the latest version of transformers.')
sys.exit(-1)
model = AutoModelForCausalLM.from_pretrained(
model_path, device_map='cuda', trust_remote_code=True, torch_dtype='auto',
_attn_implementation='flash_attention_2').eval()
# For best performance, use num_crops=4 for multi-frame inputs and num_crops=16 for single-frame inputs.
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True, num_crops=4)
self.model = model
self.processor = processor
self.kwargs = kwargs
def generate_inner(self, message, dataset=None):
prompt = '\n'.join([msg['value'] for msg in message if msg['type'] == 'text'])
images = [Image.open(msg['value']).convert('RGB') for msg in message if msg['type'] == 'image']
num_images = len(images)
placeholder = ''
for i in range(1, num_images + 1):
placeholder += f'<|image_{i}|>\n'
messages = [
{'role': 'user', 'content': placeholder + prompt}
]
prompt = self.processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = self.processor(prompt, images, return_tensors='pt').to('cuda')
generation_args = {
'max_new_tokens': 1000,
'temperature': 0.0,
'do_sample': False,
}
generation_args.update(self.kwargs)
generate_ids = self.model.generate(
**inputs,
eos_token_id=self.processor.tokenizer.eos_token_id,
**generation_args
)
# remove input tokens
generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
response = self.processor.batch_decode(
generate_ids,
skip_special_tokens=True,
clean_up_tokenization_spaces=False
)[0]
return response
import torch
from PIL import Image
from .base import BaseModel
from ..smp import *
import warnings
from huggingface_hub import snapshot_download
class Pixtral(BaseModel):
INSTALL_REQ = False
INTERLEAVE = True
def __init__(self, model_path='mistralai/Pixtral-12B-2409', **kwargs):
self.model_path = model_path
try:
from mistral_inference.transformer import Transformer
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
except ImportError as err:
warnings.warn('Please install `mistral-inference` and `mistral_common`')
raise err
if get_cache_path(model_path) is None:
snapshot_download(repo_id=model_path)
cache_path = get_cache_path(self.model_path)
self.tokenizer = MistralTokenizer.from_file(f'{cache_path}/tekken.json')
model = Transformer.from_folder(cache_path, device='cpu')
model.cuda()
self.model = model
self.max_tokens = 512
def generate_inner(self, message, dataset=None):
try:
from mistral_inference.generate import generate
from mistral_common.protocol.instruct.messages import UserMessage, TextChunk, ImageURLChunk
from mistral_common.protocol.instruct.request import ChatCompletionRequest
except ImportError as err:
warnings.warn('Please install `mistral-inference` and `mistral_common`')
raise err
msg_new = []
for msg in message:
tp, val = msg['type'], msg['value']
if tp == 'text':
msg_new.append(TextChunk(text=val))
elif tp == 'image':
b64 = encode_image_file_to_base64(val)
image_url = f'data:image/jpeg;base64,{b64}'
msg_new.append(ImageURLChunk(image_url=image_url))
completion_request = ChatCompletionRequest(messages=[UserMessage(content=msg_new)])
encoded = self.tokenizer.encode_chat_completion(completion_request)
images = encoded.images
tokens = encoded.tokens
out_tokens, _ = generate(
[tokens],
self.model,
images=[images],
max_tokens=self.max_tokens,
temperature=0,
eos_id=self.tokenizer.instruct_tokenizer.tokenizer.eos_id)
result = self.tokenizer.decode(out_tokens[0])
return result
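# Note: Pixtral receives every image as a base64 data URL (ImageURLChunk) inside a single
# UserMessage, so text and image chunks keep their original interleaved order.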
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import warnings
import os.path as osp
from PIL import Image
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE
class QH_360VL(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
def __init__(self, model_path='qihoo360/360VL-70B', **kwargs):
assert model_path is not None
self.model_path = model_path
self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
self.model = AutoModelForCausalLM.from_pretrained(model_path,
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
device_map='auto',
trust_remote_code=True).eval()
vision_tower = self.model.get_vision_tower()
vision_tower.load_model()
vision_tower.to(device='cuda', dtype=torch.float16)
self.image_processor = vision_tower.image_processor
self.tokenizer.pad_token = self.tokenizer.eos_token
self.kwargs = kwargs
warnings.warn(f'Received the following kwargs: {self.kwargs}; they will be used as the generation config.')
torch.cuda.empty_cache()
def generate(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
print(prompt)
image = Image.open(image_path).convert('RGB')
terminators = [
self.tokenizer.convert_tokens_to_ids('<|eot_id|>',)
]
inputs = self.model.build_conversation_input_ids(self.tokenizer,
query=prompt,
image=image,
image_processor=self.image_processor)
input_ids = inputs['input_ids'].to(device='cuda', non_blocking=True)
images = inputs['image'].to(dtype=torch.float16, device='cuda', non_blocking=True)
output_ids = self.model.generate(input_ids=input_ids,
images=images,
do_sample=False,
num_beams=1,
max_new_tokens=512,
eos_token_id=terminators,
use_cache=True)
input_token_len = input_ids.shape[1]
outputs = self.tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
response = outputs.strip()
return response
from .model import Qwen2VLChat
from .prompt import Qwen2VLPromptMixin
from __future__ import annotations
import os
import warnings
import torch
from ..base import BaseModel
from .prompt import Qwen2VLPromptMixin
def ensure_image_url(image: str) -> str:
prefixes = ['http://', 'https://', 'file://', 'data:image;']
if any(image.startswith(prefix) for prefix in prefixes):
return image
if os.path.exists(image):
return 'file://' + image
raise ValueError(f'Invalid image: {image}')
def ensure_video_url(video: str) -> str:
prefixes = ['http://', 'https://', 'file://', 'data:video;']
if any(video.startswith(prefix) for prefix in prefixes):
return video
if os.path.exists(video):
return 'file://' + video
raise ValueError(f'Invalid video: {video}')
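# Illustrative behavior of the two URL helpers above (assuming the local file exists):
#   ensure_image_url('/tmp/cat.jpg')            -> 'file:///tmp/cat.jpg'
#   ensure_image_url('https://x.org/cat.jpg')   -> 'https://x.org/cat.jpg'
#   ensure_video_url('/tmp/clip.mp4')           -> 'file:///tmp/clip.mp4'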
class Qwen2VLChat(Qwen2VLPromptMixin, BaseModel):
INSTALL_REQ = False
INTERLEAVE = True
VIDEO_LLM = True
def __init__(
self,
model_path: str,
min_pixels: int | None = None,
max_pixels: int | None = None,
max_new_tokens=2048,
top_p=0.001,
top_k=1,
temperature=0.01,
repetition_penalty=1.0,
use_custom_prompt: bool = True,
system_prompt: str | None = None,
verbose: bool = True,
):
super().__init__(use_custom_prompt=use_custom_prompt)
self.min_pixels = min_pixels
self.max_pixels = max_pixels
self.generate_kwargs = dict(
max_new_tokens=max_new_tokens,
top_p=top_p,
top_k=top_k,
temperature=temperature,
repetition_penalty=repetition_penalty,
)
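# With top_k=1 and a near-zero temperature, these defaults make decoding effectively
# greedy, so evaluation outputs stay close to deterministic.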
self.system_prompt = system_prompt
self.verbose = verbose
self.fps = 2.0
from transformers import Qwen2VLForConditionalGeneration, Qwen2VLProcessor
assert model_path is not None
self.model_path = model_path
self.processor = Qwen2VLProcessor.from_pretrained(model_path)
if '72b' not in self.model_path.lower():
self.model = Qwen2VLForConditionalGeneration.from_pretrained(
model_path, torch_dtype='auto', device_map='cpu', attn_implementation='flash_attention_2'
)
self.model.cuda().eval()
else:
self.model = Qwen2VLForConditionalGeneration.from_pretrained(
model_path, torch_dtype='auto', device_map='auto', attn_implementation='flash_attention_2'
)
self.model.eval()
torch.cuda.empty_cache()
def _prepare_content(self, inputs: list[dict[str, str]], dataset: str | None = None) -> list[dict[str, str]]:
"""
inputs list[dict[str, str]], each dict has keys: ['type', 'value']
"""
content = []
for s in inputs:
if s['type'] == 'image':
item = {'type': 'image', 'image': ensure_image_url(s['value'])}
if dataset == 'OCRBench':
item['min_pixels'] = 10 * 10 * 28 * 28
warnings.warn(f"OCRBench dataset uses custom min_pixels={item['min_pixels']}")
if self.max_pixels is not None:
item['max_pixels'] = self.max_pixels
else:
if self.min_pixels is not None:
item['min_pixels'] = self.min_pixels
if self.max_pixels is not None:
item['max_pixels'] = self.max_pixels
elif s['type'] == 'video':
item = {'type': 'video', 'video': ensure_video_url(s['value'])}
if self.fps is not None:
item['fps'] = self.fps
elif s['type'] == 'text':
item = {'type': 'text', 'text': s['value']}
else:
raise ValueError(f"Invalid message type: {s['type']}, {s}")
content.append(item)
return content
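# Illustrative output of _prepare_content for one image plus a question (the pixel limits
# are added only when min_pixels / max_pixels are configured):
#   [{'type': 'image', 'image': 'file:///path/to/img.jpg'},
#    {'type': 'text', 'text': 'What is shown in the image?'}]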
def generate_inner(self, message, dataset=None):
try:
from qwen_vl_utils import process_vision_info
except ImportError:
warnings.warn("qwen_vl_utils not found, please install it via 'pip install qwen-vl-utils'")
raise
messages = []
if self.system_prompt is not None:
messages.append({'role': 'system', 'content': self.system_prompt})
messages.append({'role': 'user', 'content': self._prepare_content(message, dataset=dataset)})
if self.verbose:
print(f'\033[31m{messages}\033[0m')
text = self.processor.apply_chat_template([messages], tokenize=False, add_generation_prompt=True)
images, videos = process_vision_info([messages])
inputs = self.processor(text=text, images=images, videos=videos, padding=True, return_tensors='pt')
inputs = inputs.to('cuda')
generated_ids = self.model.generate(
**inputs,
**self.generate_kwargs,
)
generated_ids = [
output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, generated_ids)
]
out = self.processor.tokenizer.batch_decode(
generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
response = out[0]
if self.verbose:
print(f'\033[32m{response}\033[0m')
return response
from __future__ import annotations
class Qwen2VLPromptMixin:
"""
Mixin class for Qwen2VLChat to build custom prompt for different datasets.
Requires the following methods to be implemented in the subclass:
- dump_image(line, dataset: str) -> str | list[str]
Implements the following methods:
- use_custom_prompt(dataset: str) -> bool
- build_prompt(line, dataset: str) -> list[dict[str, str]]
"""
def __init__(self, *args, use_custom_prompt: bool = True, **kwargs) -> None:
super().__init__(*args, **kwargs)
self._use_custom_prompt = use_custom_prompt
def set_dump_image(self, dump_image_func):
self.dump_image_func = dump_image_func
def dump_image(self, line, dataset):
return self.dump_image_func(line)
def use_custom_prompt(self, dataset: str) -> bool:
from vlmeval.dataset import DATASET_TYPE
dataset_type = DATASET_TYPE(dataset, default=None)
if not self._use_custom_prompt:
return False
if dataset in {'MMMU_DEV_VAL', 'MMMU_TEST'}:
return True
if dataset_type == 'MCQ':
return True
if dataset_type == 'Y/N' and dataset in {'HallusionBench', 'POPE'}: # MME has its own prompt
return True
if dataset_type == 'VQA' and dataset not in {'MMVet'}: # MMVet VQA has its own prompt
return True
return False
def build_prompt(self, line, dataset: str) -> list[dict[str, str]]:
from vlmeval.dataset import DATASET_TYPE
if dataset in {'MMMU_DEV_VAL', 'MMMU_TEST'}:
return self._build_mmmu_prompt(line, dataset)
dataset_type = DATASET_TYPE(dataset, default=None)
if dataset_type == 'MCQ':
return self._build_mcq_prompt(line, dataset)
if dataset_type == 'Y/N':
return self._build_yorn_prompt(line, dataset)
if dataset_type == 'VQA':
return self._build_vqa_prompt(line, dataset)
raise ValueError(f'Unsupported dataset: {dataset}')
def _build_mmmu_prompt(self, line, dataset: str) -> list[dict[str, str]]:
"""change the prompt for MMMU dataset: keep all images at beginning."""
import string
import pandas as pd
tgt_path = self.dump_image(line, dataset)
question = line['question']
options = {cand: line[cand] for cand in string.ascii_uppercase if cand in line and not pd.isna(line[cand])}
options_prompt = 'Options:\n'
for key, item in options.items():
options_prompt += f'{key}. {item}\n'
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
prompt = ''
if hint is not None:
prompt += f'Hint: {hint}\n'
prompt += f'Question: {question}\n'
if len(options):
prompt += options_prompt
prompt += 'Please select the correct answer from the options above. \n'
prompt = prompt.rstrip()
msgs = []
if isinstance(tgt_path, list):
msgs.extend([dict(type='image', value=p) for p in tgt_path])
else:
msgs = [dict(type='image', value=tgt_path)]
msgs.append(dict(type='text', value=prompt))
return msgs
def _build_mcq_prompt(self, line, dataset: str) -> list[dict[str, str]]:
"""change the prompt for MCQ dataset: use chinese prompt if the question contains chinese characters."""
MCQ_CN_PROMPT = '请直接回答选项字母。'
MCQ_EN_PROMPT = 'Please select the correct answer from the options above.'
import string
import pandas as pd
def cn_string(s):
import re
if re.search('[\u4e00-\u9fff]', s):
return True
return False
tgt_path = self.dump_image(line, dataset)
question = line['question']
options = {cand: line[cand] for cand in string.ascii_uppercase if cand in line and not pd.isna(line[cand])}
options_prompt = 'Options:\n'
for key, item in options.items():
options_prompt += f'{key}. {item}\n'
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
prompt = ''
if hint is not None:
prompt += f'Hint: {hint}\n'
prompt += f'Question: {question}\n'
if len(options):
prompt += options_prompt
prompt += MCQ_CN_PROMPT if cn_string(prompt) else MCQ_EN_PROMPT
prompt = prompt.rstrip()
msgs = []
if isinstance(tgt_path, list):
msgs.extend([dict(type='image', value=p) for p in tgt_path])
else:
msgs = [dict(type='image', value=tgt_path)]
msgs.append(dict(type='text', value=prompt))
return msgs
def _build_yorn_prompt(self, line, dataset: str) -> list[dict[str, str]]:
"""change the prompt for YORN dataset:"""
YORN_PROMPT = ' Please answer yes or no.'
tgt_path = self.dump_image(line, dataset)
question = line['question']
msgs = []
if isinstance(tgt_path, list):
msgs.extend([dict(type='image', value=p) for p in tgt_path])
else:
msgs = [dict(type='image', value=tgt_path)]
msgs.append(dict(type='text', value=question))
assert msgs[-1]['type'] == 'text'
msgs[-1]['value'] += YORN_PROMPT
return msgs
def _build_vqa_prompt(self, line, dataset: str) -> list[dict[str, str]]:
"""change the prompt for VQA dataset:"""
VQA_PROMPT = '\nPlease try to answer the question with short words or phrases if possible.'
tgt_path = self.dump_image(line, dataset)
question = line['question']
msgs = []
if isinstance(tgt_path, list):
msgs.extend([dict(type='image', value=p) for p in tgt_path])
else:
msgs = [dict(type='image', value=tgt_path)]
msgs.append(dict(type='text', value=question))
assert msgs[-1]['type'] == 'text'
msgs[-1]['value'] += VQA_PROMPT
return msgs
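# Rough sketch of how this mixin is used (exact call sites live in the evaluation loop):
# the harness first calls use_custom_prompt(dataset) and, when it returns True, passes the
# list of typed messages returned by build_prompt(line, dataset) to the model instead of
# the dataset's default prompt.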
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import warnings
import copy as cp
from .base import BaseModel
from ..smp import isimg, listinstr
from ..dataset import DATASET_TYPE
class QwenVL(BaseModel):
INSTALL_REQ = False
INTERLEAVE = True
def __init__(self, model_path='Qwen/Qwen-VL', **kwargs):
assert model_path is not None
self.model_path = model_path
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
tokenizer.padding_side = 'left'
tokenizer.pad_token_id = tokenizer.eod_id
self.tokenizer = tokenizer
self.model = AutoModelForCausalLM.from_pretrained(model_path, device_map='cuda', trust_remote_code=True).eval()
default_kwargs = dict(
do_sample=False,
num_beams=1,
max_new_tokens=512,
min_new_tokens=1,
num_return_sequences=1,
use_cache=True,
output_hidden_states=True,
pad_token_id=tokenizer.eod_id,
eos_token_id=tokenizer.eod_id)
default_kwargs.update(kwargs)
self.kwargs = default_kwargs
warnings.warn(f'Received the following kwargs: {self.kwargs}; they will be used as the generation config.')
torch.cuda.empty_cache()
def adjust_kwargs(self, dataset):
kwargs = cp.deepcopy(self.kwargs)
if DATASET_TYPE(dataset) in ['MCQ', 'Y/N']:
kwargs['max_new_tokens'] = 32
elif DATASET_TYPE(dataset) == 'Caption' and 'COCO' in dataset:
kwargs['max_new_tokens'] = 32
elif DATASET_TYPE(dataset) == 'VQA':
if listinstr(['OCRVQA', 'ChartQA', 'DocVQA'], dataset):
kwargs['max_new_tokens'] = 100
elif listinstr(['TextVQA'], dataset):
kwargs['max_new_tokens'] = 10
return kwargs
def generate_inner(self, message, dataset=None):
if dataset is not None:
kwargs = self.adjust_kwargs(dataset)
else:
kwargs = self.kwargs
prompt = ''
for s in message:
if s['type'] == 'image':
prompt += f'<img>{s["value"]}</img>'
elif s['type'] == 'text':
prompt += s['value']
if dataset is not None and DATASET_TYPE(dataset) == 'VQA':
prompt += ' Answer:'
encoded = self.tokenizer([prompt], return_tensors='pt', padding='longest')
input_ids = encoded.input_ids.to('cuda')
attention_mask = encoded.attention_mask.to('cuda')
pred = self.model.generate(
input_ids=input_ids,
attention_mask=attention_mask,
**kwargs)
answer = self.tokenizer.decode(pred[0][input_ids.size(1):].cpu(), skip_special_tokens=True).strip()
return answer
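# Illustrative prompt assembled above for one image and a VQA question:
#   '<img>/path/to/doc.png</img>What does the sign say? Answer:'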
class QwenVLChat(BaseModel):
INSTALL_REQ = False
INTERLEAVE = True
def __init__(self, model_path='Qwen/Qwen-VL-Chat', **kwargs):
assert model_path is not None
self.model_path = model_path
self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
self.model = AutoModelForCausalLM.from_pretrained(model_path, device_map='cuda', trust_remote_code=True).eval()
torch.cuda.empty_cache()
self.kwargs = kwargs
warnings.warn(f'Received the following kwargs: {self.kwargs}; they will be used as the generation config.')
def build_history(self, message):
def concat_tilist(tilist):
image_cnt = 1
prompt = ''
for item in tilist:
if item['type'] == 'text':
prompt += item['value']
elif item['type'] == 'image':
prompt += f"Picture {image_cnt}: <img>{item['value']}</img>\n"
image_cnt += 1
return prompt
assert len(message) % 2 == 0
hist = []
for i in range(len(message) // 2):
m1, m2 = message[2 * i], message[2 * i + 1]
assert m1['role'] == 'user' and m2['role'] == 'assistant'
hist.append((concat_tilist(m1['content']), concat_tilist(m2['content'])))
return hist
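# Illustrative history produced by build_history for one completed turn:
#   [('Picture 1: <img>/path/to/cat.jpg</img>\nWhat animal is this?', 'A cat.')]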
def generate_inner(self, message, dataset=None):
vl_list = [{'image': s['value']} if s['type'] == 'image' else {'text': s['value']} for s in message]
query = self.tokenizer.from_list_format(vl_list)
response, _ = self.model.chat(self.tokenizer, query=query, history=None, **self.kwargs)
return response
def chat_inner(self, message, dataset=None):
assert len(message) % 2 == 1 and message[-1]['role'] == 'user'
history = self.build_history(message[:-1])
vl_list = [
{'image': s['value']} if s['type'] == 'image' else {'text': s['value']}
for s in message[-1]['content']
]
query = self.tokenizer.from_list_format(vl_list)
response, _ = self.model.chat(self.tokenizer, query=query, history=history, **self.kwargs)
return response
import sys
import torch
import os.path as osp
import os
import warnings
from .base import BaseModel
from ..dataset import DATASET_TYPE
from ..smp import *
from PIL import Image
'''
Please follow the instructions to download ckpt.
https://github.com/RBDash-Team/RBDash?tab=readme-ov-file#pretrained-weights
'''
class RBDash(BaseModel):
INSTALL_REQ = True
INTERLEAVE = False
def __init__(self, model_path, root=None, conv_mode='qwen', **kwargs):
from huggingface_hub import snapshot_download
if root is None:
warnings.warn('Please set `root` to the RBDash code directory, '
'cloned from "https://github.com/RBDash-Team/RBDash?tab=readme-ov-file". ')
sys.exit(-1)
warnings.warn('Please follow the RBDash instructions to put the checkpoint files in the right place, '
'as described at https://github.com/RBDash-Team/RBDash?tab=readme-ov-file#structure')
assert model_path == 'RBDash-Team/RBDash-v1.2-72b', 'We only support RBDash-v1.2-72b for now'
sys.path.append(root)
try:
from rbdash.model.builder import load_pretrained_model
from rbdash.mm_utils import get_model_name_from_path
except ImportError:
raise ImportError(
'Please first install RBDash and set the root path to use RBDash, '
'which is cloned from here: "https://github.com/RBDash-Team/RBDash?tab=readme-ov-file" '
)
VLMEvalKit_path = os.getcwd()
os.chdir(root)
warnings.warn('Please set `root` to the RBDash code directory, '
'cloned from "https://github.com/RBDash-Team/RBDash?tab=readme-ov-file". ')
try:
model_name = get_model_name_from_path(model_path)
except Exception:
raise ImportError(
'Please follow the RBDash instructions to put the checkpoint file in the right place, '
'which can be found at https://github.com/RBDash-Team/RBDash?tab=readme-ov-file#structure'
)
download_model_path = snapshot_download(model_path)
internvit_local_dir = './model_zoo/OpenGVLab/InternViT-6B-448px-V1-5'
os.makedirs(internvit_local_dir, exist_ok=True)
snapshot_download('OpenGVLab/InternViT-6B-448px-V1-5', local_dir=internvit_local_dir)
convnext_local_dir = './model_zoo/OpenAI/openclip-convnext-large-d-320-laion2B-s29B-b131K-ft-soup'
os.makedirs(convnext_local_dir, exist_ok=True)
snapshot_download('laion/CLIP-convnext_large_d_320.laion2B-s29B-b131K-ft-soup', local_dir=convnext_local_dir)
preprocessor_url = 'https://huggingface.co/openai/clip-vit-large-patch14-336/blob/main/preprocessor_config.json'
download_file_path = osp.join(convnext_local_dir, 'preprocessor_config.json')
if not osp.exists(download_file_path):
print(f'download preprocessor to {download_file_path}')
download_file(preprocessor_url, download_file_path)
tokenizer, model, image_processor, image_processor_aux, context_len = load_pretrained_model(
download_model_path, None, model_name, device_map='auto'
)
os.chdir(VLMEvalKit_path)
self.model = model
self.tokenizer = tokenizer
self.image_processor = image_processor
self.image_processor_aux = image_processor_aux
self.conv_mode = conv_mode
if tokenizer.unk_token is None:
tokenizer.unk_token = '<|endoftext|>'
tokenizer.pad_token = tokenizer.unk_token
kwargs_default = dict(temperature=float(0.2), num_beams=1, top_p=None, max_new_tokens=128, use_cache=True)
kwargs_default.update(kwargs)
self.kwargs = kwargs_default
def generate_inner(self, message, dataset=None):
try:
from rbdash.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, \
DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from rbdash.conversation import conv_templates
from rbdash.mm_utils import tokenizer_image_token, process_images
except ImportError:
raise ImportError(
'Please first install RBDash and set the root path to use RBDash, '
'which is cloned from here: "https://github.com/RBDash-Team/RBDash?tab=readme-ov-file" '
)
prompt, image = self.message_to_promptimg(message, dataset=dataset)
image = Image.open(image).convert('RGB')
if self.model.config.mm_use_im_start_end:
prompt = (
DEFAULT_IM_START_TOKEN
+ DEFAULT_IMAGE_TOKEN
+ DEFAULT_IM_END_TOKEN
+ '\n'
+ prompt
)
else:
prompt = DEFAULT_IMAGE_TOKEN + '\n' + prompt
conv = conv_templates[self.conv_mode].copy()
conv.append_message(conv.roles[0], prompt)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt')
input_ids = input_ids.unsqueeze(0).cuda()
if hasattr(self.model.config, 'image_size_aux'):
if not hasattr(self.image_processor, 'image_size_raw'):
self.image_processor.image_size_raw = self.image_processor.crop_size.copy()
self.image_processor.crop_size['height'] = self.model.config.image_size_aux
self.image_processor.crop_size['width'] = self.model.config.image_size_aux
self.image_processor.size['shortest_edge'] = self.model.config.image_size_aux
self.image_processor_aux.crop_size['height'] = self.model.config.image_size_aux
self.image_processor_aux.crop_size['width'] = self.model.config.image_size_aux
self.image_processor_aux.size[
'shortest_edge'
] = self.model.config.image_size_aux
image_tensor = process_images([image], self.image_processor, self.model.config)[0]
image_grid = getattr(self.model.config, 'image_grid', 1)
if hasattr(self.model.config, 'image_size_aux'):
raw_shape = [
self.image_processor.image_size_raw['height'] * image_grid,
self.image_processor.image_size_raw['width'] * image_grid
]
if self.image_processor is not self.image_processor_aux:
image_tensor_aux = process_images([image], self.image_processor_aux, self.model.config)[
0
]
else:
image_tensor_aux = image_tensor
image_tensor = torch.nn.functional.interpolate(
image_tensor[None],
size=raw_shape,
mode='bilinear',
align_corners=False
)[0]
else:
image_tensor_aux = []
if image_grid >= 2:
raw_image = image_tensor.reshape(
3, image_grid, self.image_processor.image_size_raw['height'],
image_grid, self.image_processor.image_size_raw['width']
)
raw_image = raw_image.permute(1, 3, 0, 2, 4)
raw_image = raw_image.reshape(
-1, 3, self.image_processor.image_size_raw['height'], self.image_processor.image_size_raw['width']
)
if getattr(self.model.config, 'image_global', False):
global_image = image_tensor
if len(global_image.shape) == 3:
global_image = global_image[None]
global_image = torch.nn.functional.interpolate(
global_image,
size=[
self.image_processor.image_size_raw['height'],
self.image_processor.image_size_raw['width']
],
mode='bilinear',
align_corners=False
)
raw_image = torch.cat([raw_image, global_image], dim=0)
image_tensor = raw_image.contiguous()
images = image_tensor[None].to(dtype=self.model.dtype, device='cuda', non_blocking=True)
if len(image_tensor_aux) > 0:
images_aux = image_tensor_aux[None].to(dtype=self.model.dtype, device='cuda', non_blocking=True)
else:
images_aux = None
with torch.inference_mode():
output_ids = self.model.generate(
input_ids,
max_new_tokens=512,
images=images,
images_aux=images_aux,
do_sample=self.kwargs['temperature'] > 0,
temperature=self.kwargs['temperature'],
top_p=self.kwargs['top_p'],
num_beams=self.kwargs['num_beams']
)
outputs = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
return outputs
def use_custom_prompt(self, dataset):
assert dataset is not None
if listinstr(['MMDU', 'MME-RealWorld', 'MME-RealWorld-CN'], dataset):
# For multi-turn datasets we do not use a custom prompt
return False
if 'mme' in dataset.lower():
return True
elif 'hallusionbench' in dataset.lower():
return True
elif 'mmmu' in dataset.lower():
return True
elif 'mmbench' in dataset.lower():
return True
return False
def build_mme(self, line):
question = line['question']
prompt = question + ' Answer the question using a single word or phrase.'
return prompt
def build_hallusionbench(self, line):
question = line['question']
prompt = question + '\nAnswer the question using a single word or phrase.'
return prompt
def build_mmbench(self, line):
question = line['question']
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
options_prompt = ''
for key, item in options.items():
options_prompt += f'{key}. {item}\n'
prompt = f'{question}\n'
if len(options):
prompt += options_prompt
prompt += "Answer with the option's letter from the given choices directly."
else:
prompt += 'Answer the question using a single word or phrase.'
return prompt
def build_mmmu(self, line):
question = line['question']
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
options_prompt = ''
for key, item in options.items():
options_prompt += f'({key}) {item}\n'
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
prompt = ''
if hint is not None:
prompt += f'Hint: {hint}\n'
prompt += f'{question}\n'
if len(options):
prompt += options_prompt
prompt += "\nAnswer with the option's letter from the given choices directly."
else:
prompt += 'Answer the question using a single word or phrase.'
return prompt
def build_prompt(self, line, dataset=None):
assert dataset is None or isinstance(dataset, str)
assert self.use_custom_prompt(dataset)
tgt_path = self.dump_image(line, dataset)
if 'mme' in dataset.lower():
prompt = self.build_mme(line)
elif 'hallusionbench' in dataset.lower():
prompt = self.build_hallusionbench(line)
elif 'mmmu' in dataset.lower():
prompt = self.build_mmmu(line)
elif 'mmbench' in dataset.lower():
prompt = self.build_mmbench(line)
ret = [dict(type='text', value=prompt)]
ret.extend([dict(type='image', value=s) for s in tgt_path])
return ret
import torch
from PIL import Image
from abc import abstractproperty
import sys
import os.path as osp
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE
import copy
class SliME(BaseModel):
INSTALL_REQ = True
INTERLEAVE = True
DEFAULT_IMAGE_TOKEN = '<image>'
IMAGE_TOKEN_INDEX = -200
def __init__(self, model_path='yifanzhang114/SliME-Llama3-8B', **kwargs):
assert model_path is not None
try:
from llava.model.builder import load_pretrained_model
from llava.conversation import conv_templates
from llava.mm_utils import get_model_name_from_path, tokenizer_image_token
except ImportError:
warnings.warn('Please install the requirements from https://github.com/yfzhang114/SliME before using SliME.')
sys.exit(-1)
model_name = get_model_name_from_path(model_path)
tokenizer, model, image_processor, _ = load_pretrained_model(model_path, None, model_name, device_map=None)
model.cuda().eval()
model.tie_weights()
if 'llama3' in model_path.lower():
conv_mode = 'llama3'
elif 'vicuna' in model_path.lower():
conv_mode = 'v1'
self.conv_template = conv_mode
self.conv_templates = conv_templates
self.tokenizer = tokenizer
self.model = model
self.image_processor = image_processor
self.tokenizer_image_token = tokenizer_image_token
def generate_inner(self, message, dataset=None):
content, images = '', []
for msg in message:
if msg['type'] == 'text':
content += msg['value']
else:
images.append(Image.open(msg['value']).convert('RGB'))
content += (self.DEFAULT_IMAGE_TOKEN + '\n')
preprocess = self.image_processor.preprocess
image_tokenizer = self.tokenizer_image_token
image_tensor = [
preprocess(f, return_tensors='pt')['pixel_values'][0].half().cuda() for f in images
]
image_tensor = torch.stack(image_tensor)
conv = copy.deepcopy(self.conv_templates[self.conv_template])
conv.messages = list(conv.messages)
conv.append_message(conv.roles[0], content)
conv.append_message(conv.roles[1], None)
prompt_question = conv.get_prompt()
input_ids = image_tokenizer(prompt_question, self.tokenizer, self.IMAGE_TOKEN_INDEX, return_tensors='pt')
input_ids = input_ids.unsqueeze(0).cuda()
cont = self.model.generate(
input_ids,
images=image_tensor,
do_sample=False,
temperature=0,
max_new_tokens=512,
)
text_outputs = self.tokenizer.batch_decode(cont, skip_special_tokens=True)[0]
return text_outputs
import sys
import torch
from abc import abstractproperty
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE
from transformers import AutoTokenizer, BitsAndBytesConfig
class TransCoreM(BaseModel):
INSTALL_REQ = True
INTERLEAVE = False
def load_pretrained_model(self, model_path, load_8bit=False, load_4bit=False, revision='main'):
from transcorem.model import TransCoreMQWenForCausalLM
from transcorem.constants import DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
import transcorem.config_param as config_param
kwargs = {'revision': revision}
if load_8bit:
kwargs['load_in_8bit'] = True
elif load_4bit:
kwargs['load_in_4bit'] = True
kwargs['quantization_config'] = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_compute_dtype=torch.float16,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type='nf4'
)
else:
kwargs['torch_dtype'] = torch.float16
config_param.model_path = model_path
tokenizer = AutoTokenizer.from_pretrained(
model_path, use_fast=False, revision=revision, trust_remote_code=True)
model = TransCoreMQWenForCausalLM.from_pretrained(
model_path, low_cpu_mem_usage=True, trust_remote_code=True, **kwargs)
image_processor = None
mm_use_im_start_end = getattr(model.config, 'mm_use_im_start_end', False)
mm_use_im_patch_token = getattr(model.config, 'mm_use_im_patch_token', True)
if mm_use_im_patch_token:
tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
if mm_use_im_start_end:
tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
model.resize_token_embeddings(len(tokenizer))
vision_tower = model.get_vision_tower()
if not vision_tower.is_loaded:
vision_tower.load_model()
vision_tower.to(device='cpu', dtype=torch.float16)
image_processor = vision_tower.image_processor
if hasattr(model.config, 'max_sequence_length'):
context_len = model.config.max_sequence_length
else:
context_len = 2048
return tokenizer, model, image_processor, context_len
def __init__(self,
root=None,
revision='main',
**kwargs):
self.root = root
self.revision = revision
sys.path.append(root)
model_path = 'PCIResearch/TransCore-M'
assert osp.exists(model_path) or splitlen(model_path) == 2
self.tokenizer, self.model, self.image_processor, self.context_len = self.load_pretrained_model(
model_path=model_path, revision=revision)
self.model = self.model.cuda()
print('==============conv_mode: transcorem_v1')
self.conv_mode = 'transcorem_v1'
kwargs_default = dict(do_sample=False, temperature=0.0, max_new_tokens=512, top_p=None, num_beams=1)
kwargs_default.update(kwargs)
self.kwargs = kwargs_default
warnings.warn(f'Received the following kwargs: {self.kwargs}; they will be used as the generation config.')
def use_custom_prompt(self, dataset):
assert dataset is not None
if DATASET_TYPE(dataset) == 'MCQ':
return True
return False
def build_prompt(self, line, dataset=None):
assert dataset is None or isinstance(dataset, str)
assert self.use_custom_prompt(dataset)
tgt_path = self.dump_image(line, dataset)
question = line['question']
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
if hint is not None:
question = hint + '\n' + question
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
for key, item in options.items():
question += f'\n{key}. {item}'
prompt = question
if len(options):
prompt += (
'\n请直接回答选项字母。' if cn_string(prompt) else
"\nAnswer with the option's letter from the given choices directly."
)
else:
prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.'
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=f) for f in tgt_path])
return message
def generate_inner(self, message, dataset=None):
from transcorem.mm_utils import highres_process_images, tokenizer_image_token, KeywordsStoppingCriteria
from transcorem.constants import (
IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN)
from transcorem.conversation import conv_templates, SeparatorStyle
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
image = Image.open(image_path).convert('RGB')
args = abstractproperty()
args.image_aspect_ratio = 'pad'
image_patches = highres_process_images(image, self.image_processor, args, base_reso=336)
image_patches = [patch.unsqueeze(0).to('cuda', dtype=torch.float16) for patch in image_patches]
if self.model.config.mm_use_im_start_end:
inp = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + prompt
else:
inp = DEFAULT_IMAGE_TOKEN + '\n' + prompt
conv = conv_templates[self.conv_mode].copy()
conv.append_message(conv.roles[0], inp)
conv.append_message(conv.roles[1], None)
prompt_conv = conv.get_prompt()
input_ids = tokenizer_image_token(prompt_conv, self.tokenizer, IMAGE_TOKEN_INDEX,
return_tensors='pt').unsqueeze(0).cuda()
stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
keywords = [stop_str]
stopping_criteria = KeywordsStoppingCriteria(keywords, self.tokenizer, input_ids)
with torch.inference_mode():
output_ids = self.model.generate(
input_ids,
images=image_patches,
use_cache=True,
stopping_criteria=[stopping_criteria],
**self.kwargs)
input_token_len = input_ids.shape[1]
n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
if n_diff_input_output > 0:
print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
outputs = self.tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
outputs = outputs.strip()
if outputs.endswith(stop_str):
outputs = outputs[:-len(stop_str)]
outputs = outputs.strip()
return outputs
from .video_llava import VideoLLaVA, VideoLLaVA_HF
from .videochat2 import VideoChat2_HD
from .chat_uni_vi import Chatunivi
from .video_chatgpt import VideoChatGPT
from .llama_vid import LLaMAVID
from .pllava import PLLaVA
__all__ = ['VideoLLaVA', 'VideoLLaVA_HF', 'Chatunivi', 'VideoChatGPT', 'LLaMAVID', 'VideoChat2_HD', 'PLLaVA']
import torch
import warnings
import copy as cp
import numpy as np
import sys
import os
from ..base import BaseModel
from ...smp import isimg, listinstr
from ...dataset import DATASET_TYPE
from decord import VideoReader, cpu
from PIL import Image
def _get_rawvideo_dec(
video_path,
image_processor,
max_frames=64,
image_resolution=224,
video_framerate=1,
s=None,
e=None,
):
# speed up video decode via decord.
video_mask = np.zeros(max_frames, dtype=np.int64)
max_video_length = 0
# T x 3 x H x W
video = np.zeros((max_frames, 3, image_resolution, image_resolution), dtype=np.float64)
if s is None:
start_time, end_time = None, None
else:
start_time = int(s)
end_time = int(e)
start_time = start_time if start_time >= 0.0 else 0.0
end_time = end_time if end_time >= 0.0 else 0.0
if start_time > end_time:
start_time, end_time = end_time, start_time
elif start_time == end_time:
end_time = start_time + 1
if os.path.exists(video_path):
vreader = VideoReader(video_path, ctx=cpu(0))
else:
print(video_path)
raise FileNotFoundError
fps = vreader.get_avg_fps()
f_start = 0 if start_time is None else int(start_time * fps)
f_end = int(min(1000000000 if end_time is None else end_time * fps, len(vreader) - 1))
num_frames = f_end - f_start + 1
if num_frames > 0:
# T x 3 x H x W
sample_fps = int(video_framerate)
t_stride = int(round(float(fps) / sample_fps))
all_pos = list(range(f_start, f_end + 1, t_stride))
if len(all_pos) > max_frames:
sample_pos = [all_pos[_] for _ in np.linspace(0, len(all_pos) - 1, num=max_frames, dtype=int)]
else:
sample_pos = all_pos
patch_images = [Image.fromarray(f) for f in vreader.get_batch(sample_pos).asnumpy()]
patch_images = torch.stack(
[image_processor.preprocess(img, return_tensors='pt')['pixel_values'][0] for img in patch_images]
)
slice_len = patch_images.shape[0]
max_video_length = max_video_length if max_video_length > slice_len else slice_len
if slice_len < 1:
pass
else:
video[:slice_len, ...] = patch_images
return patch_images, slice_len
else:
print('Error reading video path: {}'.format(video_path))
video_mask[:max_video_length] = [1] * max_video_length
return torch.from_numpy(video), video_mask
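# Sampling note: with video_framerate=1 the stride is round(fps / 1), i.e. roughly one frame
# per second of video; if that still yields more than max_frames positions, the positions are
# subsampled uniformly via np.linspace.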
class Chatunivi(BaseModel):
INSTALL_REQ = True
INTERLEAVE = False
VIDEO_LLM = True
def __init__(self, model_path='Chat-UniVi/Chat-UniVi', **kwargs):
assert model_path is not None
try:
from ChatUniVi.model.builder import load_pretrained_model
except ImportError:
warnings.warn('Please install Chat-UniVi from https://github.com/PKU-YuanGroup/Chat-UniVi.git.')
sys.exit(-1)
model_name = 'ChatUniVi'
tokenizer, model, processor, context_len = load_pretrained_model(model_path, None, model_name)
self.tokenizer = tokenizer
self.model = model
vision_tower = model.get_vision_tower()
if not vision_tower.is_loaded:
vision_tower.load_model()
image_processor = vision_tower.image_processor
self.processor = image_processor
self.context_len = context_len
self.kwargs = kwargs
self.nframe = 64
self.resolution = 224
if 'v1.5' in model_path:
self.resolution = 336
def get_model_output(self, model, video_processor, tokenizer, video, qs):
from ChatUniVi.conversation import conv_templates, SeparatorStyle
from ChatUniVi.constants import (
DEFAULT_IMAGE_PATCH_TOKEN,
DEFAULT_IMAGE_TOKEN,
IMAGE_TOKEN_INDEX,
DEFAULT_IM_START_TOKEN,
DEFAULT_IM_END_TOKEN,
MAX_IMAGE_LENGTH,
)
from ChatUniVi.mm_utils import (
tokenizer_image_token,
KeywordsStoppingCriteria,
)
mm_use_im_start_end = getattr(model.config, 'mm_use_im_start_end', False)
mm_use_im_patch_token = getattr(model.config, 'mm_use_im_patch_token', True)
if mm_use_im_patch_token:
tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
if mm_use_im_start_end:
tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
model.resize_token_embeddings(len(tokenizer))
if model.config.config['use_cluster']:
for n, m in model.named_modules():
m = m.to(dtype=torch.bfloat16)
video_frames, slice_len = _get_rawvideo_dec(
video, video_processor, max_frames=MAX_IMAGE_LENGTH, image_resolution=self.resolution
)
if model.config.mm_use_im_start_end:
qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN * slice_len + DEFAULT_IM_END_TOKEN + '\n' + qs
else:
qs = DEFAULT_IMAGE_TOKEN * slice_len + '\n' + qs
conv = conv_templates['v1'].copy()
conv.append_message(conv.roles[0], qs)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(
0).cuda()
stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
keywords = [stop_str]
stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
with torch.inference_mode():
output_ids = model.generate(
input_ids,
images=video_frames.half().cuda(),
do_sample=True,
temperature=0.2,
top_p=None,
num_beams=1,
output_scores=True,
return_dict_in_generate=True,
max_new_tokens=1024,
use_cache=True,
stopping_criteria=[stopping_criteria])
output_ids = output_ids.sequences
input_token_len = input_ids.shape[1]
n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
if n_diff_input_output > 0:
print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
outputs = outputs.strip()
if outputs.endswith(stop_str):
outputs = outputs[:-len(stop_str)]
outputs = outputs.strip()
return outputs
def generate_inner(self, message, dataset=None):
question, video = self.message_to_promptvideo(message)
response = self.get_model_output(self.model, self.processor, self.tokenizer, video, question)
return response
{
"_name_or_path": "clip-vit-large-patch14/",
"architectures": [
"CLIPModel"
],
"initializer_factor": 1.0,
"logit_scale_init_value": 2.6592,
"model_type": "clip",
"projection_dim": 768,
"text_config": {
"_name_or_path": "",
"add_cross_attention": false,
"architectures": null,
"attention_dropout": 0.0,
"bad_words_ids": null,
"bos_token_id": 0,
"chunk_size_feed_forward": 0,
"cross_attention_hidden_size": null,
"decoder_start_token_id": null,
"diversity_penalty": 0.0,
"do_sample": false,
"dropout": 0.0,
"early_stopping": false,
"encoder_no_repeat_ngram_size": 0,
"eos_token_id": 2,
"finetuning_task": null,
"forced_bos_token_id": null,
"forced_eos_token_id": null,
"hidden_act": "quick_gelu",
"hidden_size": 768,
"id2label": {
"0": "LABEL_0",
"1": "LABEL_1"
},
"initializer_factor": 1.0,
"initializer_range": 0.02,
"intermediate_size": 3072,
"is_decoder": false,
"is_encoder_decoder": false,
"label2id": {
"LABEL_0": 0,
"LABEL_1": 1
},
"layer_norm_eps": 1e-05,
"length_penalty": 1.0,
"max_length": 20,
"max_position_embeddings": 77,
"min_length": 0,
"model_type": "clip_text_model",
"no_repeat_ngram_size": 0,
"num_attention_heads": 12,
"num_beam_groups": 1,
"num_beams": 1,
"num_hidden_layers": 12,
"num_return_sequences": 1,
"output_attentions": false,
"output_hidden_states": false,
"output_scores": false,
"pad_token_id": 1,
"prefix": null,
"problem_type": null,
"projection_dim" : 768,
"pruned_heads": {},
"remove_invalid_values": false,
"repetition_penalty": 1.0,
"return_dict": true,
"return_dict_in_generate": false,
"sep_token_id": null,
"task_specific_params": null,
"temperature": 1.0,
"tie_encoder_decoder": false,
"tie_word_embeddings": true,
"tokenizer_class": null,
"top_k": 50,
"top_p": 1.0,
"torch_dtype": null,
"torchscript": false,
"transformers_version": "4.16.0.dev0",
"use_bfloat16": false,
"vocab_size": 49408
},
"text_config_dict": {
"hidden_size": 768,
"intermediate_size": 3072,
"num_attention_heads": 12,
"num_hidden_layers": 12,
"projection_dim": 768
},
"torch_dtype": "float32",
"transformers_version": null,
"vision_config": {
"_name_or_path": "",
"add_cross_attention": false,
"architectures": null,
"attention_dropout": 0.0,
"bad_words_ids": null,
"bos_token_id": null,
"chunk_size_feed_forward": 0,
"cross_attention_hidden_size": null,
"decoder_start_token_id": null,
"diversity_penalty": 0.0,
"do_sample": false,
"dropout": 0.0,
"early_stopping": false,
"encoder_no_repeat_ngram_size": 0,
"eos_token_id": null,
"finetuning_task": null,
"forced_bos_token_id": null,
"forced_eos_token_id": null,
"hidden_act": "quick_gelu",
"hidden_size": 1024,
"id2label": {
"0": "LABEL_0",
"1": "LABEL_1"
},
"image_size": 224,
"initializer_factor": 1.0,
"initializer_range": 0.02,
"intermediate_size": 4096,
"is_decoder": false,
"is_encoder_decoder": false,
"label2id": {
"LABEL_0": 0,
"LABEL_1": 1
},
"layer_norm_eps": 1e-05,
"length_penalty": 1.0,
"max_length": 20,
"min_length": 0,
"model_type": "clip_vision_model",
"no_repeat_ngram_size": 0,
"num_attention_heads": 16,
"num_beam_groups": 1,
"num_beams": 1,
"num_hidden_layers": 24,
"num_return_sequences": 1,
"output_attentions": false,
"output_hidden_states": false,
"output_scores": false,
"pad_token_id": null,
"patch_size": 14,
"prefix": null,
"problem_type": null,
"projection_dim" : 768,
"pruned_heads": {},
"remove_invalid_values": false,
"repetition_penalty": 1.0,
"return_dict": true,
"return_dict_in_generate": false,
"sep_token_id": null,
"task_specific_params": null,
"temperature": 1.0,
"tie_encoder_decoder": false,
"tie_word_embeddings": true,
"tokenizer_class": null,
"top_k": 50,
"top_p": 1.0,
"torch_dtype": null,
"torchscript": false,
"transformers_version": "4.16.0.dev0",
"use_bfloat16": false
},
"vision_config_dict": {
"hidden_size": 1024,
"intermediate_size": 4096,
"num_attention_heads": 16,
"num_hidden_layers": 24,
"patch_size": 14,
"projection_dim": 768
}
}
{
"crop_size": 224,
"do_center_crop": true,
"do_normalize": true,
"do_resize": true,
"feature_extractor_type": "CLIPFeatureExtractor",
"image_mean": [
0.48145466,
0.4578275,
0.40821073
],
"image_std": [
0.26862954,
0.26130258,
0.27577711
],
"resample": 3,
"size": 224
}
{
"model": {
"model_cls": "VideoChat2_it_hd_mistral",
"vit_blip_model_path": "OpenGVLab/videochat2",
"mistral_model_path": "mistralai/Mistral-7B-Instruct-v0.2",
"videochat2_model_path": "OpenGVLab/VideoChat2_stage2_Mistral_7B",
"freeze_vit": false,
"freeze_qformer": false,
"max_txt_len": 512,
"low_resource": false,
"vision_encoder": {
"name": "vit_l14",
"img_size": 224,
"patch_size": 16,
"d_model": 1024,
"encoder_embed_dim": 1024,
"encoder_depth": 24,
"encoder_num_heads": 16,
"drop_path_rate": 0.0,
"num_frames": 8,
"tubelet_size": 1,
"use_checkpoint": true,
"checkpoint_num": 18,
"pretrained": "",
"return_index": -2,
"vit_add_ln": true,
"ckpt_num_frame": 4
},
"num_query_token": 32,
"qformer_hidden_dropout_prob": 0.1,
"qformer_attention_probs_dropout_prob": 0.1,
"qformer_drop_path_rate": 0.2,
"extra_num_query_token": 64,
"qformer_text_input": true,
"system": "",
"start_token": "<Video>",
"end_token": "</Video>",
"add_second_msg": true,
"img_start_token": "<Image>",
"img_end_token": "</Image>",
"random_shuffle": true,
"return_question_instruction": false,
"use_flash_attention": true,
"use_lora": false,
"lora_r": 16,
"lora_alpha": 32,
"lora_dropout": 0.1,
"dynamic_config": {
"local_size": 224,
"hd_num": 6,
"padding": false,
"add_global": true
}
},
"device": "cuda"
}
import torch
import warnings
import copy as cp
import numpy as np
import sys
import os
from ..base import BaseModel
from ...smp import isimg, listinstr, load, dump, download_file
from ...dataset import DATASET_TYPE
from decord import VideoReader, cpu
from huggingface_hub import snapshot_download
def load_video(video_path):
vr = VideoReader(video_path, ctx=cpu(0))
total_frame_num = len(vr)
fps = round(vr.get_avg_fps())
frame_idx = [i for i in range(0, total_frame_num, fps)]
spare_frames = vr.get_batch(frame_idx).asnumpy()
return spare_frames
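# load_video samples roughly one frame per second: frame_idx steps through the video with a
# stride equal to the rounded average FPS, so a 60-second clip yields about 60 frames.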
def change_file(file_path, mm_vision_tower):
org_data = load(file_path)
org_data['image_processor'] = './vlmeval/vlm/video_llm/configs/llama_vid/processor/clip-patch14-224'
org_data['mm_vision_tower'] = mm_vision_tower
dump(org_data, file_path)
class LLaMAVID(BaseModel):
INSTALL_REQ = True
INTERLEAVE = False
VIDEO_LLM = True
def __init__(self, model_path='YanweiLi/llama-vid-7b-full-224-video-fps-1', **kwargs):
assert model_path is not None
try:
from llamavid.model.builder import load_pretrained_model
from llava.mm_utils import get_model_name_from_path
except ImportError:
warnings.warn('Please install LLaMA-VID from https://github.com/dvlab-research/LLaMA-VID.')
sys.exit(-1)
model_base = None
model_name = get_model_name_from_path(model_path)
eva_vit_g_url = 'https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/eva_vit_g.pth'
true_model_path = snapshot_download(model_path)
eva_vit_path = os.path.join(true_model_path, 'eva_vit_g.pth')
if not os.path.exists(eva_vit_path):
download_file(eva_vit_g_url, eva_vit_path)
config_path = os.path.join(true_model_path, 'config.json')
change_file(config_path, eva_vit_path)
tokenizer, model, image_processor, context_len = load_pretrained_model(
true_model_path, model_base, model_name, None, device_map='cpu', device='cpu'
)
model.cuda()
self.tokenizer = tokenizer
self.model = model
self.processor = image_processor
self.context_len = context_len
self.kwargs = kwargs
self.nframe = 8
def get_model_output(self, model, video_processor, tokenizer, video, qs):
from llamavid.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
from llamavid.constants import DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from llamavid.conversation import conv_templates, SeparatorStyle
from llava.mm_utils import tokenizer_image_token, KeywordsStoppingCriteria
original_qs = cp.deepcopy(qs)
if model.config.mm_use_im_start_end:
qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
else:
qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
conv_mode = 'vicuna_v1'
conv = conv_templates[conv_mode].copy()
conv.append_message(conv.roles[0], qs)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
# Check if the video exists
if os.path.exists(video):
video = load_video(video)
video = video_processor.preprocess(video, return_tensors='pt')['pixel_values'].half().cuda()
video = [video]
input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
keywords = [stop_str]
stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
cur_prompt = original_qs
with torch.inference_mode():
model.update_prompt([[cur_prompt]])
output_ids = model.generate(
input_ids,
images=video,
do_sample=True,
temperature=0.2,
max_new_tokens=1024,
use_cache=True,
stopping_criteria=[stopping_criteria],
)
input_token_len = input_ids.shape[1]
n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
if n_diff_input_output > 0:
print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
outputs = outputs.strip()
if outputs.endswith(stop_str):
outputs = outputs[: -len(stop_str)]
outputs = outputs.strip()
return outputs
def generate_inner(self, message, dataset=None):
question, video = self.message_to_promptvideo(message)
response = self.get_model_output(self.model, self.processor, self.tokenizer, video, question)
return response
import torch
import warnings
import copy as cp
import numpy as np
import sys
from PIL import Image
import torchvision
from ..base import BaseModel
from ...smp import isimg, listinstr, get_rank_and_world_size
from ...dataset import DATASET_TYPE
from huggingface_hub import snapshot_download
class PLLaVA(BaseModel):
INSTALL_REQ = True
INTERLEAVE = False
VIDEO_LLM = True
def __init__(self, model_path='ermu2001/pllava-13b', dir_root=None, **kwargs):
sys.path.append(dir_root)
try:
from tasks.eval.model_utils import load_pllava
except ImportError:
warnings.warn(
'Please first install requirements and set the root path to use PLLaVA. \
Follow the instructions at https://github.com/magic-research/PLLaVA.'
)
sys.exit(-1)
rank, world_size = get_rank_and_world_size()
self.nframe = 16
self.use_lora = True
self.lora_alpha = 4
self.pooling_shape = (16, 12, 12)
self.RESOLUTION = 672
self.model_path = model_path
# Note: larger checkpoints (30B+) can consume a very large amount of memory and may even bring down nodes.
weight_dir = snapshot_download(model_path)
self.model, self.processor = load_pllava(
model_path, num_frames=self.nframe, use_lora=self.use_lora,
weight_dir=weight_dir, lora_alpha=self.lora_alpha, pooling_shape=self.pooling_shape
)
# position embedding
self.model = self.model.to(torch.device(rank))
self.model = self.model.eval()
def load_video(self, video_path, num_segments=8, resolution=336):
from decord import VideoReader, cpu
transforms = torchvision.transforms.Resize(size=resolution)
vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
num_frames = len(vr)
frame_indices = self.get_index(num_frames, num_segments)
images_group = list()
for frame_index in frame_indices:
img = Image.fromarray(vr[frame_index].asnumpy())
images_group.append(transforms(img))
return images_group
def get_index(self, num_frames, num_segments):
seg_size = float(num_frames - 1) / num_segments
start = int(seg_size / 2)
offsets = np.array([
start + int(np.round(seg_size * idx)) for idx in range(num_segments)
])
return offsets
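# Worked example for get_index: num_frames=100, num_segments=8 gives seg_size = 99 / 8 = 12.375,
# start = 6, and offsets [6, 18, 31, 43, 56, 68, 80, 93] (up to rounding), i.e. evenly spaced
# frame indices centered within each of the 8 segments.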
def generate_inner(self, message, dataset=None):
from tasks.eval.model_utils import pllava_answer
from tasks.eval.eval_utils import conv_templates
question, video = self.message_to_promptvideo(message)
img_list = self.load_video(video, num_segments=self.nframe, resolution=self.RESOLUTION)
if self.model_path == 'ermu2001/pllava-34b': # using slightly different conversation mode for 34b model
if dataset in ['Video-MME', 'MVBench', 'MVBench_MP4']: # MCQ dataset
conv_mode = 'eval_mvbench_llavanext'
else: # VQA dataset
conv_mode = 'eval_videoqa_llavanext'
else:
if dataset in ['Video-MME', 'MVBench', 'MVBench_MP4']: # MCQ dataset
conv_mode = 'eval_mvbench'
else: # VQA dataset
conv_mode = 'eval_videoqabench'
conv = conv_templates[conv_mode].copy()
if dataset in ['MVBench', 'MVBench_MP4']:
conv.user_query(message[1]['value'], message[0]['value'], message[-2]['value'], is_mm=True)
conv.assistant_response(message[-1]['value'])
else:
conv.user_query(question, is_mm=True)
llm_response, conv = pllava_answer(
conv=conv, model=self.model, processor=self.processor,
do_sample=False, img_list=img_list, max_new_tokens=512, print_res=False
)
if dataset in ['MVBench', 'MVBench_MP4']:
llm_response = '(' + ''.join(llm_response.split(message[-1]['value'])[1:])
return llm_response
import torch
import os
import warnings
import copy as cp
import numpy as np
import sys
from ..base import BaseModel
from ...smp import isimg, listinstr
from ...dataset import DATASET_TYPE
from huggingface_hub import snapshot_download
class VideoChatGPT(BaseModel):
INSTALL_REQ = True
INTERLEAVE = False
VIDEO_LLM = True
def __init__(self, model_path='MBZUAI/Video-ChatGPT-7B', dir_root=None, **kwargs):
assert model_path is not None
sys.path.append(dir_root)
try:
from video_chatgpt.eval.model_utils import initialize_model
except:
            warnings.warn(
                'Please first install the requirements and set `dir_root` to the Video-ChatGPT repository. '
                'Follow the instructions at https://github.com/mbzuai-oryx/Video-ChatGPT.'
            )
sys.exit(-1)
base_model_path = snapshot_download('mmaaz60/LLaVA-7B-Lightening-v1-1')
projection_path = snapshot_download(model_path)
projection_name = 'video_chatgpt-7B.bin'
projection_path = os.path.join(projection_path, projection_name)
model, vision_tower, tokenizer, image_processor, video_token_len = initialize_model(
base_model_path, projection_path
)
self.tokenizer = tokenizer
self.model = model
self.processor = image_processor
self.context_len = video_token_len
self.kwargs = kwargs
self.vision_tower = vision_tower
self.nframe = 8
def get_model_output(self, model, video_processor, tokenizer, video, qs):
from video_chatgpt.eval.model_utils import load_video
from video_chatgpt.inference import video_chatgpt_infer
conv_mode = 'video-chatgpt_v1'
video_frames = load_video(video)
# Run inference on the video and questions
output = video_chatgpt_infer(
video_frames,
qs,
conv_mode,
model,
self.vision_tower,
tokenizer,
video_processor,
self.context_len,
)
return output
def generate_inner(self, message, dataset=None):
question, video = self.message_to_promptvideo(message)
response = self.get_model_output(self.model, self.processor, self.tokenizer, video, question)
return response
import torch
import warnings
import copy as cp
import numpy as np
import sys
from ..base import BaseModel
from ...smp import isimg, listinstr
from ...dataset import DATASET_TYPE
def read_video_pyav(container, indices):
frames = []
container.seek(0)
start_index = indices[0]
end_index = indices[-1]
for i, frame in enumerate(container.decode(video=0)):
if i > end_index:
break
if i >= start_index and i in indices:
frames.append(frame)
return np.stack([x.to_ndarray(format='rgb24') for x in frames])
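# Sketch of how read_video_pyav is used below (illustrative numbers): for a 200-frame video
# with nframe=8, indices = np.arange(0, 200, 25).astype(int) = [0, 25, ..., 175], and the
# function returns a stacked uint8 array of shape (8, H, W, 3) in RGB order.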
class VideoLLaVA_HF(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
VIDEO_LLM = True
def __init__(self, model_path='LanguageBind/Video-LLaVA-7B-hf', **kwargs):
try:
from transformers import VideoLlavaProcessor, VideoLlavaForConditionalGeneration
except:
            warnings.warn('Please install the latest version of transformers. '
                          'You can install it with `pip install transformers==4.42.0` '
                          'or `pip install --upgrade git+https://github.com/huggingface/transformers.git`.')
sys.exit(-1)
assert model_path is not None
self.model_path = model_path
self.model = VideoLlavaForConditionalGeneration.from_pretrained(model_path)
self.model.eval().cuda()
self.processor = VideoLlavaProcessor.from_pretrained(model_path)
self.kwargs = kwargs
self.nframe = 8
torch.cuda.empty_cache()
def generate_inner(self, message, dataset=None):
import av
question, video = self.message_to_promptvideo(message)
container = av.open(video)
        # uniformly sample self.nframe frames from the video
total_frames = container.streams.video[0].frames
indices = np.arange(0, total_frames, total_frames / self.nframe).astype(int)
clip = read_video_pyav(container, indices)
prompt = f'USER: <video>\n{question} ASSISTANT:'
inputs = self.processor(text=prompt, videos=clip, return_tensors='pt').to(self.model.device)
        # default generation args; overridden by any values passed via self.kwargs
generation_args = {
'max_new_tokens': 1024,
'temperature': 0.2,
'do_sample': True,
}
generation_args.update(self.kwargs)
generate_ids = self.model.generate(**inputs, **generation_args)
generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
response = self.processor.batch_decode(
generate_ids,
skip_special_tokens=True,
clean_up_tokenization_spaces=False
)[0]
return response
class VideoLLaVA(BaseModel):
INSTALL_REQ = True
INTERLEAVE = False
VIDEO_LLM = True
def __init__(self, model_path='LanguageBind/Video-LLaVA-7B', **kwargs):
assert model_path is not None
try:
from videollava.conversation import conv_templates, SeparatorStyle
from videollava.constants import DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX
from videollava.constants import DEFAULT_VID_START_TOKEN, DEFAULT_VID_END_TOKEN
from videollava.mm_utils import get_model_name_from_path, tokenizer_image_token, KeywordsStoppingCriteria
from videollava.model.builder import load_pretrained_model
from videollava.model.language_model.llava_llama import LlavaLlamaForCausalLM
from videollava.train.train import smart_tokenizer_and_embedding_resize
except:
warnings.warn('Please install Video-LLaVA from https://github.com/FangXinyu-0913/Video-LLaVA.')
sys.exit(-1)
model_base = None
model_name = model_path.split('/')[-1]
tokenizer, model, processor, context_len = load_pretrained_model(model_path, model_base, model_name)
self.tokenizer = tokenizer
self.model = model
self.processor = processor
self.context_len = context_len
self.kwargs = kwargs
self.nframe = 8
def get_model_output(self, model, video_processor, tokenizer, video, qs):
from videollava.conversation import conv_templates, SeparatorStyle
from videollava.constants import DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX
from videollava.constants import DEFAULT_VID_START_TOKEN, DEFAULT_VID_END_TOKEN
from videollava.mm_utils import tokenizer_image_token, KeywordsStoppingCriteria
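        # Build the multimodal prompt: one <image> placeholder per sampled frame (8 here);
        # if the checkpoint was trained with explicit boundary tokens, the frame tokens are
        # additionally wrapped in DEFAULT_VID_START_TOKEN / DEFAULT_VID_END_TOKEN.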
if model.config.mm_use_im_start_end:
qs = DEFAULT_VID_START_TOKEN + ''.join([DEFAULT_IMAGE_TOKEN] * 8) + DEFAULT_VID_END_TOKEN + '\n' + qs
else:
qs = ''.join([DEFAULT_IMAGE_TOKEN] * 8) + '\n' + qs
conv_mode = 'llava_v1'
device = torch.device('cuda')
conv = conv_templates[conv_mode].copy()
conv.append_message(conv.roles[0], qs)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()
video_tensor = video_processor.preprocess(video, return_tensors='pt')['pixel_values'][0].half().to(device)
input_ids = tokenizer_image_token(prompt, tokenizer,
IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(device)
stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
keywords = [stop_str]
stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
with torch.inference_mode():
output_ids = model.generate(
input_ids,
images=[video_tensor],
do_sample=True,
temperature=0.2,
max_new_tokens=1024,
use_cache=True,
stopping_criteria=[stopping_criteria])
input_token_len = input_ids.shape[1]
n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
if n_diff_input_output > 0:
print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]
outputs = outputs.strip()
if outputs.endswith(stop_str):
outputs = outputs[:-len(stop_str)]
outputs = outputs.strip()
return outputs
def generate_inner(self, message, dataset=None):
question, video = self.message_to_promptvideo(message)
response = self.get_model_output(self.model, self.processor['video'], self.tokenizer, video, question)
return response
import torch
import warnings
import copy as cp
import numpy as np
import sys
import os.path as osp
import os
import requests
import shutil
import huggingface_hub
from transformers import StoppingCriteria, StoppingCriteriaList
from huggingface_hub import snapshot_download
from PIL import Image
from torchvision.transforms import PILToTensor
from torchvision import transforms
from peft import get_peft_model, LoraConfig, TaskType
from ..base import BaseModel
from ...smp import *
from ...dataset import DATASET_TYPE
def get_prompt(conv):
ret = conv.system + conv.sep
for role, message in conv.messages:
if message:
ret += role + ' ' + message + ' ' + conv.sep
else:
ret += role
return ret
def get_prompt2(conv):
ret = conv.system + conv.sep
count = 0
for role, message in conv.messages:
count += 1
if count == len(conv.messages):
ret += role + ' ' + message
else:
if message:
ret += role + ' ' + message + ' ' + conv.sep
else:
ret += role
return ret
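# The two helpers differ only in how the final turn is terminated: get_prompt appends conv.sep
# after every completed message, while get_prompt2 leaves the last message (typically an answer
# prefix such as 'Best option:(') without a trailing separator so generation continues directly
# from it. Illustrative output, assuming roles ('[INST]', '[/INST]'), sep '' and a single user
# turn: " [INST] <question> [/INST] Best option:(".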
class StoppingCriteriaSub(StoppingCriteria):
def __init__(self, stops=[], encounters=1):
super().__init__()
self.stops = stops
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
for stop in self.stops:
if torch.all((stop == input_ids[0][-len(stop):])).item():
return True
return False
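# The stop sequences used below ([2] and [29871, 2]) are token-id patterns for the Mistral
# '</s>' EOS token, which the SentencePiece tokenizer can emit either on its own or preceded
# by token 29871 (the leading-space piece); generation halts once either pattern ends the output.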
class VideoChat2_HD(BaseModel):
INSTALL_REQ = True
INTERLEAVE = False
VIDEO_LLM = True
def __init__(self, model_path='OpenGVLab/VideoChat2_HD_stage4_Mistral_7B',
root='./Ask-Anything', config_file='./configs/videochat2_hd.json',
**kwargs):
self.config_file = config_file
self.root = root
self.model_path = model_path
if root is None:
            warnings.warn('Please set `root` to the Ask-Anything directory, '
                          'cloned from https://github.com/OpenGVLab/Ask-Anything.')
sys.exit(-1)
sys.path.append(osp.join(root, 'video_chat2'))
try:
from utils.config import Config
from utils.easydict import EasyDict
from models import VideoChat2_it_hd_mistral
from dataset.hd_utils import HD_transform_padding, HD_transform_no_padding
except:
raise ImportError(
                'Please first install the VideoChat2 requirements and set `root` to the Ask-Anything '
                'repository, cloned from https://github.com/OpenGVLab/Ask-Anything.'
)
cfg = Config.from_file(self.config_file)
def download_file(url, pth):
destination_folder = pth
            # ensure the destination folder exists
if not os.path.exists(destination_folder):
os.makedirs(destination_folder)
            # derive the filename from the URL
filename = os.path.basename(url)
destination_path = os.path.join(destination_folder, filename)
if os.path.exists(destination_path):
                print(f'File already exists at {destination_path}; skipping download.')
return
            # download the file
response = requests.get(url, stream=True)
if response.status_code == 200:
with open(destination_path, 'wb') as file:
response.raw.decode_content = True
shutil.copyfileobj(response.raw, file)
print(f'File downloaded and saved to {destination_path}')
else:
print(f'Download failed, status code: {response.status_code}')
hf_token = os.environ.get('HUGGINGFACE_TOKEN')
huggingface_hub.login(hf_token)
videochat2_model_path = snapshot_download(repo_id=cfg.model.videochat2_model_path, repo_type='model')
cfg.model.videochat2_model_path = osp.join(videochat2_model_path, 'videochat2_mistral_7b_stage2.pth')
mistral_model_path = snapshot_download(repo_id=cfg.model.mistral_model_path, repo_type='model')
cfg.model.mistral_model_path = mistral_model_path
vit_blip_model_path = snapshot_download(repo_id=cfg.model.vit_blip_model_path, repo_type='model')
cfg.model.vit_blip_model_path = osp.join(vit_blip_model_path, 'umt_l16_qformer.pth')
model = VideoChat2_it_hd_mistral(config=cfg.model)
peft_config = LoraConfig(
task_type=TaskType.CAUSAL_LM, inference_mode=False,
r=16, lora_alpha=32, lora_dropout=0.,
target_modules=[
'q_proj', 'k_proj', 'v_proj', 'o_proj',
'gate_proj', 'up_proj', 'down_proj', 'lm_head'
]
)
model.mistral_model = get_peft_model(model.mistral_model, peft_config)
stage4_model_path = snapshot_download(repo_id=model_path, repo_type='model')
state_dict = torch.load(osp.join(stage4_model_path, 'videochat2_hd_mistral_7b_stage4.pth'), 'cuda')
if 'model' in state_dict.keys():
model.load_state_dict(state_dict['model'], strict=False)
else:
model.load_state_dict(state_dict, strict=False)
model = model.to(torch.device('cuda'))
model = model.eval()
self.model = model
# position embedding
self.nframe = 16
self.resolution = 224
self.hd_num = 6
new_pos_emb = self.get_sinusoid_encoding_table(
n_position=(self.resolution // 16) ** 2 * self.nframe,
cur_frame=self.nframe
)
self.model.vision_encoder.encoder.pos_embed = new_pos_emb
self.hd_transform = HD_transform_no_padding
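        # ImageNet mean/std statistics; frames are scaled to [0, 1] and then normalised below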
mean = (0.485, 0.456, 0.406)
std = (0.229, 0.224, 0.225)
self.transform = transforms.Compose([
transforms.Lambda(lambda x: x.float().div(255.0)),
transforms.Normalize(mean, std)
])
def get_sinusoid_encoding_table(self, n_position=784, d_hid=1024,
cur_frame=8, ckpt_num_frame=4,
pre_n_position=784):
''' Sinusoid position encoding table '''
# TODO: make it with torch instead of numpy
def get_position_angle_vec(position):
return [position / np.power(10000, 2 * (hid_j // 2) / d_hid) for hid_j in range(d_hid)]
# generate checkpoint position embedding
sinusoid_table = np.array([get_position_angle_vec(pos_i) for pos_i in range(pre_n_position)])
sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i
sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1
sinusoid_table = torch.tensor(sinusoid_table, dtype=torch.float, requires_grad=False).unsqueeze(0)
print(f'n_position: {n_position}')
print(f'pre_n_position: {pre_n_position}')
if n_position != pre_n_position:
T = ckpt_num_frame # checkpoint frame
P = 14 # checkpoint size
C = d_hid
new_P = int((n_position // cur_frame) ** 0.5) # testing size
if new_P != 14:
print(f'Pretraining uses 14x14, but current version is {new_P}x{new_P}')
print('Interpolate the position embedding')
sinusoid_table = sinusoid_table.reshape(-1, T, P, P, C)
sinusoid_table = sinusoid_table.reshape(-1, P, P, C).permute(0, 3, 1, 2)
sinusoid_table = torch.nn.functional.interpolate(
sinusoid_table, size=(new_P, new_P), mode='bicubic', align_corners=False)
# BT, C, H, W -> BT, H, W, C -> B, T, H, W, C
sinusoid_table = sinusoid_table.permute(0, 2, 3, 1).reshape(-1, T, new_P, new_P, C)
sinusoid_table = sinusoid_table.flatten(1, 3) # B, THW, C
if cur_frame != ckpt_num_frame:
print(f'Pretraining uses 4 frames, but current frame is {cur_frame}')
print('Interpolate the position embedding')
T = ckpt_num_frame # checkpoint frame
new_T = cur_frame # testing frame
# interpolate
P = int((n_position // cur_frame) ** 0.5) # testing size
C = d_hid
sinusoid_table = sinusoid_table.reshape(-1, T, P, P, C)
sinusoid_table = sinusoid_table.permute(0, 2, 3, 4, 1).reshape(-1, C, T) # BHW, C, T
sinusoid_table = torch.nn.functional.interpolate(sinusoid_table, size=new_T, mode='linear')
sinusoid_table = sinusoid_table.reshape(1, P, P, C, new_T).permute(0, 4, 1, 2, 3) # B, T, H, W, C
sinusoid_table = sinusoid_table.flatten(1, 3) # B, THW, C
return sinusoid_table
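    # Rough arithmetic behind the table sizes with the defaults above: resolution=224 and a
    # 16-pixel ViT patch give 14 * 14 = 196 positions per frame, so n_position = 196 * 16
    # frames = 3136, while the checkpoint table has pre_n_position = 784 = 196 * 4 frames.
    # The spatial grid already matches (14 x 14), so the resize there is effectively a no-op
    # and the meaningful interpolation is along the temporal axis, from 4 to 16 frames.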
def get_index(self, bound, fps, max_frame, first_idx=0):
if bound:
start, end = bound[0], bound[1]
else:
start, end = -100000, 100000
start_idx = max(first_idx, round(start * fps))
end_idx = min(round(end * fps), max_frame)
seg_size = float(end_idx - start_idx) / self.nframe
frame_indices = np.array([
int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
for idx in range(self.nframe)
])
return frame_indices
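    # Illustrative example (assumed values): with no bound, max_frame=320 and self.nframe=16,
    # seg_size = 320 / 16 = 20 and the indices land at segment midpoints: [10, 30, 50, ..., 310].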
def read_video(self, video_path, bound=None):
from decord import VideoReader, cpu
vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
max_frame = len(vr) - 1
fps = float(vr.get_avg_fps())
frame_indices = self.get_index(bound, fps, max_frame, first_idx=0)
frames = vr.get_batch(frame_indices)
frames = frames.permute(0, 3, 1, 2)
frames = self.hd_transform(frames.float(), image_size=self.resolution, hd_num=self.hd_num)
torch_imgs = self.transform(frames)
return torch_imgs
def ask(self, text, conv):
conv.messages.append([conv.roles[0], text])
def get_context_emb(self, conv, model, img_list, answer_prompt=None, print_res=False):
if answer_prompt:
prompt = get_prompt2(conv)
else:
prompt = get_prompt(conv)
if print_res:
print(prompt)
if '<VideoHere>' in prompt:
prompt_segs = prompt.split('<VideoHere>')
else:
prompt_segs = prompt.split('<ImageHere>')
assert len(prompt_segs) == len(img_list) + 1, 'Unmatched numbers of image placeholders and images.'
with torch.no_grad():
seg_tokens = [
model.mistral_tokenizer(
seg, return_tensors='pt', add_special_tokens=i == 0).to('cuda').input_ids
                # add BOS only to the first segment
for i, seg in enumerate(prompt_segs)
]
seg_embs = [model.mistral_model.base_model.model.model.embed_tokens(seg_t) for seg_t in seg_tokens]
# seg_embs = [model.mistral_model.model.embed_tokens(seg_t) for seg_t in seg_tokens]
mixed_embs = [emb for pair in zip(seg_embs[:-1], img_list) for emb in pair] + [seg_embs[-1]]
mixed_embs = torch.cat(mixed_embs, dim=1)
return mixed_embs
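    # get_context_emb splits the prompt at the '<VideoHere>' / '<ImageHere>' placeholder,
    # embeds each text segment with the LLM's token embedding (BOS only on the first segment),
    # and interleaves those embeddings with the precomputed video embeddings so the whole
    # sequence can be fed to generate() via inputs_embeds.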
def answer(self, conv, model, img_list, do_sample=True, max_new_tokens=500, num_beams=1, min_length=1, top_p=0.9,
repetition_penalty=1.0, length_penalty=1, temperature=1.0, answer_prompt=None, print_res=False):
stop_words_ids = [
torch.tensor([2]).to('cuda'),
torch.tensor([29871, 2]).to('cuda')] # '</s>' can be encoded in two different ways.
stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])
conv.messages.append([conv.roles[1], answer_prompt])
embs = self.get_context_emb(conv, model, img_list, answer_prompt=answer_prompt, print_res=print_res)
with torch.no_grad():
outputs = model.mistral_model.generate(
inputs_embeds=embs,
max_new_tokens=max_new_tokens,
stopping_criteria=stopping_criteria,
num_beams=num_beams,
do_sample=do_sample,
min_length=min_length,
top_p=top_p,
repetition_penalty=repetition_penalty,
length_penalty=length_penalty,
temperature=temperature,
)
output_token = outputs[0]
        if output_token[0] == 0:  # the model might output an unknown <unk> token at the beginning; remove it
output_token = output_token[1:]
        if output_token[0] == 1:  # a start token <s> may also appear at the beginning; remove it
output_token = output_token[1:]
output_text = model.mistral_tokenizer.decode(output_token, add_special_tokens=False)
        output_text = output_text.split('</s>')[0]  # remove the trailing stop token </s>
# output_text = output_text.split('[/INST]')[-1].strip()
conv.messages[-1][1] = output_text + '</s>'
return output_text, output_token.cpu().numpy()
def infer_data(
self, data_sample, system=' ',
        question_prompt='',  # appended to the end of the question
        answer_prompt=None,  # prepended to the beginning of the answer
        system_q=False,  # whether to add the question to the QFormer system prompt
print_res=True,
system_llm=False
):
        assert system_q is False, 'system_q is not supported yet'
video = data_sample['video']
T_, C, H, W = video.shape
video = video.reshape(1, T_, C, H, W).to('cuda')
video_list = []
with torch.no_grad():
if system_q:
raise NotImplementedError
else:
video_emb, _, _ = self.model.encode_img(video, system)
video_list.append(video_emb[0])
question = data_sample['question']
from utils.easydict import EasyDict
chat = EasyDict({
'system': system,
'roles': ('[INST]', '[/INST]'),
'messages': [],
'sep': ''
})
if data_sample['subtitle'] != '':
subtitle = f"This video's subtitles are listed below: {data_sample['subtitle']}"
chat.messages.append([chat.roles[0], f'{subtitle}\n<Video><VideoHere></Video> [/INST]'])
else:
chat.messages.append([chat.roles[0], '<Video><VideoHere></Video> [/INST]'])
if system_llm:
prompt = system + question + question_prompt
else:
prompt = question + question_prompt
self.ask(prompt, chat)
llm_message = self.answer(
conv=chat, model=self.model, do_sample=False,
img_list=video_list, max_new_tokens=100,
answer_prompt=answer_prompt, print_res=print_res
)[0]
return llm_message.strip()
def qa_template(self, data):
question = data.split('Answer:')[0].split('\n')[0] + '\n'
question += 'Options:\n'
choices = data.split('Answer:')[0].split('\n')[1:]
        choices = [item for item in choices if item != '']  # drop empty lines
for idx, c in enumerate(choices):
cur_choice, cur_text = c[0], c[3:]
question += f'({cur_choice}) {cur_text}\n'
question = question.rstrip()
return question
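    # Illustrative transformation (assuming options are formatted like 'A. running'):
    #   'What is the person doing?\nA. running\nB. cooking\nAnswer:'
    # becomes
    #   'What is the person doing?\nOptions:\n(A) running\n(B) cooking'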
def split_subtitle(self, data):
if 'This video\'s subtitles are listed below' in data:
            # locate the start and end of the subtitle block
start_marker = 'This video\'s subtitles are listed below:'
end_marker = 'Select the best answer to the following multiple-choice question based on the video.'
start_index = data.find(start_marker) + len(start_marker)
end_index = data.find(end_marker)
            # extract the subtitle text
subtitle = data[start_index:end_index].strip()
return subtitle
else:
return ''
def generate_inner(self, message, dataset=None):
if dataset == 'Video-MME':
_, video = self.message_to_promptvideo(message)
torch_imgs = self.read_video(video)
subtitle = self.split_subtitle(message[-2]['value'])
question = self.qa_template(message[-1]['value'])
example = {
'subtitle': subtitle,
'video': torch_imgs,
'question': question
}
pred_option = self.infer_data(
example,
' ',
question_prompt='\nOnly give the best option.',
answer_prompt='Best option:(',
system_q=False,
print_res=False,
system_llm=True
)
return_message = '(' + pred_option.split('\n')[0]
return return_message
elif dataset == 'MVBench' or dataset == 'MVBench_MP4':
_, video = self.message_to_promptvideo(message)
torch_imgs = self.read_video(video)
example = {
'subtitle': '',
'video': torch_imgs,
'question': message[1]['value']
}
pred_option = self.infer_data(
example,
message[0]['value'],
question_prompt='\nOnly give the best option.',
answer_prompt='Best option:(',
system_q=False,
print_res=False,
system_llm=True
)
return_message = '(' + pred_option.split('\n')[0]
return return_message
else:
question, video = self.message_to_promptvideo(message)
torch_imgs = self.read_video(video)
example = {
'subtitle': '',
'video': torch_imgs,
'question': f'Question:{question}\nAnswer:'
}
pred_result = self.infer_data(
example,
' ',
system_q=False,
print_res=False,
system_llm=False
)
return pred_result