"vscode:/vscode.git/clone" did not exist on "186f92042b5d16771ea416b442e842888037142f"
Commit 81028572 authored by luopl's avatar luopl
Browse files

init

parents
Pipeline #1722 canceled with stages
import torch
import sys
import os.path as osp
import warnings
from transformers import StoppingCriteriaList
from .base import BaseModel
class MiniGPT4(BaseModel):
INSTALL_REQ = True
INTERLEAVE = False
def __init__(self,
mode='v2',
root='/mnt/petrelfs/share_data/duanhaodong/MiniGPT-4/',
temperature=1,
max_out_len=512):
if root is None:
warnings.warn(
'Please set `root` to the local directory of MiniGPT-4, '
'cloned from https://github.com/Vision-CAIR/MiniGPT-4. '
)
if mode == 'v2':
cfg = 'minigptv2_eval.yaml'
elif mode == 'v1_7b':
cfg = 'minigpt4_7b_eval.yaml'
elif mode == 'v1_13b':
cfg = 'minigpt4_13b_eval.yaml'
else:
raise NotImplementedError
self.mode = mode
self.temperature = temperature
self.max_out_len = max_out_len
self.root = root
this_dir = osp.dirname(__file__)
self.cfg = osp.join(this_dir, 'misc', cfg)
sys.path.append(self.root)
from omegaconf import OmegaConf
from minigpt4.common.registry import registry
from minigpt4.conversation.conversation import StoppingCriteriaSub, CONV_VISION_Vicuna0, CONV_VISION_minigptv2
device = torch.cuda.current_device()
self.device = device
cfg_path = self.cfg
cfg = OmegaConf.load(cfg_path)
model_cfg = cfg.model
model_cfg.device_8bit = device
model_cls = registry.get_model_class(model_cfg.arch)
model = model_cls.from_config(model_cfg)
model = model.to(device)
model.eval()
vis_processor_cfg = cfg.datasets.cc_sbu_align.vis_processor.train
vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg)
self.model = model
self.vis_processor = vis_processor
self.CONV_VISION = CONV_VISION_minigptv2 if self.mode == 'v2' else CONV_VISION_Vicuna0
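# Stop-generation word ids for the v1 conversation template; in the LLaMA/Vicuna
# tokenizer these ids are assumed to decode to '###' ([835]) and '##' + '#'
# ([2277, 29937]), the turn separators used by CONV_VISION_Vicuna0.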
stop_words_ids = [[835], [2277, 29937]]
stop_words_ids = [torch.tensor(ids).to(device) for ids in stop_words_ids]
self.stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])
def generate_inner(self, message, dataset=None):
from minigpt4.conversation.conversation import Chat
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
if self.mode == 'v2':
chat = Chat(self.model, self.vis_processor, device=self.device)
else:
chat = Chat(self.model, self.vis_processor, device=self.device, stopping_criteria=self.stopping_criteria)
chat_state = self.CONV_VISION.copy()
img_list = []
_ = chat.upload_img(image_path, chat_state, img_list)
chat.encode_img(img_list)
chat.ask(prompt, chat_state)
with torch.inference_mode():
msg = chat.answer(conv=chat_state, img_list=img_list)[0]
return msg
import pprint
import numpy as np
import torch
import torchvision.transforms as T
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
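# Illustrative note (not part of the upstream module): build_transform(448) maps any PIL
# image to a float tensor of shape (3, 448, 448), resized with bicubic interpolation and
# normalized with the ImageNet mean/std defined above.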
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
best_ratio_diff = float('inf')
best_ratio = (1, 1)
area = width * height
for ratio in target_ratios:
target_aspect_ratio = ratio[0] / ratio[1]
ratio_diff = abs(aspect_ratio - target_aspect_ratio)
if ratio_diff < best_ratio_diff:
best_ratio_diff = ratio_diff
best_ratio = ratio
elif ratio_diff == best_ratio_diff:
if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
best_ratio = ratio
return best_ratio
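# Worked example (illustrative): for a 1000x750 image (aspect ratio ~1.33) and the
# candidate grids built by dynamic_preprocess below with max_num=12, the closest grid is
# (4, 3), i.e. 4 x 3 tiles of image_size each; the area check only breaks ties between
# grids whose aspect ratios are equally close.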
def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
orig_width, orig_height = image.size
aspect_ratio = orig_width / orig_height
# enumerate candidate tile grids (i x j) within the min/max block budget
target_ratios = set(
(i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
i * j <= max_num and i * j >= min_num)
target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
# find the closest aspect ratio to the target
target_aspect_ratio = find_closest_aspect_ratio(
aspect_ratio, target_ratios, orig_width, orig_height, image_size)
# calculate the target width and height
target_width = image_size * target_aspect_ratio[0]
target_height = image_size * target_aspect_ratio[1]
blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
# resize the image
resized_img = image.resize((target_width, target_height))
processed_images = []
for i in range(blocks):
box = (
(i % (target_width // image_size)) * image_size,
(i // (target_width // image_size)) * image_size,
((i % (target_width // image_size)) + 1) * image_size,
((i // (target_width // image_size)) + 1) * image_size
)
# split the image
split_img = resized_img.crop(box)
processed_images.append(split_img)
assert len(processed_images) == blocks
if use_thumbnail and len(processed_images) != 1:
thumbnail_img = image.resize((image_size, image_size))
processed_images.append(thumbnail_img)
return processed_images, target_aspect_ratio
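# Minimal usage sketch (hypothetical input, continuing the example above): a 1000x750 PIL
# image with image_size=448 and use_thumbnail=True is resized to 1792x1344, cut into
# 4 x 3 = 12 tiles of 448x448, and a 13th thumbnail of the whole image is appended:
#   tiles, grid = dynamic_preprocess(img, max_num=12, image_size=448, use_thumbnail=True)
#   # len(tiles) == 13, grid == (4, 3)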
def dynamic_preprocess2(image, min_num=1, max_num=12, prior_aspect_ratio=None, image_size=448, use_thumbnail=False):
orig_width, orig_height = image.size
aspect_ratio = orig_width / orig_height
# enumerate candidate tile grids (i x j) within the min/max block budget
target_ratios = set(
(i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
i * j <= max_num and i * j >= min_num)
target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
new_target_ratios = []
for i in target_ratios:
if prior_aspect_ratio[0] % i[0] or prior_aspect_ratio[1] % i[1]:
new_target_ratios.append(i)
# find the closest aspect ratio to the target
target_aspect_ratio = find_closest_aspect_ratio(
aspect_ratio, new_target_ratios, orig_width, orig_height, image_size)
# calculate the target width and height
target_width = image_size * target_aspect_ratio[0]
target_height = image_size * target_aspect_ratio[1]
blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
# resize the image
resized_img = image.resize((target_width, target_height))
processed_images = []
for i in range(blocks):
box = (
(i % (target_width // image_size)) * image_size,
(i // (target_width // image_size)) * image_size,
((i % (target_width // image_size)) + 1) * image_size,
((i // (target_width // image_size)) + 1) * image_size
)
# split the image
split_img = resized_img.crop(box)
processed_images.append(split_img)
assert len(processed_images) == blocks
if use_thumbnail and len(processed_images) != 1:
thumbnail_img = image.resize((image_size, image_size))
processed_images.append(thumbnail_img)
return processed_images
def load_image(image, input_size=448, min_num=1, max_num=12):
image = image.convert('RGB')
transform = build_transform(input_size=input_size)
images, target_aspect_ratio = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True,
min_num=min_num, max_num=max_num)
pixel_values = [transform(image) for image in images]
pixel_values = torch.stack(pixel_values)
return pixel_values, target_aspect_ratio
def load_image2(image, input_size=448, min_num=1, max_num=12, target_aspect_ratio=None):
image = image.convert('RGB')
transform = build_transform(input_size=input_size)
images = dynamic_preprocess2(image, image_size=input_size, use_thumbnail=True, min_num=min_num,
max_num=max_num, prior_aspect_ratio=target_aspect_ratio)
pixel_values = [transform(image) for image in images]
pixel_values = torch.stack(pixel_values)
return pixel_values
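# Together these loaders implement MiniMonkey's two-grid scheme: load_image picks a tile
# grid for the first pass and returns it as target_aspect_ratio, while load_image2 passes
# that grid as prior_aspect_ratio so dynamic_preprocess2 keeps only grids whose dimensions
# do not both divide the first grid; generate_vanilla below concatenates both tile sets.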
import warnings
from .base import BaseModel
from ..dataset import DATASET_TYPE
class MiniMonkey(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
def __init__(self, model_path='mx262/MiniMonkey', **kwargs):
assert model_path is not None
self.model_path = model_path
self.model_type = torch.bfloat16
self.model = AutoModel.from_pretrained(
self.model_path,
low_cpu_mem_usage=True,
trust_remote_code=True).eval().to(self.model_type).cuda()
self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True, use_fast=False)
self.kwargs = kwargs
warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
torch.cuda.empty_cache()
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
if dataset is None:
return self.generate_vanilla(image_path, prompt)
assert isinstance(dataset, str)
if DATASET_TYPE(dataset) in ('MCQ', 'Y/N') or dataset == 'HallusionBench':
return self.generate_multichoice(image_path, prompt)
else:
return self.generate_vanilla(image_path, prompt)
def generate_vanilla(self, image_path, prompt):
image = Image.open(image_path).convert('RGB')
pixel_values, target_aspect_ratio = load_image(image, min_num=4, max_num=12)
pixel_values = pixel_values.cuda().to(self.model_type)
pixel_values2 = load_image2(image, min_num=3, max_num=7, target_aspect_ratio=target_aspect_ratio)
pixel_values2 = pixel_values2.cuda().to(self.model_type)
pixel_values = torch.cat([pixel_values2[:-1], pixel_values[:-1], pixel_values2[-1:]], 0)
generation_config = dict(do_sample=False, max_new_tokens=512)
response, history = self.model.chat(self.tokenizer, pixel_values,
target_aspect_ratio, prompt, generation_config,
history=None, return_history=True)
return response
def generate_multichoice(self, image_path, prompt):
return self.generate_vanilla(image_path, prompt)
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
model:
arch: instruct_vicuna13b
load_finetuned: False
load_pretrained: True
pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna13b_trimmed.pth"
finetuned: ""
# vit encoder
image_size: 224
drop_path_rate: 0
use_grad_checkpoint: False
vit_precision: "fp16"
freeze_vit: True
# Q-Former
num_query_token: 32
# path to Vicuna checkpoint
llm_model: "Please set the path to your vicuna-13b-v1.1"
# generation configs
prompt: ""
preprocess:
vis_processor:
train:
name: "blip2_image_train"
image_size: 224
eval:
name: "blip_image_eval"
image_size: 224
text_processor:
train:
name: "blip_caption"
eval:
name: "blip_caption"
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
model:
arch: instruct_vicuna7b
load_finetuned: False
load_pretrained: True
pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna7b_trimmed.pth"
finetuned: ""
# vit encoder
image_size: 224
drop_path_rate: 0
use_grad_checkpoint: False
vit_precision: "fp16"
freeze_vit: True
# Q-Former
num_query_token: 32
# path to Vicuna checkpoint
llm_model: "Please set the path to your vicuna-7b-v1.1"
# generation configs
prompt: ""
preprocess:
vis_processor:
train:
name: "blip2_image_train"
image_size: 224
eval:
name: "blip_image_eval"
image_size: 224
text_processor:
train:
name: "blip_caption"
eval:
name: "blip_caption"
model:
arch: minigpt4
model_type: pretrain_vicuna_7b
max_txt_len: 160
end_sym: "###"
low_resource: True
prompt_template: '###Human: {} ###Assistant: '
ckpt: "please set this value to the path of pretrained checkpoint"
# vit encoder
image_size: 224
drop_path_rate: 0
use_grad_checkpoint: False
vit_precision: "fp16"
freeze_vit: True
freeze_qformer: True
# Q-Former
num_query_token: 32
# generation configs
prompt: ""
llama_model: "please set this value to the path of vicuna-13b-v0"
datasets:
cc_sbu_align:
vis_processor:
train:
name: "blip2_image_eval"
image_size: 224
text_processor:
train:
name: "blip_caption"
run:
task: image_text_pretrain
model:
arch: minigpt4
model_type: pretrain_vicuna_7b
max_txt_len: 160
end_sym: "###"
low_resource: True
prompt_template: '###Human: {} ###Assistant: '
ckpt: "please set this value to the path of pretrained checkpoint"
# vit encoder
image_size: 224
drop_path_rate: 0
use_grad_checkpoint: False
vit_precision: "fp16"
freeze_vit: True
freeze_qformer: True
# Q-Former
num_query_token: 32
# generation configs
prompt: ""
llama_model: "please set this value to the path of vicuna-7b-v0"
datasets:
cc_sbu_align:
vis_processor:
train:
name: "blip2_image_eval"
image_size: 224
text_processor:
train:
name: "blip_caption"
run:
task: image_text_pretrain
model:
arch: minigpt_v2
model_type: pretrain
max_txt_len: 160
end_sym: "</s>"
low_resource: True
prompt_template: '[INST] {} [/INST]'
ckpt: "please set this value to the path of pretrained checkpoint"
lora_r: 64
lora_alpha: 16
# vit encoder
image_size: 448
drop_path_rate: 0
use_grad_checkpoint: False
vit_precision: "fp16"
freeze_vit: True
# generation configs
prompt: ""
# LLM
llama_model: "please set this value to the path of llama2-chat-7b"
datasets:
cc_sbu_align:
vis_processor:
train:
name: "blip2_image_eval"
image_size: 448
text_processor:
train:
name: "blip_caption"
run:
task: image_text_pretrain
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image
import warnings
from .base import BaseModel
from ..smp import *
class LLama3Mixsense(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
def __init__(self, model_path='Zero-Vision/Llama-3-MixSenseV1_1', **kwargs):
assert model_path is not None
transformers.logging.set_verbosity_error()
transformers.logging.disable_progress_bar()
warnings.filterwarnings('ignore')
self.tokenizer = AutoTokenizer.from_pretrained(
model_path, trust_remote_code=True
)
self.model = AutoModelForCausalLM.from_pretrained(
model_path, trust_remote_code=True
).to('cuda').eval()
self.kwargs = kwargs
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message)
input_ids = self.model.text_process(prompt, self.tokenizer).to(device='cuda')
image = Image.open(image_path).convert('RGB')
image_tensor = self.model.image_process([image]).to(dtype=self.model.dtype, device='cuda')
# generate
with torch.inference_mode():
output_ids = self.model.generate(
input_ids,
images=image_tensor,
max_new_tokens=2048,
use_cache=True,
eos_token_id=[
self.tokenizer.eos_token_id,
self.tokenizer.convert_tokens_to_ids(['<|eot_id|>'])[0],
],
)
return self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel
import warnings
from .base import BaseModel
from PIL import Image
from ..smp import *
from ..dataset import DATASET_TYPE
import pandas as pd
import string
import torchvision.transforms as T
import transformers
from torchvision.transforms.functional import InterpolationMode
class MMAlaya(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
def __init__(self, model_path='DataCanvas/MMAlaya', **kwargs):
assert model_path is not None
self.model_path = model_path
self.tokenizer = AutoTokenizer.from_pretrained(
model_path, trust_remote_code=True
)
model = AutoModelForCausalLM.from_pretrained(
model_path, device_map='cpu', trust_remote_code=True
).eval()
# the model needs its tokenizer initialized before use
model.initialize_tokenizer(self.tokenizer)
self.model = model.cuda()
self.kwargs = kwargs
warnings.warn(
f'Following kwargs received: {self.kwargs}, will use as generation config. '
)
torch.cuda.empty_cache()
def generate_inner(self, message, dataset=None):
# read image
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
image = Image.open(image_path).convert('RGB')
# tokenize the prompt and preprocess the image
input_ids, image_tensor, stopping_criteria = self.model.prepare_for_inference(
prompt, self.tokenizer, image, return_tensors='pt'
)
with torch.inference_mode():
output_ids = self.model.generate(
inputs=input_ids.cuda(),
images=image_tensor.cuda(),
do_sample=False,
max_new_tokens=512,
num_beams=1,
use_cache=True,
stopping_criteria=[stopping_criteria],
)
# drop the prompt tokens from the generated ids, then decode to text
input_token_len = input_ids.shape[1]
response = self.tokenizer.batch_decode(
output_ids[:, input_token_len:].cpu(),
skip_special_tokens=True,
clean_up_tokenization_spaces=False,
)[0].strip()
return response
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose(
[
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD),
]
)
return transform
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
best_ratio_diff = float('inf')
best_ratio = (1, 1)
area = width * height
for ratio in target_ratios:
target_aspect_ratio = ratio[0] / ratio[1]
ratio_diff = abs(aspect_ratio - target_aspect_ratio)
if ratio_diff < best_ratio_diff:
best_ratio_diff = ratio_diff
best_ratio = ratio
elif ratio_diff == best_ratio_diff:
if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
best_ratio = ratio
return best_ratio
def dynamic_preprocess(
image, min_num=1, max_num=6, image_size=448, use_thumbnail=False
):
orig_width, orig_height = image.size
aspect_ratio = orig_width / orig_height
# enumerate candidate tile grids (i x j) within the min/max block budget
target_ratios = set(
(i, j)
for n in range(min_num, max_num + 1)
for i in range(1, n + 1)
for j in range(1, n + 1)
if i * j <= max_num and i * j >= min_num
)
target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
# find the closest aspect ratio to the target
target_aspect_ratio = find_closest_aspect_ratio(
aspect_ratio, target_ratios, orig_width, orig_height, image_size
)
# calculate the target width and height
target_width = image_size * target_aspect_ratio[0]
target_height = image_size * target_aspect_ratio[1]
blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
# resize the image
resized_img = image.resize((target_width, target_height))
processed_images = []
for i in range(blocks):
box = (
(i % (target_width // image_size)) * image_size,
(i // (target_width // image_size)) * image_size,
((i % (target_width // image_size)) + 1) * image_size,
((i // (target_width // image_size)) + 1) * image_size,
)
# split the image
split_img = resized_img.crop(box)
processed_images.append(split_img)
assert len(processed_images) == blocks
if use_thumbnail and len(processed_images) != 1:
thumbnail_img = image.resize((image_size, image_size))
processed_images.append(thumbnail_img)
return processed_images
def load_image(image_file, input_size=448, max_num=6, upscale=False):
image = Image.open(image_file).convert('RGB')
if upscale:
image = image.resize((image.width * 2, image.height * 2), Image.BILINEAR)
transform = build_transform(input_size=input_size)
images = dynamic_preprocess(
image, image_size=input_size, use_thumbnail=True, max_num=max_num
)
pixel_values = [transform(image) for image in images]
pixel_values = torch.stack(pixel_values)
return pixel_values
# Split the LLM layers of large InternVL2 variants (8B / 26B / 40B / Llama3-76B) across GPUs
def split_model(model_name):
import math
device_map = {}
num_gpus = torch.cuda.device_count()
rank, world_size = get_rank_and_world_size()
num_gpus = num_gpus // world_size
assert num_gpus >= 1
if num_gpus == 1:
return device_map
num_layers = {'InternVL2-8B': 32, 'InternVL2-26B': 48,
'InternVL2-40B': 60, 'InternVL2-Llama3-76B': 80}[model_name]
# The first GPU also hosts the ViT, so count it as only 0.2 of a GPU for LLM layers.
num_layers_per_gpu = math.ceil(num_layers / (num_gpus - 0.8))
num_layers_per_gpu = [num_layers_per_gpu] * num_gpus
num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.2)
layer_cnt = 0
for i, num_layer in enumerate(num_layers_per_gpu):
for j in range(num_layer):
device_map[f'language_model.model.layers.{layer_cnt}'] = rank + world_size * i
layer_cnt += 1
device_map['vision_model'] = rank
device_map['mlp1'] = rank
device_map['language_model.model.tok_embeddings'] = rank
device_map['language_model.model.embed_tokens'] = rank
device_map['language_model.output'] = rank
device_map['language_model.model.norm'] = rank
device_map['language_model.lm_head'] = rank
device_map[f'language_model.model.layers.{num_layers - 1}'] = rank
return device_map
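# Worked example (hypothetical single-process run on 2 GPUs with 'InternVL2-26B',
# 48 layers): num_layers_per_gpu becomes [8, 40], so layers 0-7 land on GPU 0 and
# layers 8-47 on GPU 1, after which the vision tower, mlp1, embeddings, norm, output
# head and the last LLM layer are pinned back to GPU 0.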
class MMAlaya2(BaseModel):
"""
This implementation fine-tunes 20 LoRA modules based on the InternVL-Chat-V1-5 model.
The fine-tuned LoRA modules are then merged with the InternVL-Chat-V1-5 model
using the PEFT model merging method, TIES.
The code is based on the implementation in `vlmeval/vlm/internvl_chat.py`.
"""
INSTALL_REQ = False
INTERLEAVE = True
def __init__(
self,
model_path='DataCanvas/MMAlaya2',
load_in_8bit=False,
**kwargs,
):
assert model_path is not None
assert version_cmp(transformers.__version__, '4.36.2', 'ge')
self.model_path = model_path
self.tokenizer = AutoTokenizer.from_pretrained(
model_path, trust_remote_code=True, use_fast=False
)
# Regular expression to match the pattern "Image" followed by a number, e.g. Image1
self.pattern = r'Image(\d+)'
# Replacement pattern to insert a hyphen between "Image" and the number, e.g. Image-1
self.replacement = r'Image-\1'
# Convert InternVL2 response to dataset format
# e.g. Image1 -> Image-1
# Regular expression to match the pattern "Image-" followed by a number
self.reverse_pattern = r'Image-(\d+)'
# Replacement pattern to remove the hyphen (Image-1 -> Image1)
self.reverse_replacement = r'Image\1'
device_map = split_model('InternVL2-26B')
if len(device_map) == 0:
device_map = {'': 'cuda'}
self.model = AutoModel.from_pretrained(
model_path,
torch_dtype=torch.bfloat16,
trust_remote_code=True,
load_in_8bit=load_in_8bit,
device_map=device_map
).eval()
self.image_size = self.model.config.vision_config.image_size
kwargs_default = dict(
do_sample=False, max_new_tokens=1024, top_p=None, num_beams=1
)
kwargs_default.update(kwargs)
self.kwargs = kwargs_default
warnings.warn(
f'Following kwargs received: {self.kwargs}, will use as generation config. '
)
def use_custom_prompt(self, dataset):
assert dataset is not None
if listinstr(['MMDU', 'MME-RealWorld', 'MME-RealWorld-CN'], dataset):
# No custom prompt for these multi-turn / real-world datasets
return False
else:
return True
def build_multi_choice_prompt(self, line, dataset=None):
question = line['question']
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
if hint is not None:
question = hint + '\n' + question
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
for key, item in options.items():
question += f'\n{key}. {item}'
prompt = question
if len(options):
prompt += (
'\n请直接回答选项字母。'
if cn_string(prompt)
else "\nAnswer with the option's letter from the given choices directly."
)
else:
prompt += (
'\n请直接回答问题。'
if cn_string(prompt)
else '\nAnswer the question directly.'
)
return prompt
def build_prompt(self, line, dataset=None):
assert self.use_custom_prompt(dataset)
assert dataset is None or isinstance(dataset, str)
tgt_path = self.dump_image(line, dataset)
if dataset is not None and listinstr(['MME'], dataset):
question = line['question']
prompt = question + ' Answer the question using a single word or phrase.'
elif dataset is not None and listinstr(['HallusionBench'], dataset):
question = line['question']
prompt = (
question
+ ' Please answer yes or no. Answer the question using a single word or phrase.'
)
elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ':
prompt = self.build_multi_choice_prompt(line, dataset)
elif dataset is not None and DATASET_TYPE(dataset) == 'VQA':
if listinstr(['MathVista', 'MathVision', 'MathVerse'], dataset):
prompt = line['question']
elif listinstr(['LLaVABench'], dataset):
question = line['question']
prompt = question + '\nAnswer this question in detail.'
elif listinstr(['MMVet'], dataset):
prompt = line['question']
else:
question = line['question']
prompt = question + '\nAnswer the question using a single word or phrase.'
else:
prompt = line['question']
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=s) for s in tgt_path])
return message
def set_max_num(self, dataset):
if dataset is not None and listinstr(['ChartQA_TEST', 'MMMU_DEV_VAL'], dataset):
self.max_num = 12
elif dataset is not None and listinstr(['DocVQA_VAL', 'DocVQA_TEST'], dataset):
self.max_num = 18
elif dataset is not None and listinstr(
['InfoVQA_VAL', 'InfoVQA_TEST', 'OCRBench'], dataset
):
self.max_num = 24
elif dataset is not None and listinstr(
['MMBench-Video', 'Video-MME', 'Video'], dataset
):
self.max_num = 1
else:
self.max_num = 6
def generate_inner(self, message, dataset=None):
self.set_max_num(dataset)
image_num = len([x for x in message if x['type'] == 'image'])
prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
if image_num > 1:
image_path = [x['value'] for x in message if x['type'] == 'image']
pixel_values_list = []
max_num = max(1, self.max_num // image_num)
for file_name in image_path:
pixel_values_list.append(load_image(file_name, max_num=max_num).cuda().to(torch.bfloat16))
pixel_values = torch.cat(pixel_values_list, dim=0)
elif image_num == 1:
image_path = [x['value'] for x in message if x['type'] == 'image'][0]
pixel_values = (
load_image(image_path, max_num=self.max_num).cuda().to(torch.bfloat16)
)
else:
pixel_values = None
with torch.no_grad():
response = self.model.chat(
self.tokenizer,
pixel_values=pixel_values,
question=prompt,
generation_config=self.kwargs,
# verbose=False,
)
return response
if __name__ == '__main__':
model = MMAlaya2(max_new_tokens=1024, do_sample=False)
response = model.generate_inner(
[
{'type': 'image', 'value': './assets/apple.jpg'},
{'type': 'text', 'value': '请详细描述一下这张图片。'},
]
)
print(response)
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import warnings
from .base import BaseModel
from ..dataset import DATASET_TYPE
class Monkey(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
def __init__(self, model_path='echo840/Monkey', **kwargs):
assert model_path is not None
self.model_path = model_path
self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path, device_map='cpu', trust_remote_code=True).eval()
self.model = model.cuda()
self.kwargs = kwargs
warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
torch.cuda.empty_cache()
def generate_vanilla(self, image_path, prompt):
cur_prompt = f'<img>{image_path}</img> {prompt} Answer: '
input_ids = self.tokenizer(cur_prompt, return_tensors='pt', padding='longest')
attention_mask = input_ids.attention_mask
input_ids = input_ids.input_ids
output_ids = self.model.generate(
input_ids=input_ids.cuda(),
attention_mask=attention_mask.cuda(),
do_sample=False,
num_beams=1,
max_new_tokens=512,
min_new_tokens=1,
length_penalty=1,
num_return_sequences=1,
output_hidden_states=True,
use_cache=True,
pad_token_id=self.tokenizer.eod_id,
eos_token_id=self.tokenizer.eod_id,
)
response = self.tokenizer.decode(
output_ids[0][input_ids.size(1):].cpu(),
skip_special_tokens=True
).strip()
return response
def generate_multichoice(self, image_path, prompt):
cur_prompt = f'<img>{image_path}</img> \n {prompt} Answer: '
input_ids = self.tokenizer(cur_prompt, return_tensors='pt', padding='longest')
attention_mask = input_ids.attention_mask
input_ids = input_ids.input_ids
output_ids = self.model.generate(
input_ids=input_ids.cuda(),
attention_mask=attention_mask.cuda(),
do_sample=False,
num_beams=1,
max_new_tokens=10,
min_new_tokens=1,
length_penalty=1,
num_return_sequences=1,
output_hidden_states=True,
use_cache=True,
pad_token_id=self.tokenizer.eod_id,
eos_token_id=self.tokenizer.eod_id,
)
response = self.tokenizer.decode(
output_ids[0][input_ids.size(1):].cpu(),
skip_special_tokens=True
).strip()
return response
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
if dataset is None:
return self.generate_vanilla(image_path, prompt)
assert isinstance(dataset, str)
if DATASET_TYPE(dataset) in ('MCQ', 'Y/N') or dataset == 'HallusionBench':
return self.generate_multichoice(image_path, prompt)
else:
return self.generate_vanilla(image_path, prompt)
class MonkeyChat(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
def __init__(self, model_path='echo840/Monkey-Chat', **kwargs):
assert model_path is not None
self.model_path = model_path
self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path, device_map='cpu', trust_remote_code=True).eval()
self.model = model.cuda()
self.kwargs = kwargs
self.tokenizer.padding_side = 'left'
self.tokenizer.pad_token_id = self.tokenizer.eod_id
warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
torch.cuda.empty_cache()
def generate_vanilla(self, image_path, prompt):
cur_prompt = f'<img>{image_path}</img> {prompt} Answer: '
input_ids = self.tokenizer(cur_prompt, return_tensors='pt', padding='longest')
attention_mask = input_ids.attention_mask
input_ids = input_ids.input_ids
output_ids = self.model.generate(
input_ids=input_ids.cuda(),
attention_mask=attention_mask.cuda(),
do_sample=False,
num_beams=1,
max_new_tokens=512,
min_new_tokens=1,
length_penalty=1,
num_return_sequences=1,
output_hidden_states=True,
use_cache=True,
pad_token_id=self.tokenizer.eod_id,
eos_token_id=self.tokenizer.eod_id,
)
response = self.tokenizer.decode(
output_ids[0][input_ids.size(1):].cpu(),
skip_special_tokens=True
).strip()
return response
def generate_multichoice(self, image_path, prompt):
cur_prompt = f'<img>{image_path}</img> \n {prompt} Answer: '
input_ids = self.tokenizer(cur_prompt, return_tensors='pt', padding='longest')
attention_mask = input_ids.attention_mask
input_ids = input_ids.input_ids
output_ids = self.model.generate(
input_ids=input_ids.cuda(),
attention_mask=attention_mask.cuda(),
do_sample=False,
num_beams=1,
max_new_tokens=10,
min_new_tokens=1,
length_penalty=1,
num_return_sequences=1,
output_hidden_states=True,
use_cache=True,
pad_token_id=self.tokenizer.eod_id,
eos_token_id=self.tokenizer.eod_id,
)
response = self.tokenizer.decode(
output_ids[0][input_ids.size(1):].cpu(),
skip_special_tokens=True
).strip()
return response
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
if dataset is None:
return self.generate_vanilla(image_path, prompt)
assert isinstance(dataset, str)
if DATASET_TYPE(dataset) in ('MCQ', 'Y/N') or dataset == 'HallusionBench':
return self.generate_multichoice(image_path, prompt)
else:
return self.generate_vanilla(image_path, prompt)
import torch
import re
from PIL import Image
from abc import abstractproperty
import sys
import os.path as osp
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE
import copy
class Moondream1(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
def __init__(self,
model_path='vikhyatk/moondream1',
**kwargs):
try:
from transformers import AutoModelForCausalLM, CodeGenTokenizerFast as Tokenizer
except ImportError:
warnings.warn('Please install Transformers 4.36.2 ("pip install transformers==4.36.2") '
'and torchvision>=0.16 before using Moondream1.')
assert osp.exists(model_path) or splitlen(model_path) == 2
self.model = (
AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.float16)
.to(torch.device('cuda'))
)
self.tokenizer = Tokenizer.from_pretrained(model_path)
default_kwargs = dict(
max_new_tokens=512,
)
default_kwargs.update(kwargs)
self.kwargs = default_kwargs
warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
torch.cuda.empty_cache()
def generate_inner(self, message, dataset=None):
images = []
prompt = ''
for s in message:
if s['type'] == 'image':
images.append(s['value'])
elif s['type'] == 'text':
prompt += s['value']
images = [Image.open(s) for s in images]
enc_image = self.model.encode_image(images[0])
prompt_wtmpl = f'<image>\n\nQuestion: {prompt}\n\nAnswer: '
answer = self.model.generate(
enc_image, prompt_wtmpl, eos_text='<END>', tokenizer=self.tokenizer, **self.kwargs)[0]
cleaned_answer = re.sub('<$', '', re.sub('END$', '', answer)).strip()
return cleaned_answer
def use_custom_prompt(self, dataset):
assert dataset is not None
if listinstr(['MMMU'], dataset):
return False
if DATASET_TYPE(dataset) == 'MCQ' or dataset == 'MMVet':
return True
return False
def build_prompt(self, line, dataset=None):
assert dataset is None or isinstance(dataset, str)
assert self.use_custom_prompt(dataset)
tgt_path = self.dump_image(line, dataset)
question = line['question']
if dataset == 'MMVet':
prompt = question + '\nAnswer the question directly. '
elif DATASET_TYPE(dataset) == 'MCQ':
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
options_prompt = ''
for key, item in options.items():
options_prompt += f'{key}. {item}\n'
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
prompt = f'Hint: {hint}\n' if hint is not None else ''
prompt += f'{question}\n'
prompt += (
f'{options_prompt}\nAnswer with the option’s letter from the given choices directly. '
if len(options) else 'Answer the question directly. '
)
else:
raise NotImplementedError
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=s) for s in tgt_path])
return message
class Moondream2(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
def __init__(self,
model_path='vikhyatk/moondream2',
**kwargs):
try:
from transformers import AutoModelForCausalLM, AutoTokenizer
except ImportError:
warnings.warn('Please install Transformers 4.44.0 ("pip install transformers==4.44.0") '
'and torchvision>=0.16 before using Moondream2.')
assert osp.exists(model_path) or splitlen(model_path) == 2
flash_attn_flag = False
try:
import flash_attn
flash_attn_flag = True
except ImportError:
pass
if flash_attn_flag:
self.model = (
AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.float16,
attn_implementation='flash_attention_2')
.to(torch.device('cuda'))
)
else:
self.model = (
AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, torch_dtype=torch.float16)
.to(torch.device('cuda'))
)
self.tokenizer = AutoTokenizer.from_pretrained(model_path)
default_kwargs = dict(
max_new_tokens=512,
)
default_kwargs.update(kwargs)
self.kwargs = default_kwargs
warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
torch.cuda.empty_cache()
def generate_inner(self, message, dataset=None):
images = []
prompt = ''
for s in message:
if s['type'] == 'image':
images.append(s['value'])
elif s['type'] == 'text':
prompt += s['value']
images = [Image.open(s) for s in images]
enc_image = self.model.encode_image(images[0])
prompt_wtmpl = f'<image>\n\nQuestion: {prompt}\n\nAnswer: '
answer = self.model.generate(
enc_image, prompt_wtmpl, eos_text='<END>', tokenizer=self.tokenizer, **self.kwargs)[0]
cleaned_answer = re.sub('<$', '', re.sub('END$', '', answer)).strip()
return cleaned_answer
def use_custom_prompt(self, dataset):
assert dataset is not None
if listinstr(['MMMU'], dataset):
return False
if DATASET_TYPE(dataset) == 'MCQ' or dataset == 'MMVet':
return True
return False
def build_prompt(self, line, dataset=None):
assert dataset is None or isinstance(dataset, str)
assert self.use_custom_prompt(dataset)
tgt_path = self.dump_image(line, dataset)
question = line['question']
if dataset == 'MMVet':
prompt = question + '\nAnswer the question directly. '
elif DATASET_TYPE(dataset) == 'MCQ':
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
options_prompt = ''
for key, item in options.items():
options_prompt += f'{key}. {item}\n'
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
prompt = f'Hint: {hint}\n' if hint is not None else ''
prompt += f'{question}\n'
prompt += (
f'{options_prompt}\nAnswer with the option’s letter from the given choices directly. '
if len(options) else 'Answer the question directly. '
)
else:
raise NotImplementedError
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=s) for s in tgt_path])
return message
import sys
import torch
from PIL import Image
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE
class mPLUG_Owl2(BaseModel):
INSTALL_REQ = True
INTERLEAVE = False
def __init__(self, model_path='MAGAer13/mplug-owl2-llama2-7b', **kwargs):
try:
from mplug_owl2.model.builder import load_pretrained_model
from mplug_owl2.mm_utils import get_model_name_from_path
except ImportError:
warnings.warn('Please install mPLUG-Owl2 (the mplug_owl2 package) before using this model. ')
sys.exit(-1)
model_name = get_model_name_from_path(model_path)
tokenizer, model, image_processor, context_len = load_pretrained_model(
model_path, None, model_name, load_8bit=False, load_4bit=False, device='cpu')
self.model = model.cuda()
self.device = self.model.device
self.image_processor = image_processor
tokenizer.padding_side = 'left'
tokenizer.pad_token_id = tokenizer.eos_token_id
self.tokenizer = tokenizer
self.context_len = context_len
kwargs_default = dict(
max_new_tokens=512, do_sample=False, num_beams=1,
min_new_tokens=1, length_penalty=1, num_return_sequences=1)
kwargs_default.update(kwargs)
self.kwargs = kwargs_default
warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
def use_custom_prompt(self, dataset):
assert dataset is not None
if listinstr(['MMMU'], dataset):
return False
if DATASET_TYPE(dataset) == 'MCQ' or dataset == 'MMVet':
return True
return False
def build_prompt(self, line, dataset=None):
assert dataset is None or isinstance(dataset, str)
assert self.use_custom_prompt(dataset)
tgt_path = self.dump_image(line, dataset)
question = line['question']
if dataset == 'MMVet':
prompt = question + '\nAnswer the question directly. '
elif DATASET_TYPE(dataset) == 'MCQ':
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
options_prompt = ''
for key, item in options.items():
options_prompt += f'{key}. {item}\n'
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
prompt = f'Hint: {hint}\n' if hint is not None else ''
prompt += f'{question}\n'
prompt += (
f'{options_prompt}\nAnswer with the option’s letter from the given choices directly. '
if len(options) else 'Answer the question directly. '
)
else:
raise NotImplementedError
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=s) for s in tgt_path])
return message
def generate_inner(self, message, dataset=None):
from mplug_owl2.constants import IMAGE_TOKEN_INDEX
from mplug_owl2.mm_utils import process_images, tokenizer_image_token
kwargs = cp.deepcopy(self.kwargs)
if dataset in ['MMVet', 'LLaVABench']:
kwargs['length_penalty'] = 0
elif dataset is not None and DATASET_TYPE(dataset) == 'VQA':
kwargs['length_penalty'] = 0
elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ':
kwargs['max_new_tokens'] = 10
num_images = len([x for x in message if x['type'] == 'image'])
assert num_images >= 0
prompt_full = 'USER: '
images = []
if num_images == 1:
prompt, image = self.message_to_promptimg(message, dataset=dataset)
prompt_full += f'<|image|>{prompt} \nASSISTANT: '
images.append(image)
else:
for msg in message:
if msg['type'] == 'image':
images.append(msg['value'])
prompt_full += '<|image|>'
elif msg['type'] == 'text':
prompt_full += msg['value']
prompt_full += '\nASSISTANT: '
def preproc_image(fname):
image = Image.open(fname).convert('RGB')
max_edge = max(image.size)
image = image.resize((max_edge, max_edge))
return image
images = [preproc_image(fname) for fname in images]
image_tensor = process_images(images, self.image_processor)
image_tensor = image_tensor.to(self.device, dtype=torch.float16)
input_ids = tokenizer_image_token(
prompt_full, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(self.device)
with torch.inference_mode():
output_ids = self.model.generate(
input_ids=input_ids,
images=image_tensor,
output_hidden_states=True,
use_cache=True,
**kwargs)
answer = self.tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
return answer.split('</s>')[0]
import torch
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE
from torchvision import transforms
from transformers import AutoTokenizer, AutoModel
import io
import random
import numpy as np
import math
def get_frame_indices(num_frames, vlen, sample='rand', fix_start=None, input_fps=1, max_num_frames=-1):
if sample in ['rand', 'middle']:
acc_samples = min(num_frames, vlen)
# split the video into `acc_samples` intervals, and sample from each interval.
intervals = np.linspace(start=0, stop=vlen, num=acc_samples + 1).astype(int)
ranges = []
for idx, interv in enumerate(intervals[:-1]):
ranges.append((interv, intervals[idx + 1] - 1))
if sample == 'rand':
try:
frame_indices = [random.choice(range(x[0], x[1])) for x in ranges]
except:
frame_indices = np.random.permutation(vlen)[:acc_samples]
frame_indices.sort()
frame_indices = list(frame_indices)
elif fix_start is not None:
frame_indices = [x[0] + fix_start for x in ranges]
elif sample == 'middle':
frame_indices = [(x[0] + x[1]) // 2 for x in ranges]
else:
raise NotImplementedError
if len(frame_indices) < num_frames: # padded with last frame
padded_frame_indices = [frame_indices[-1]] * num_frames
padded_frame_indices[:len(frame_indices)] = frame_indices
frame_indices = padded_frame_indices
elif 'fps' in sample: # fps0.5, sequentially sample frames at 0.5 fps
output_fps = float(sample[3:])
duration = float(vlen) / input_fps
delta = 1 / output_fps # gap between frames, this is also the clip length each frame represents
frame_seconds = np.arange(0 + delta / 2, duration + delta / 2, delta)
frame_indices = np.around(frame_seconds * input_fps).astype(int)
frame_indices = [e for e in frame_indices if e < vlen]
if max_num_frames > 0 and len(frame_indices) > max_num_frames:
frame_indices = frame_indices[:max_num_frames]
# frame_indices = np.linspace(0 + delta / 2, duration + delta / 2, endpoint=False, num=max_num_frames)
elif 'interval' in sample:
if num_frames == 1:
frame_indices = [random.randint(0, vlen - 1)]
else:
# transform FPS
interval = 8
clip_length = num_frames * interval * input_fps / 30
max_idx = max(vlen - clip_length, 0)
start_idx = random.uniform(0, max_idx)
end_idx = start_idx + clip_length - 1
frame_indices = torch.linspace(start_idx, end_idx, num_frames)
frame_indices = torch.clamp(frame_indices, 0, vlen - 1).long().tolist()
else:
raise ValueError
return frame_indices
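# Worked example (illustrative): with sample='middle', num_frames=8 and vlen=80 the video
# is split into 8 equal ranges and the midpoint of each is taken, giving indices
# [4, 14, 24, 34, 44, 54, 64, 74]; with sample='fps0.5' frames are instead taken every
# 2 seconds of video time.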
def get_frame_indices_start_end(num_frames, vlen, fps, start_time, end_time):
start_idx = max(int(fps * start_time), 0) if start_time is not None and not math.isnan(start_time) else 0
end_idx = min(int(fps * end_time), vlen) if end_time is not None and not math.isnan(end_time) else vlen
clip_len = end_idx - start_idx
acc_samples = min(num_frames, clip_len)
# split the video into `acc_samples` intervals, and sample from each interval.
intervals = np.linspace(start=start_idx, stop=end_idx, num=acc_samples + 1).astype(int)
ranges = []
for idx, interv in enumerate(intervals[:-1]):
ranges.append((interv, intervals[idx + 1] - 1))
try:
frame_indices = [random.choice(range(x[0], x[1])) for x in ranges]
except:
frame_indices = np.random.permutation(list(range(start_idx, end_idx)))[:acc_samples]
frame_indices.sort()
frame_indices = list(frame_indices)
if len(frame_indices) < num_frames: # padded with last frame
padded_frame_indices = [frame_indices[-1]] * num_frames
padded_frame_indices[:len(frame_indices)] = frame_indices
frame_indices = padded_frame_indices
return frame_indices
def read_frames_decord(
video_path, width=None, height=None,
num_frames=8, sample='rand', fix_start=None,
max_num_frames=-1, start_time=None, end_time=None
):
import decord
decord.bridge.set_bridge('torch')
if video_path.lower().endswith('.webm'):
# workaround for .webm files: a large or auto num_threads can cause decode errors.
num_threads = 2
else:
num_threads = 0
if width is not None and height is not None:
video_reader = decord.VideoReader(video_path, width=width, height=height, num_threads=num_threads)
else:
video_reader = decord.VideoReader(video_path, num_threads=num_threads)
vlen = len(video_reader)
fps = video_reader.get_avg_fps()
if start_time and end_time:
frame_indices = get_frame_indices_start_end(
num_frames, vlen, fps, start_time, end_time
)
else:
frame_indices = get_frame_indices(
num_frames, vlen, sample=sample, fix_start=fix_start,
input_fps=fps, max_num_frames=max_num_frames
)
frames = video_reader.get_batch(frame_indices)
if isinstance(frames, torch.Tensor):
frames = frames.numpy() # (T, H, W, C), torch.uint8
else:
print(frames.shape)
frames = frames.asnumpy()
timestamp = {
'num_frames': len(frame_indices),
'timestamp': ', '.join([str(round(f / fps, 1)) for f in frame_indices])
}
return frames, timestamp
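# Illustrative usage (hypothetical path): read_frames_decord('clip.mp4', num_frames=8,
# sample='middle') returns an (8, H, W, 3) uint8 frame array together with a dict holding
# the number of sampled frames and their timestamps in seconds.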
class mPLUG_Owl3(BaseModel):
# No separate model module is required, but the dependencies must be met.
# https://github.com/X-PLUG/mPLUG-Owl/blob/main/mPLUG-Owl3/requirements.txt
INSTALL_REQ = True
INTERLEAVE = True
INSTALL_REQ_TXT = 'https://github.com/X-PLUG/mPLUG-Owl/blob/main/mPLUG-Owl3/requirements.txt'
def __init__(self, model_path=None, **kwargs):
assert model_path is not None
self.tokenizer = AutoTokenizer.from_pretrained(
model_path
)
self.model = AutoModel.from_pretrained(
model_path,
attn_implementation='sdpa',
torch_dtype=torch.half,
trust_remote_code=True
)
self.model.eval().cuda()
self.processor = self.model.init_processor(self.tokenizer)
self.logger = get_logger('mPLUG_Owl3')
if self.INSTALL_REQ:
self.logger.info(
f'Please remember to meet the requirements first\n'
f'Here: {self.INSTALL_REQ_TXT}'
)
def use_custom_prompt(self, dataset):
assert dataset is not None
if listinstr(['MMMU'], dataset):
return False
if listinstr(['MVBench', 'MMVet'], dataset):
return True
return False
def save_video_into_images(self, line, num_frames=16, dataset_class=None):
video_url = {
'video': osp.join(line['prefix'], line['video']),
'num_frames': num_frames,
'bound': line.get('bound', None)
}
if osp.isdir(video_url['video']):
frame_paths = []
max_frame = len(os.listdir(video_url['video']))
fps = 3
if video_url['bound']:
start, end = line['start'], line['end']
else:
start, end = -100000, 100000
start_idx = max(1, round(start * fps))
end_idx = min(round(end * fps), max_frame)
seg_size = float(end_idx - start_idx) / num_frames
frame_indices = np.array([
int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
for idx in range(num_frames)
])
for frame_index in frame_indices:
img = os.path.join(video_url['video'], f'{frame_index:05d}.jpg')
frame_paths.append(img)
return frame_paths
if isinstance(video_url, dict):
if video_url['bound']:
start_time = line['start']
end_time = line['end']
else:
start_time = None
end_time = None
num_frames = video_url.get('num_frames', num_frames)
video_url = video_url['video']
else:
start_time = None
end_time = None
video_url = str(video_url)
if not osp.exists(video_url): # for MVBench_MP4
video_url = osp.join(dataset_class.data_root, video_url)
video, timestamp = read_frames_decord(
video_url, num_frames=num_frames, sample='middle', start_time=start_time, end_time=end_time
)
to_pil = transforms.ToPILImage()
frames = [to_pil(video[ti]) for ti in range(video.shape[0])]
lmu_root = LMUDataRoot()
frame_root = osp.join(lmu_root, 'images', dataset_class.dataset_name, 'mplug_owl3')
frame_root = osp.join(frame_root, video_url.split('/')[-1].split('.')[0])
os.makedirs(frame_root, exist_ok=True)
frame_tmpl = 'frame-{}-of-{}.jpg'
frame_paths = [osp.join(frame_root, frame_tmpl.format(i, num_frames)) for i in range(1, num_frames + 1)]
for im, pth in zip(frames, frame_paths):
if not osp.exists(pth):
im.save(pth)
return frame_paths
# Currently the same as in mPLUG_Owl2
def build_prompt(self, line, dataset=None, num_frames=16, video_llm=False):
if not isinstance(dataset, str):
dataset_class = dataset
dataset = dataset_class.dataset_name
assert dataset is None or isinstance(dataset, str)
assert self.use_custom_prompt(dataset)
if dataset_class.MODALITY == 'VIDEO':
if listinstr(['MVBench'], dataset):
tgt_path = self.save_video_into_images(line, num_frames, dataset_class)
else:
tgt_path = dataset_class.save_video_into_images(line, num_frames)
if not isinstance(line['candidates'], list):
line['candidates'] = eval(line['candidates'])
for idx, c in enumerate(line['candidates']):
line[chr(ord('A') + idx)] = c
else:
tgt_path = self.dump_image(line, dataset)
question = line['question']
if dataset == 'MMVet':
prompt = question + '\nAnswer the question directly. '
elif listinstr(['MCQ', 'Video-MCQ'], DATASET_TYPE(dataset)):
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
options_prompt = ''
for key, item in options.items():
options_prompt += f'{key}. {item}\n'
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
prompt = f'Hint: {hint}\n' if hint is not None else ''
prompt += f'{question}\n'
prompt += (
f'{options_prompt}\nAnswer with the option’s letter from the given choices directly. '
if len(options) else 'Answer the question directly. '
)
else:
raise NotImplementedError
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=s) for s in tgt_path])
return message
def preproc_image(self, fname, dataset=None):
from PIL import Image
image = Image.open(fname).convert('RGB')
# resize to max_size
max_size = 448 * 16
if max(image.size) > max_size and not listinstr(['MVBench'], dataset):
w, h = image.size
if w > h:
new_w = max_size
new_h = int(h * max_size / w)
else:
new_h = max_size
new_w = int(w * max_size / h)
image = image.resize((new_w, new_h), resample=Image.BICUBIC)
return image
def generate_inner(self, message, dataset=None):
num_images = len([x for x in message if x['type'] == 'image'])
assert num_images >= 0
images = []
prompt_full = ''
for msg in message:
if msg['type'] == 'image':
images.append(msg['value'])
prompt_full += '<|image|>'
elif msg['type'] == 'text':
prompt_full += msg['value']
needed_messages = [
{'role': 'user', 'content': prompt_full},
{'role': 'assistant', 'content': ''}
]
images = [self.preproc_image(fname, dataset) for fname in images]
inputs = self.processor(needed_messages, images=images, videos=None, cut_enable=False)
inputs.to('cuda')
if listinstr(['MVBench'], dataset):
inputs.update({
'tokenizer': self.tokenizer,
'max_new_tokens': 100,
'decode_text': True,
'do_sample': True,
'top_k': 1,
})
else:
inputs.update({
'tokenizer': self.tokenizer,
'max_new_tokens': 1024,
'decode_text': True,
})
g = self.model.generate(**inputs)
return g[0]
import torch
from PIL import Image
import re
from transformers import AutoModel, AutoProcessor
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE
class OmChat(BaseModel):
INSTALL_REQ = True
INTERLEAVE = True
def __init__(self, model_path='omlab/omchat-v2.0-13B-single-beta_hf', **kwargs):
# It is recommended to install `transformers==4.44.0`
assert model_path is not None
self.model_path = model_path
print(f'load from {self.model_path}')
model = AutoModel.from_pretrained(self.model_path, trust_remote_code=True, torch_dtype=torch.float16)
self.model = model.cuda().eval()
self.kwargs = kwargs
self.processor = AutoProcessor.from_pretrained(self.model_path, trust_remote_code=True)
torch.cuda.empty_cache()
# system prompt
self.default_system_prompt = 'You are a helpful assistant. Focus on accuracy and reliability in your response.'
self.new1_system_prompt = 'You are a helpful assistant.'
self.new2_system_prompt = (
'Read the following question carefully, '
'solve it step by step, '
'and then output the final answer in the format of '
"'Answer: single number or single word or phrase'.\n\n"
)
# suffix_prompt for MCQ
self.mcq_suffix_prompt_en = 'Please select the correct answer from the options above. \n'
self.mcq_suffix_prompt_cn = '请直接回答选项字母。\n'
# suffix_prompt for Y/N
self.yorn_suffix_prompt = ' Please answer yes or no. Answer the question using a single word or phrase.'
def use_custom_prompt(self, dataset):
assert dataset is not None
if DATASET_TYPE(dataset) in ('MCQ', 'Y/N'):
return True
return False
def build_prompt(self, line, dataset=None):
assert dataset is None or isinstance(dataset, str)
assert self.use_custom_prompt(dataset)
tgt_path = self.dump_image(line, dataset)
if isinstance(line, int):
line = self.data.iloc[line]
question = line['question']
if DATASET_TYPE(dataset) == 'MCQ':
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
options_prompt = 'Options:\n'
for key, item in options.items():
options_prompt += f'{key}. {item}\n'
prompt = ''
if hint is not None:
prompt += f'Hint: {hint}\n'
prompt += f'Question: {question}\n'
if len(options):
prompt += options_prompt
if not dataset.startswith('MMMU_'):
if not cn_string(prompt):
prompt += self.mcq_suffix_prompt_en
else:
prompt += self.mcq_suffix_prompt_cn
elif DATASET_TYPE(dataset) == 'Y/N':
prompt = question + self.yorn_suffix_prompt
print(DATASET_TYPE(dataset))
message = []
if isinstance(tgt_path, list):
message.extend([dict(type='image', value=p) for p in tgt_path])
else:
message = [dict(type='image', value=tgt_path)]
message.append(dict(type='text', value=prompt))
return message
def message_to_promptimg(self, message, dataset=None):
if dataset is None or listinstr(['MMMU'], dataset):
prompt = '\n'.join([
re.sub(r'<image\s*\d+>', '<image>', x['value'])
for x in message
if x['type'] == 'text'
])
image = [x['value'] for x in message if x['type'] == 'image']
else:
prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
image = [x['value'] for x in message if x['type'] == 'image']
return prompt, image
def generate_inner(self, message, dataset=None):
def replace_last_dot(input_string):
if input_string.endswith('.'):
return input_string[:-1]
else:
return input_string
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
image = [Image.open(img_path).convert('RGB') for img_path in image_path]
default_kwargs = dict(
max_new_tokens=1024,
do_sample=False,
temperature=0.0,
top_p=1)
if dataset is not None and listinstr(['MathVista_MINI'], dataset):
system_prompt = self.new2_system_prompt
elif dataset is not None and listinstr(['MMMU_DEV_VAL', 'MMStar'], dataset):
system_prompt = self.new1_system_prompt
else:
system_prompt = self.default_system_prompt
inputs = self.processor(text=prompt, system_prompt=system_prompt, images=image, return_tensors='pt').to('cuda')
default_kwargs.update(self.kwargs)
with torch.inference_mode():
output_ids = self.model.generate(
**inputs,
eos_token_id=self.model.generation_config.eos_token_id,
**default_kwargs
)
res = self.processor.tokenizer.decode(output_ids[0, inputs.input_ids.shape[1]:]).strip()
if '<|im_end|>' in res:
res = res.split('<|im_end|>')[0].strip()
if dataset != 'MMMU_DEV_VAL':
if res.startswith('Answer: '):
res = res[len('Answer: '):]
match = re.search(r'\nThe answer is:(.+)', res)
if match:
res = match.group(1).strip()
# for OCRBench
doc_match = re.search(r'<doc>(.*?)<\/doc>', res)
if doc_match:
res = doc_match.group(1).strip()
res = replace_last_dot(res)
return res
import torch
from PIL import Image
from transformers import AutoTokenizer
from .base import BaseModel
from ..smp import *
from ..dataset import DATASET_TYPE
DEFAULT_IMAGE_TOKEN = '<image>'
DEFAULT_IMAGE_PATCH_TOKEN = '<im_patch>'
DEFAULT_IM_START_TOKEN = '<im_start>'
DEFAULT_IM_END_TOKEN = '<im_end>'
def init_omni_lmm(model_path):
from omnilmm.model.omnilmm import OmniLMMForCausalLM
from omnilmm.utils import disable_torch_init
from omnilmm.model.utils import build_transform
torch.backends.cuda.matmul.allow_tf32 = True
disable_torch_init()
tokenizer = AutoTokenizer.from_pretrained(model_path, model_max_length=2048)
model = OmniLMMForCausalLM.from_pretrained(
model_path, tune_clip=True, torch_dtype=torch.bfloat16, device_map='cpu'
)
model = model.to(device='cuda', dtype=torch.bfloat16)
image_processor = build_transform(
is_train=False, input_size=model.model.config.image_size, std_mode='OPENAI_CLIP'
)
mm_use_im_start_end = getattr(model.config, 'mm_use_im_start_end', False)
assert mm_use_im_start_end
tokenizer.add_tokens(
[DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN],
special_tokens=True,
)
vision_config = model.model.vision_config
vision_config.im_patch_token = tokenizer.convert_tokens_to_ids(
[DEFAULT_IMAGE_PATCH_TOKEN]
)[0]
vision_config.use_im_start_end = mm_use_im_start_end
vision_config.im_start_token, vision_config.im_end_token = (
tokenizer.convert_tokens_to_ids([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN])
)
image_token_len = model.model.config.num_query
return model, image_processor, image_token_len, tokenizer
def expand_question_into_multimodal(
question_text, image_token_len, im_st_token, im_ed_token, im_patch_token
):
if '<image>' in question_text[0]['content']:
question_text[0]['content'] = question_text[0]['content'].replace(
'<image>', im_st_token + im_patch_token * image_token_len + im_ed_token
)
else:
question_text[0]['content'] = (
im_st_token
+ im_patch_token * image_token_len
+ im_ed_token
+ '\n'
+ question_text[0]['content']
)
return question_text
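# Example (illustrative): with image_token_len=3 and the tokens defined above, a
# first-turn content of 'What is this? <image>' becomes
# 'What is this? <im_start><im_patch><im_patch><im_patch><im_end>'.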
def wrap_question_for_omni_lmm(question, image_token_len, tokenizer):
from omnilmm.train.train_utils import omni_preprocess
question = expand_question_into_multimodal(
question,
image_token_len,
DEFAULT_IM_START_TOKEN,
DEFAULT_IM_END_TOKEN,
DEFAULT_IMAGE_PATCH_TOKEN,
)
conversation = question
data_dict = omni_preprocess(
sources=[conversation], tokenizer=tokenizer, generation=True
)
data_dict = dict(input_ids=data_dict['input_ids'][0], labels=data_dict['labels'][0])
return data_dict
class OmniLMM12B(BaseModel):
INSTALL_REQ = True
INTERLEAVE = False
def __init__(self, model_path, root, **kwargs) -> None:
sys.path.append(root)
model, img_processor, image_token_len, tokenizer = init_omni_lmm(model_path)
self.model = model
self.image_token_len = image_token_len
self.image_transform = img_processor
self.tokenizer = tokenizer
self.model.eval()
default_kwargs = dict(
max_new_tokens=512,
do_sample=False,
output_scores=True,
return_dict_in_generate=True,
repetition_penalty=1.1,
)
default_kwargs.update(kwargs)
self.kwargs = default_kwargs
torch.cuda.empty_cache()
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
        try:
            image = Image.open(image_path).convert('RGB')
        except Exception:
            logger = get_logger('OmniLMM Inference')
            logger.error('Image Decode Error')
            return 'Image Decode Error'
msgs = [dict(role='user', content=prompt)]
input_ids = wrap_question_for_omni_lmm(
msgs, self.image_token_len, self.tokenizer
)['input_ids']
input_ids = torch.as_tensor(input_ids)
image = self.image_transform(image)
with torch.inference_mode():
output = self.model.generate_vllm(
input_ids=input_ids.unsqueeze(0).cuda(),
images=image.unsqueeze(0).half().cuda(),
**self.kwargs,
)
response = self.tokenizer.decode(
output.sequences[0], skip_special_tokens=True
)
response = response.strip()
return response
def use_custom_prompt(self, dataset):
assert dataset is not None
if DATASET_TYPE(dataset) == 'MCQ':
return True
return False
def build_prompt(self, line, dataset=None):
assert dataset is None or isinstance(dataset, str)
assert self.use_custom_prompt(dataset)
tgt_path = self.dump_image(line, dataset)
question = line['question']
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
options_prompt = 'Options:\n'
for key, item in options.items():
options_prompt += f'{key}. {item}\n'
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
prompt = ''
if hint is not None:
prompt += f'Hint: {hint}\n'
prompt += f'{question}\n'
if len(options):
prompt += options_prompt
prompt = (
"""
Study the image carefully and pick the option associated with the correct answer.
Focus solely on selecting the option and avoid including any other content.\n
"""
+ prompt
)
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=s) for s in tgt_path])
return message
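# Minimal usage sketch for OmniLMM12B (the model id and paths are illustrative
# assumptions; `root` must point to a local clone of the OmniLMM code base so that
# the `omnilmm` package becomes importable):
#
#     model = OmniLMM12B(model_path='openbmb/OmniLMM-12B', root='/path/to/OmniLMM')
#     message = [dict(type='image', value='demo.jpg'),
#                dict(type='text', value='What is shown in this image?')]
#     print(model.generate_inner(message))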
import sys
import torch
from PIL import Image
import os.path as osp
import warnings
from .base import BaseModel
from ..smp import splitlen, get_cache_path
from huggingface_hub import snapshot_download
class OpenFlamingo(BaseModel):
INSTALL_REQ = True
INTERLEAVE = True
def __init__(self,
name,
mpt_pth=None,
ckpt_pth=None,
**kwargs):
if mpt_pth is None:
warnings.warn(
'Please set `mpt_pth` to the directory of MPT-7B, which is cloned from here: '
'https://huggingface.co/mosaicml/mpt-7b. '
)
sys.exit(-1)
if ckpt_pth is None:
warnings.warn(
'Please set `ckpt_pth` to the openflamingo ckpt, which is the `checkpoint.pt` file downloaded '
'from: https://huggingface.co/openflamingo/OpenFlamingo-9B-vitl-mpt7b/tree/main. '
)
sys.exit(-1)
else:
if osp.exists(ckpt_pth):
if ckpt_pth.endswith('checkpoint.pt'):
pass
elif osp.isdir(ckpt_pth):
ckpt_pth = osp.join(ckpt_pth, 'checkpoint.pt')
if not osp.exists(ckpt_pth):
sys.exit(-1)
elif splitlen(ckpt_pth, '/') == 2:
cache_path = get_cache_path(ckpt_pth)
if cache_path is None:
snapshot_download(ckpt_pth)
cache_path = get_cache_path(ckpt_pth)
if cache_path is None:
sys.exit(-1)
else:
ckpt_pth = osp.join(cache_path, 'checkpoint.pt')
self.name = name
assert name in ['v2']
self.mpt_pth = mpt_pth
        try:
            from open_flamingo import create_model_and_transforms
        except ImportError as err:
            raise ImportError('Please first install open_flamingo to use OpenFlamingo. ') from err
model, image_processor, tokenizer = create_model_and_transforms(
clip_vision_encoder_path='ViT-L-14',
clip_vision_encoder_pretrained='openai',
lang_encoder_path=mpt_pth,
tokenizer_path=mpt_pth,
cross_attn_every_n_layers=4)
ckpt = torch.load(ckpt_pth)
model.load_state_dict(ckpt, strict=False)
torch.cuda.empty_cache()
self.model = model.eval().cuda()
self.tokenizer = tokenizer
self.tokenizer.padding_side = 'left'
self.image_proc = image_processor
kwargs_default = dict(max_new_tokens=512, num_beams=3)
kwargs_default.update(kwargs)
self.kwargs = kwargs_default
warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
def generate_inner(self, message, dataset=None):
vision_x = []
prompt = ''
for msg in message:
if msg['type'] == 'image':
img = Image.open(msg['value'])
vision_x.append(self.image_proc(img).unsqueeze(0))
prompt += '<image>'
elif msg['type'] == 'text':
prompt += msg['value']
prompt += 'Answer: '
vision_x = torch.cat(vision_x, dim=0) if len(vision_x) > 1 else vision_x[0]
vision_x = vision_x.unsqueeze(1).unsqueeze(0)
lang_x = self.tokenizer([prompt], return_tensors='pt')
generated_text = self.model.generate(
vision_x=vision_x.cuda(),
lang_x=lang_x['input_ids'].cuda(),
attention_mask=lang_x['attention_mask'].cuda(),
**self.kwargs)
generated_text = self.tokenizer.decode(generated_text[0])
text = generated_text[len(prompt):].split('<|endofchunk|>')[0]
return text
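# Minimal usage sketch for OpenFlamingo (local paths are hypothetical):
#
#     model = OpenFlamingo('v2',
#                          mpt_pth='/path/to/mpt-7b',
#                          ckpt_pth='/path/to/OpenFlamingo-9B-vitl-mpt7b/checkpoint.pt')
#     message = [dict(type='image', value='demo.jpg'),
#                dict(type='text', value='What is shown in this image?')]
#     print(model.generate_inner(message))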
import torch
from transformers import AutoModelForCausalLM
from .base import BaseModel
from ..dataset import DATASET_TYPE
from ..smp import *
class Ovis(BaseModel):
INSTALL_REQ = False
INTERLEAVE = True
def __init__(self, model_path='AIDC-AI/Ovis1.5-Llama3-8B', **kwargs):
assert model_path is not None
        # Recommended: install `transformers==4.43.2` and `torch==2.1.2`.
self.model_path = model_path
self.device = torch.cuda.current_device()
self.dtype = torch.bfloat16
self.model = AutoModelForCausalLM.from_pretrained(
self.model_path,
torch_dtype=self.dtype,
multimodal_max_length=8192,
trust_remote_code=True
)
self.model = self.model.eval().to(device=self.device)
self.eos_token_id = self.model.generation_config.eos_token_id
self.text_tokenizer = self.model.get_text_tokenizer()
self.pad_token_id = self.text_tokenizer.pad_token_id
self.visual_tokenizer = self.model.get_visual_tokenizer()
self.conversation_formatter = self.model.get_conversation_formatter()
self.image_placeholder = '<image>'
self.gen_kwargs = dict(
max_new_tokens=1024,
do_sample=False,
top_p=None,
top_k=None,
temperature=None,
repetition_penalty=None,
eos_token_id=self.eos_token_id,
pad_token_id=self.pad_token_id,
use_cache=True
)
def use_custom_prompt(self, dataset):
if DATASET_TYPE(dataset) == 'Y/N' or DATASET_TYPE(dataset) == 'MCQ':
return True
return False
def build_prompt(self, line, dataset=None):
assert self.use_custom_prompt(dataset)
assert isinstance(dataset, str)
tgt_path = self.dump_image(line, dataset)
if DATASET_TYPE(dataset) == 'Y/N':
prompt = self.built_yorn_prompt(line, dataset)
elif DATASET_TYPE(dataset) == 'MCQ':
prompt = self.build_multi_choice_prompt(line, dataset)
else:
raise RuntimeError(f'Invalid dataset type: {DATASET_TYPE(dataset)}')
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=s) for s in tgt_path])
        # interleaved dataset: split MMMU-style multi-image messages
if dataset.startswith('MMMU_'):
from .. import MMMUDataset
message = MMMUDataset.split_MMMU(message)
return message
def built_yorn_prompt(self, line, dataset=None):
prompt = line['question']
if listinstr(['HallusionBench'], dataset):
prompt += ' Please answer yes or no.'
prompt += '\n请用单个词或短语回答问题。' if cn_string(
prompt) else '\nAnswer the question using a single word or phrase.'
return prompt
def build_multi_choice_prompt(self, line, dataset=None):
question = line['question']
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
if hint is not None:
question = hint + '\n' + question
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
for key, item in options.items():
question += f'\n{key}. {item}'
prompt = question
if len(options):
prompt += '\n请直接回答选项字母。' if cn_string(
prompt) else "\nAnswer with the option's letter from the given choices directly."
else:
prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.'
return prompt
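    # Example of a prompt produced by build_multi_choice_prompt (hypothetical line contents):
    #   What is the capital of France?
    #   A. Paris
    #   B. Rome
    #   Answer with the option's letter from the given choices directly.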
def generate_inner(self, message, dataset=None):
prompt, input_ids, attention_mask, pixel_values = self.prepare_inputs(message)
output_ids = self.model.generate(
input_ids,
pixel_values=pixel_values,
attention_mask=attention_mask,
**self.gen_kwargs
)
response = self.text_tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()
return response
def prepare_inputs(self, message):
# build query
images = [x['value'] for x in message if x['type'] == 'image']
texts = [x['value'] for x in message if x['type'] == 'text']
if len(images) == 0:
query = '\n'.join(texts)
elif len(images) == 1 and len(texts) == 1:
query = self.image_placeholder + '\n' + texts[0]
        else:  # interleaved sample
chunks = [x['value'] if x['type'] == 'text' else self.image_placeholder for x in message]
query = '\n'.join(chunks)
# format conversation
prompt, input_ids = self.conversation_formatter.format_query(query)
attention_mask = torch.ne(input_ids, self.text_tokenizer.pad_token_id)
input_ids = input_ids.unsqueeze(0).to(device=self.device)
attention_mask = attention_mask.unsqueeze(0).to(device=self.device)
# preprocess images
if len(images) == 0:
pixel_values = [None]
else:
preprocessed_images = [self.visual_tokenizer.preprocess_image(Image.open(image)) for image in images]
pixel_values = [torch.cat(preprocessed_images, dim=0).to(device=self.device, dtype=self.dtype)]
return prompt, input_ids, attention_mask, pixel_values
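# Minimal usage sketch for Ovis (default checkpoint AIDC-AI/Ovis1.5-Llama3-8B; the
# image file name is hypothetical):
#
#     model = Ovis()
#     message = [dict(type='image', value='demo.jpg'),
#                dict(type='text', value='Describe the image.')]
#     print(model.generate_inner(message))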
class Ovis1_6(BaseModel):
INSTALL_REQ = False
INTERLEAVE = True
def __init__(self, model_path='AIDC-AI/Ovis1.6-Gemma2-9B', **kwargs):
assert model_path is not None
        # Recommended: python==3.10, transformers==4.44.2, torch==2.2.0, numpy==1.24.3.
self.model_path = model_path
self.device = torch.cuda.current_device()
self.dtype = torch.bfloat16
self.model = AutoModelForCausalLM.from_pretrained(
self.model_path,
torch_dtype=self.dtype,
multimodal_max_length=8192,
trust_remote_code=True
)
self.model = self.model.eval().to(device=self.device)
self.eos_token_id = self.model.generation_config.eos_token_id
self.text_tokenizer = self.model.get_text_tokenizer()
self.pad_token_id = self.text_tokenizer.pad_token_id
self.visual_tokenizer = self.model.get_visual_tokenizer()
self.max_partition = 9
self.image_placeholder = '<image>'
self.gen_kwargs = dict(
max_new_tokens=1024,
do_sample=False,
top_p=None,
top_k=None,
temperature=None,
repetition_penalty=None,
eos_token_id=self.eos_token_id,
pad_token_id=self.pad_token_id,
use_cache=True
)
def use_custom_prompt(self, dataset):
if DATASET_TYPE(dataset) == 'Y/N' or DATASET_TYPE(dataset) == 'MCQ':
return True
return False
def built_yorn_prompt(self, line, dataset=None):
prompt = line['question'] + '\nAnswer the question using a single word or phrase.'
return prompt
def build_multi_choice_prompt(self, line, dataset=None):
question = line['question']
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
if hint is not None:
question = hint + '\n' + question
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
for key, item in options.items():
question += f'\n{key}. {item}'
prompt = question
if len(options):
prompt += "\nAnswer with the option's letter from the given choices directly."
return prompt
def build_prompt(self, line, dataset=None):
assert self.use_custom_prompt(dataset)
assert isinstance(dataset, str)
tgt_path = self.dump_image(line, dataset)
if DATASET_TYPE(dataset) == 'Y/N':
prompt = self.built_yorn_prompt(line, dataset)
elif DATASET_TYPE(dataset) == 'MCQ':
prompt = self.build_multi_choice_prompt(line, dataset)
else:
raise RuntimeError(f'Invalid dataset type: {DATASET_TYPE(dataset)}')
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=s) for s in tgt_path])
        # interleaved dataset: split MMMU-style multi-image messages
if dataset.startswith('MMMU_'):
from .. import MMMUDataset
message = MMMUDataset.split_MMMU(message)
return message
def generate_inner(self, message, dataset=None):
prompt, input_ids, attention_mask, pixel_values = self.prepare_inputs(message)
output_ids = self.model.generate(
input_ids,
pixel_values=pixel_values,
attention_mask=attention_mask,
**self.gen_kwargs
)
response = self.text_tokenizer.decode(output_ids[0], skip_special_tokens=True)
return response
def prepare_inputs(self, message):
# build query
images = [x['value'] for x in message if x['type'] == 'image']
texts = [x['value'] for x in message if x['type'] == 'text']
if len(images) == 0:
query = '\n'.join(texts)
elif len(images) == 1 and len(texts) == 1:
query = self.image_placeholder + '\n' + texts[0]
else: # interleaved sample
chunks = [x['value'] if x['type'] == 'text' else self.image_placeholder for x in message]
query = '\n'.join(chunks)
# preprocess inputs
prompt, input_ids, pixel_values = self.model.preprocess_inputs(
query, [Image.open(image) for image in images], max_partition=self.max_partition
)
# move to self.device
attention_mask = torch.ne(input_ids, self.text_tokenizer.pad_token_id)
input_ids = input_ids.unsqueeze(0).to(device=self.device)
attention_mask = attention_mask.unsqueeze(0).to(device=self.device)
pixel_values = [
pixel_values.to(device=self.device, dtype=self.dtype) if pixel_values is not None else None
]
return prompt, input_ids, attention_mask, pixel_values
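# Minimal usage sketch for Ovis1_6 with an interleaved multi-image message (file
# names are hypothetical):
#
#     model = Ovis1_6()  # loads AIDC-AI/Ovis1.6-Gemma2-9B by default
#     message = [dict(type='text', value='Compare these two charts:'),
#                dict(type='image', value='chart_a.png'),
#                dict(type='image', value='chart_b.png')]
#     print(model.generate_inner(message))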
from PIL import Image
import torch
from .base import BaseModel
from ..smp import *
class PaliGemma(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
def __init__(self, model_path='google/paligemma-3b-mix-448', **kwargs):
        try:
            from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
        except ImportError:
            warnings.warn('Please install the latest version of transformers to use PaliGemma. ')
            sys.exit(-1)
model = PaliGemmaForConditionalGeneration.from_pretrained(
model_path,
torch_dtype=torch.bfloat16,
device_map='cpu',
revision='bfloat16',
).eval()
self.model = model.cuda()
self.processor = AutoProcessor.from_pretrained(model_path)
self.kwargs = kwargs
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
image = Image.open(image_path).convert('RGB')
model_inputs = self.processor(
text=prompt, images=image, return_tensors='pt'
).to('cuda')
input_len = model_inputs['input_ids'].shape[-1]
with torch.inference_mode():
generation = self.model.generate(
**model_inputs, max_new_tokens=512, do_sample=False
)
generation = generation[0][input_len:]
res = self.processor.decode(generation, skip_special_tokens=True)
return res
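# Minimal usage sketch for PaliGemma (default google/paligemma-3b-mix-448 checkpoint;
# the task-style prompt 'caption en' and the image file are illustrative assumptions):
#
#     model = PaliGemma()
#     message = [dict(type='image', value='demo.jpg'),
#                dict(type='text', value='caption en')]
#     print(model.generate_inner(message))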
import sys
import torch
import os.path as osp
import warnings
from .base import BaseModel
class PandaGPT(BaseModel):
INSTALL_REQ = True
INTERLEAVE = False
def __init__(self, name, root=None, **kwargs):
if root is None:
            warnings.warn(
                'Please set `root` to the PandaGPT code directory, which is cloned from here: '
                'https://github.com/yxuansu/PandaGPT. '
            )
sys.exit(-1)
assert name == 'PandaGPT_13B'
self.name = name
sys.path.append(osp.join(root, 'code'))
        try:
            from model.openllama import OpenLLAMAPEFTModel
        except ImportError as err:
            raise ImportError(
                'Please first install PandaGPT and set the root path to use PandaGPT, '
                'which is cloned from here: https://github.com/yxuansu/PandaGPT. '
            ) from err
self.args = {
'model': 'openllama_peft',
'imagebind_ckpt_path': osp.join(root, 'pretrained_ckpt/imagebind_ckpt'),
'vicuna_ckpt_path': osp.join(root, 'pretrained_ckpt/vicuna_ckpt/13b_v0'),
'delta_ckpt_path': osp.join(root, 'pretrained_ckpt/pandagpt_ckpt/13b/pytorch_model.pt'),
'stage': 2,
'max_tgt_len': 512,
'lora_r': 32,
'lora_alpha': 32,
'lora_dropout': 0.1,
}
model = OpenLLAMAPEFTModel(**self.args)
delta_ckpt = torch.load(self.args['delta_ckpt_path'], map_location=torch.device('cpu'))
model.load_state_dict(delta_ckpt, strict=False)
torch.cuda.empty_cache()
self.model = model.eval().half().cuda()
kwargs_default = {'top_p': 0.9, 'do_sample': False, 'max_tgt_len': 128, 'temperature': 0.001}
kwargs_default.update(kwargs)
self.kwargs = kwargs_default
warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
def generate_inner(self, message, dataset=None):
prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
struct = {
'prompt': prompt,
'image_paths': [image_path],
'audio_paths': [],
'video_paths': [],
'thermal_paths': [],
'modality_embeds': []
}
struct.update(self.kwargs)
resp = self.model.generate(struct)
return resp
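# Minimal usage sketch for PandaGPT (hypothetical root; `root` must contain the
# `code/` directory and the pretrained_ckpt folders referenced in self.args):
#
#     model = PandaGPT('PandaGPT_13B', root='/path/to/PandaGPT')
#     message = [dict(type='image', value='demo.jpg'),
#                dict(type='text', value='What is in the image?')]
#     print(model.generate_inner(message))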
import os
import torch
from PIL import Image
from abc import abstractproperty
from .base import BaseModel
from ..dataset import DATASET_TYPE
from ..smp import *
class Parrot(BaseModel):
INSTALL_REQ = False
INTERLEAVE = False
def __init__(self, model_path='AIDC-AI/Parrot-7B', **kwargs):
try:
from parrot.model.parrot_arch import ParrotMetaForCausalLM
from parrot.utils.constants import DEFAULT_IMAGE_TOKEN, BEGIN_LINE, END_LINE
from parrot.model.conversation_formatter import ConversationFormatter
from parrot.utils.mm_utils import process_images
        except ImportError:
            warnings.warn('Please install Parrot before using this wrapper: '
                          'https://github.com/AIDC-AI/Parrot '
                          '(run `pip install -e . --no-deps` in the Parrot directory). '
                          'transformers==4.39.0 is recommended. ')
sys.exit(-1)
self.process_images = process_images
self.ConversationFormatter = ConversationFormatter
self.DEFAULT_IMAGE_TOKEN = DEFAULT_IMAGE_TOKEN
self.BEGIN_LINE = BEGIN_LINE
self.END_LINE = END_LINE
try:
model_name = 'parrot_qwen2'
model, tokenizer, conversation_formatter = ParrotMetaForCausalLM.build(
model_name, model_path, mm_vision_tower='openai/clip-vit-large-patch14-336'
)
self.model = model.cuda()
self.vision_tower = self.model.get_vision_tower()
self.tokenizer = tokenizer
self.conversation_formatter = conversation_formatter
self.image_processor = self.model.get_vision_tower().image_processor
except Exception as e:
warnings.warn(f'Error when loading Parrot model:\n{e}')
exit(-1)
self.kwargs = dict(
do_sample=False,
num_beams=1,
max_new_tokens=512,
repetition_penalty=None,
use_cache=True,
eos_token_id=self.tokenizer.eos_token_id,
pad_token_id=self.tokenizer.pad_token_id
)
if int(os.environ.get('LOCAL_RANK', '0')) == 0:
print(f'Following kwargs {self.kwargs} will be used as generation config.')
self.count = 0
def use_custom_prompt(self, dataset):
if DATASET_TYPE(dataset) == 'Y/N' or DATASET_TYPE(dataset) == 'MCQ':
return True
return False
def build_prompt(self, line, dataset=None):
assert self.use_custom_prompt(dataset)
assert isinstance(dataset, str)
tgt_path = self.dump_image(line, dataset)
if DATASET_TYPE(dataset) == 'Y/N':
prompt = self.built_yorn_prompt(line, dataset)
elif DATASET_TYPE(dataset) == 'MCQ':
prompt = self.build_multi_choice_prompt(line, dataset)
else:
raise ValueError(f'Invalid dataset type: {DATASET_TYPE(dataset)}')
message = [dict(type='text', value=prompt)]
message.extend([dict(type='image', value=p) for p in tgt_path])
return message
def built_yorn_prompt(self, line, dataset=None):
prompt = line['question']
        previous_suffixes = [' Please answer yes or no.', ' Yes or No', ' Answer in one sentence.']
        for previous_suffix in previous_suffixes:
if prompt.endswith(previous_suffix):
prompt = prompt[:-len(previous_suffix)]
break
prompt += '\n请直接回答Yes或No。请用单个词或短语回答问题。' if cn_string(
prompt) else '\nPlease strictly answer Yes or No. Answer the question using a single word or phrase.'
return prompt
def build_multi_choice_prompt(self, line, dataset=None):
question = line['question']
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
if hint is not None:
question = hint + '\n' + question
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
for key, item in options.items():
question += f'\n{key}. {item}'
prompt = question
if len(options):
default_prompt = "\nAnswer with the option's letter from the given choices directly."
if dataset[-3:] == '_cn' or cn_string(prompt):
default_prompt = '\n请直接用给定选项中的选项字母回答。'
elif dataset[-3:] == '_pt':
default_prompt = '\nResponda diretamente com a letra da opção das escolhas dadas.'
elif dataset[-3:] == '_ar':
default_prompt = '\nأجب مباشرةً بحرف الخيار من الاختيارات المعطاة.'
elif dataset[-3:] == '_ru':
default_prompt = '\nОтветьте буквой варианта из предложенных вариантов напрямую.'
elif dataset[-3:] == '_tr':
default_prompt = '\nVerilen seçeneklerden doğrudan seçeneğin harfi ile cevap verin.'
prompt += default_prompt
# prompt += (
# '\n请直接回答选项字母。' if cn_string(prompt) else
# "\nAnswer with the option's letter from the given choices directly."
# )
else:
prompt += '\n请用单个词或短语回答问题。' if cn_string(
prompt) else '\nAnswer the question using a single word or phrase.'
return prompt
def process_answer_prefix(self, answer, prefixes):
for prefix in prefixes:
if prefix in answer.lower():
return answer[answer.lower().find(prefix) + len(prefix):]
return answer
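    # Example (illustrative): process_answer_prefix('The answer is B.', ['answer is', 'answer:'])
    # returns ' B.', i.e. everything after the first matching prefix (matched case-insensitively).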
def generate_inner(self, message, dataset=None):
query, image_paths = self.prepare_inputs(message)
images_list = [Image.open(image_path).convert('RGB') for image_path in image_paths]
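        # NOTE: abstractproperty() is used below only as a throwaway attribute holder so
        # that process_images() receives an object exposing `image_aspect_ratio`; it is
        # not used as an actual abstract property.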
args = abstractproperty()
args.image_aspect_ratio = 'pad'
image_tensors = self.process_images(images_list, self.image_processor, args).cuda()
prompt, input_ids = self.conversation_formatter.format_query(query)
input_ids = input_ids.unsqueeze(0).cuda()
with torch.inference_mode():
kwargs = dict(
images=image_tensors,
)
kwargs.update(self.kwargs)
output_ids = self.model.generate(input_ids, **kwargs)
input_token_len = input_ids.shape[1]
n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()
if n_diff_input_output > 0:
print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
response = self.tokenizer.batch_decode(output_ids[:, input_token_len:],
skip_special_tokens=True)[0].strip(string.whitespace)
answer = response
if query.endswith("Answer with the option's letter from the given choices directly.") or query.endswith(
'请直接回答选项字母。'):
qtype = 'multiple-choice'
while True:
answer = answer.strip(string.punctuation + string.whitespace)
if len(answer) > 1:
if answer[0] in string.ascii_uppercase and answer[1] in string.whitespace + string.punctuation:
answer = answer[0]
break
elif answer[-1] in string.ascii_uppercase and answer[-2] in string.whitespace + string.punctuation:
answer = answer[-1]
break
elif listinstr(['answer is', 'answer:'], answer.lower()):
answer = self.process_answer_prefix(answer, ['answer is', 'answer:'])
answer = self.process_answer_prefix(answer, ['option'])
else:
break
else:
break
else:
qtype = 'open'
if self.count % 50 == 0 and int(os.environ.get('LOCAL_RANK', '0')) == 0:
print(f'\n{self.BEGIN_LINE}')
print(f'image_paths: {image_paths}\n')
print(f'prompt: {prompt}\n')
print(f'qtype: {qtype}\n')
print(f'output: {response}\n')
print(f'answer: {answer}\n')
print(f'{self.END_LINE}\n', flush=True)
self.count += 1
return answer
def prepare_inputs(self, message):
prompt = ''
image_paths = []
image_count = 0
text_count = 0
pure_text = ''
for msg in message:
if msg['type'] == 'text':
text_count += 1
prompt += msg['value']
pure_text += msg['value']
elif msg['type'] == 'image':
image_count += 1
prompt += self.DEFAULT_IMAGE_TOKEN
image_paths.append(msg['value'])
if image_count == 1 and text_count == 1:
prompt = self.DEFAULT_IMAGE_TOKEN + '\n' + pure_text
return prompt, image_paths
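# Minimal usage sketch for Parrot (default checkpoint AIDC-AI/Parrot-7B; the image
# file name is hypothetical):
#
#     model = Parrot()
#     message = [dict(type='image', value='demo.jpg'),
#                dict(type='text', value='What color is the car?')]
#     print(model.generate_inner(message))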